plan9port

fork of plan9port with libvec, libstr and libsdb

cache.c (43774B)


      1 #include "stdinc.h"
      2 #include "dat.h"
      3 #include "fns.h"
      4 #include "error.h"
      5 
      6 #include "9.h"	/* for cacheFlush */
      7 
      8 typedef struct FreeList FreeList;
      9 typedef struct BAddr BAddr;
     10 
     11 enum {
     12 	BadHeap = ~0,
     13 };
     14 
     15 /*
     16  * Store data to the memory cache in c->size blocks
     17  * with the block zero extended to fill it out.  When writing to
     18  * Venti, the block will be zero truncated.  The walker will also check
     19  * that the block fits within psize or dsize as the case may be.
     20  */
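
/*
 * A sketch of that round trip (illustrative only; buf, size, score
 * and type stand in for the variables used later in cacheGlobal and
 * by the archiver -- vtread, vtwrite, vtzeroextend and vtzerotruncate
 * are the real libventi calls):
 *
 *	n = vtread(z, score, vtType[type], buf, size);	// may come back short
 *	vtzeroextend(vtType[type], buf, n, size);	// pad to the cache block size
 *	...
 *	n = vtzerotruncate(vtType[type], buf, size);	// strip trailing zeros
 *	vtwrite(z, score, vtType[type], buf, n);	// store the short form
 */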
     21 
     22 struct Cache
     23 {
     24 	QLock	lk;
     25 	int 	ref;
     26 	int	mode;
     27 
     28 	Disk 	*disk;
     29 	int	size;			/* block size */
     30 	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
     31 	VtConn *z;
     32 	u32int	now;			/* ticks for usage timestamps */
     33 	Block	**heads;		/* hash table for finding address */
     34 	int	nheap;			/* number of available victims */
     35 	Block	**heap;			/* heap for locating victims */
     36 	long	nblocks;		/* number of blocks allocated */
     37 	Block	*blocks;		/* array of block descriptors */
     38 	u8int	*mem;			/* memory for all block data & blists */
     39 
     40 	BList	*blfree;
     41 	Rendez	blrend;
     42 
     43 	int 	ndirty;			/* number of dirty blocks in the cache */
     44 	int 	maxdirty;		/* max number of dirty blocks */
     45 	u32int	vers;
     46 
     47 	long hashSize;
     48 
     49 	FreeList *fl;
     50 
      51 	Rendez die;			/* daemon threads should die when die.l != nil */
     52 
     53 	Rendez flush;
     54 	Rendez flushwait;
     55 	Rendez heapwait;
     56 	BAddr *baddr;
     57 	int bw, br, be;
     58 	int nflush;
     59 
     60 	Periodic *sync;
     61 
     62 	/* unlink daemon */
     63 	BList *uhead;
     64 	BList *utail;
     65 	Rendez unlink;
     66 
     67 	/* block counts */
     68 	int nused;
     69 	int ndisk;
     70 };
     71 
     72 struct BList {
     73 	int part;
     74 	u32int addr;
     75 	uchar type;
     76 	u32int tag;
     77 	u32int epoch;
     78 	u32int vers;
     79 
     80 	int recurse;	/* for block unlink */
     81 
     82 	/* for roll back */
     83 	int index;			/* -1 indicates not valid */
     84 	union {
     85 		uchar score[VtScoreSize];
     86 		uchar entry[VtEntrySize];
     87 	} old;
     88 	BList *next;
     89 };
     90 
     91 struct BAddr {
     92 	int part;
     93 	u32int addr;
     94 	u32int vers;
     95 };
     96 
     97 struct FreeList {
     98 	QLock lk;
     99 	u32int last;		/* last block allocated */
    100 	u32int end;		/* end of data partition */
    101 	u32int nused;		/* number of used blocks */
    102 	u32int epochLow;	/* low epoch when last updated nused */
    103 };
    104 
    105 static FreeList *flAlloc(u32int end);
    106 static void flFree(FreeList *fl);
    107 
    108 static Block *cacheBumpBlock(Cache *c);
    109 static void heapDel(Block*);
    110 static void heapIns(Block*);
    111 static void cacheCheck(Cache*);
    112 static void unlinkThread(void *a);
    113 static void flushThread(void *a);
    114 static void unlinkBody(Cache *c);
    115 static int cacheFlushBlock(Cache *c);
    116 static void cacheSync(void*);
    117 static BList *blistAlloc(Block*);
    118 static void blistFree(Cache*, BList*);
    119 static void doRemoveLink(Cache*, BList*);
    120 
    121 /*
    122  * Mapping from local block type to Venti type
    123  */
    124 int vtType[BtMax] = {
    125 	VtDataType,		/* BtData | 0  */
    126 	VtDataType+1,		/* BtData | 1  */
    127 	VtDataType+2,		/* BtData | 2  */
    128 	VtDataType+3,		/* BtData | 3  */
    129 	VtDataType+4,		/* BtData | 4  */
    130 	VtDataType+5,		/* BtData | 5  */
    131 	VtDataType+6,		/* BtData | 6  */
    132 	VtDataType+7,		/* BtData | 7  */
    133 	VtDirType,		/* BtDir | 0  */
    134 	VtDirType+1,		/* BtDir | 1  */
    135 	VtDirType+2,		/* BtDir | 2  */
    136 	VtDirType+3,		/* BtDir | 3  */
    137 	VtDirType+4,		/* BtDir | 4  */
    138 	VtDirType+5,		/* BtDir | 5  */
    139 	VtDirType+6,		/* BtDir | 6  */
    140 	VtDirType+7,		/* BtDir | 7  */
    141 };
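
/*
 * The table works because BtData and BtDir each begin a run of eight
 * consecutive type values, with the tree depth in the low three bits,
 * so for example vtType[BtDir|3] == VtDirType+3.  A hypothetical
 * wrapper (a sketch; nothing in this file calls it) is just a
 * checked index:
 */
static int
localToVentiType(int type)
{
	assert(type >= 0 && type < BtMax);
	return vtType[type];
}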
    142 
    143 /*
    144  * Allocate the memory cache.
    145  */
    146 Cache *
    147 cacheAlloc(Disk *disk, VtConn *z, ulong nblocks, int mode)
    148 {
    149 	int i;
    150 	Cache *c;
    151 	Block *b;
    152 	BList *bl;
    153 	u8int *p;
    154 	int nbl;
    155 
    156 	c = vtmallocz(sizeof(Cache));
    157 
    158 	/* reasonable number of BList elements */
    159 	nbl = nblocks * 4;
    160 
    161 	c->ref = 1;
    162 	c->disk = disk;
    163 	c->z = z;
    164 	c->size = diskBlockSize(disk);
    165 bwatchSetBlockSize(c->size);
    166 	/* round c->size up to be a nice multiple */
    167 	c->size = (c->size + 127) & ~127;
     168 	c->ndmap = (c->size/20 + 7) / 8;	/* a bit per score slot; VtScoreSize == 20 */
    169 	c->nblocks = nblocks;
    170 	c->hashSize = nblocks;
    171 	c->heads = vtmallocz(c->hashSize*sizeof(Block*));
    172 	c->heap = vtmallocz(nblocks*sizeof(Block*));
    173 	c->blocks = vtmallocz(nblocks*sizeof(Block));
    174 	c->mem = vtmallocz(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
    175 	c->baddr = vtmallocz(nblocks * sizeof(BAddr));
    176 	c->mode = mode;
    177 	c->vers++;
    178 	p = c->mem;
    179 	for(i = 0; i < nblocks; i++){
    180 		b = &c->blocks[i];
    181 		b->c = c;
    182 		b->data = p;
    183 		b->heap = i;
    184 		b->ioready.l = &b->lk;
    185 		c->heap[i] = b;
    186 		p += c->size;
    187 	}
    188 	c->nheap = nblocks;
    189 	for(i = 0; i < nbl; i++){
    190 		bl = (BList*)p;
    191 		bl->next = c->blfree;
    192 		c->blfree = bl;
    193 		p += sizeof(BList);
    194 	}
    195 	/* separate loop to keep blocks and blists reasonably aligned */
    196 	for(i = 0; i < nblocks; i++){
    197 		b = &c->blocks[i];
    198 		b->dmap = p;
    199 		p += c->ndmap;
    200 	}
    201 
    202 	c->blrend.l = &c->lk;
    203 
    204 	c->maxdirty = nblocks*(DirtyPercentage*0.01);
    205 
    206 	c->fl = flAlloc(diskSize(disk, PartData));
    207 
    208 	c->unlink.l = &c->lk;
    209 	c->flush.l = &c->lk;
    210 	c->flushwait.l = &c->lk;
    211 	c->heapwait.l = &c->lk;
    212 	c->sync = periodicAlloc(cacheSync, c, 30*1000);
    213 
    214 	if(mode == OReadWrite){
    215 		c->ref += 2;
    216 		proccreate(unlinkThread, c, STACK);
    217 		proccreate(flushThread, c, STACK);
    218 	}
    219 	cacheCheck(c);
    220 
    221 	return c;
    222 }
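
/*
 * Typical lifecycle (a sketch; disk and z are assumed to be an
 * already-open Disk and VtConn):
 *
 *	c = cacheAlloc(disk, z, 1000, OReadWrite);
 *	... cacheLocal / cacheGlobal / cacheAllocBlock traffic ...
 *	cacheFlush(c, 1);	// optional: cacheFree flushes too
 *	cacheFree(c);
 */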
    223 
    224 /*
    225  * Free the whole memory cache, flushing all dirty blocks to the disk.
    226  */
    227 void
    228 cacheFree(Cache *c)
    229 {
    230 	int i;
    231 
    232 	/* kill off daemon threads */
    233 	qlock(&c->lk);
    234 	c->die.l = &c->lk;
    235 	periodicKill(c->sync);
    236 	rwakeup(&c->flush);
    237 	rwakeup(&c->unlink);
    238 	while(c->ref > 1)
    239 		rsleep(&c->die);
    240 
    241 	/* flush everything out */
    242 	do {
    243 		unlinkBody(c);
    244 		qunlock(&c->lk);
    245 		while(cacheFlushBlock(c))
    246 			;
    247 		diskFlush(c->disk);
    248 		qlock(&c->lk);
    249 	} while(c->uhead || c->ndirty);
    250 	qunlock(&c->lk);
    251 
    252 	cacheCheck(c);
    253 
    254 	for(i = 0; i < c->nblocks; i++){
    255 		assert(c->blocks[i].ref == 0);
    256 	}
    257 	flFree(c->fl);
    258 	vtfree(c->baddr);
    259 	vtfree(c->heads);
    260 	vtfree(c->blocks);
    261 	vtfree(c->mem);
    262 	diskFree(c->disk);
    263 	/* don't close vtSession */
    264 	vtfree(c);
    265 }
    266 
    267 static void
    268 cacheDump(Cache *c)
    269 {
    270 	int i;
    271 	Block *b;
    272 
    273 	for(i = 0; i < c->nblocks; i++){
    274 		b = &c->blocks[i];
    275 		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
    276 			i, b->part, b->addr, b->score, b->l.type, b->ref,
    277 			bsStr(b->l.state), bioStr(b->iostate), b->pc);
    278 	}
    279 }
    280 
    281 static void
    282 cacheCheck(Cache *c)
    283 {
    284 	u32int size, now;
    285 	int i, k, refed;
    286 	Block *b;
    287 
    288 	size = c->size;
    289 	now = c->now;
    290 
    291 	for(i = 0; i < c->nheap; i++){
    292 		if(c->heap[i]->heap != i)
    293 			sysfatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
    294 		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
    295 			sysfatal("bad heap ordering");
    296 		k = (i << 1) + 1;
    297 		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
    298 			sysfatal("bad heap ordering");
    299 		k++;
    300 		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
    301 			sysfatal("bad heap ordering");
    302 	}
    303 
    304 	refed = 0;
    305 	for(i = 0; i < c->nblocks; i++){
    306 		b = &c->blocks[i];
    307 		if(b->data != &c->mem[i * size])
    308 			sysfatal("mis-blocked at %d", i);
    309 		if(b->ref && b->heap == BadHeap){
    310 			refed++;
    311 		}
    312 	}
    313 if(c->nheap + refed != c->nblocks){
    314 fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
    315 cacheDump(c);
    316 }
    317 	assert(c->nheap + refed == c->nblocks);
    318 	refed = 0;
    319 	for(i = 0; i < c->nblocks; i++){
    320 		b = &c->blocks[i];
    321 		if(b->ref){
    322 if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
    323 			refed++;
    324 		}
    325 	}
    326 if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
    327 }
    328 
    329 
    330 /*
    331  * locate the block with the oldest second to last use.
    332  * remove it from the heap, and fix up the heap.
    333  */
    334 /* called with c->lk held */
    335 static Block *
    336 cacheBumpBlock(Cache *c)
    337 {
    338 	int printed;
    339 	Block *b;
    340 
    341 	/*
    342 	 * locate the block with the oldest second to last use.
    343 	 * remove it from the heap, and fix up the heap.
    344 	 */
    345 	printed = 0;
    346 	if(c->nheap == 0){
    347 		while(c->nheap == 0){
    348 			rwakeup(&c->flush);
    349 			rsleep(&c->heapwait);
    350 			if(c->nheap == 0){
    351 				printed = 1;
    352 				fprint(2, "%s: entire cache is busy, %d dirty "
    353 					"-- waking flush thread\n",
    354 					argv0, c->ndirty);
    355 			}
    356 		}
    357 		if(printed)
    358 			fprint(2, "%s: cache is okay again, %d dirty\n",
    359 				argv0, c->ndirty);
    360 	}
    361 
    362 	b = c->heap[0];
    363 	heapDel(b);
    364 
    365 	assert(b->heap == BadHeap);
    366 	assert(b->ref == 0);
    367 	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
    368 	assert(b->prior == nil);
    369 	assert(b->uhead == nil);
    370 
    371 	/*
    372 	 * unchain the block from hash chain
    373 	 */
    374 	if(b->prev){
    375 		*(b->prev) = b->next;
    376 		if(b->next)
    377 			b->next->prev = b->prev;
    378 		b->prev = nil;
    379 	}
    380 
    381 
    382 if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
    383 	/* set block to a reasonable state */
    384 	b->ref = 1;
    385 	b->part = PartError;
    386 	memset(&b->l, 0, sizeof(b->l));
    387 	b->iostate = BioEmpty;
    388 
    389 	return b;
    390 }
    391 
    392 /*
    393  * look for a particular version of the block in the memory cache.
    394  */
    395 static Block *
    396 _cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
    397 	int waitlock, int *lockfailure)
    398 {
    399 	Block *b;
    400 	ulong h;
    401 
    402 	h = addr % c->hashSize;
    403 
    404 	if(lockfailure)
    405 		*lockfailure = 0;
    406 
    407 	/*
    408 	 * look for the block in the cache
    409 	 */
    410 	qlock(&c->lk);
    411 	for(b = c->heads[h]; b != nil; b = b->next){
    412 		if(b->part == part && b->addr == addr)
    413 			break;
    414 	}
    415 	if(b == nil || b->vers != vers){
    416 		qunlock(&c->lk);
    417 		return nil;
    418 	}
    419 	if(!waitlock && !canqlock(&b->lk)){
     420 		if(lockfailure) *lockfailure = 1;
    421 		qunlock(&c->lk);
    422 		return nil;
    423 	}
    424 	heapDel(b);
    425 	b->ref++;
    426 	qunlock(&c->lk);
    427 
    428 	bwatchLock(b);
    429 	if(waitlock)
    430 		qlock(&b->lk);
    431 	b->nlock = 1;
    432 
    433 	for(;;){
    434 		switch(b->iostate){
    435 		default:
    436 			abort();
    437 		case BioEmpty:
    438 		case BioLabel:
    439 		case BioClean:
    440 		case BioDirty:
    441 			if(b->vers != vers){
    442 				blockPut(b);
    443 				return nil;
    444 			}
    445 			return b;
    446 		case BioReading:
    447 		case BioWriting:
    448 			rsleep(&b->ioready);
    449 			break;
    450 		case BioVentiError:
    451 			blockPut(b);
    452 			werrstr("venti i/o error block 0x%.8ux", addr);
    453 			return nil;
    454 		case BioReadError:
    455 			blockPut(b);
    456 			werrstr("error reading block 0x%.8ux", addr);
    457 			return nil;
    458 		}
    459 	}
    460 	/* NOT REACHED */
    461 }
    462 
    463 
    464 /*
    465  * fetch a local (on-disk) block from the memory cache.
    466  * if it's not there, load it, bumping some other block.
    467  */
    468 Block *
    469 _cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
    470 {
    471 	Block *b;
    472 	ulong h;
    473 
    474 	assert(part != PartVenti);
    475 
    476 	h = addr % c->hashSize;
    477 
    478 	/*
    479 	 * look for the block in the cache
    480 	 */
    481 	qlock(&c->lk);
    482 	for(b = c->heads[h]; b != nil; b = b->next){
    483 		if(b->part != part || b->addr != addr)
    484 			continue;
    485 		if(epoch && b->l.epoch != epoch){
    486 fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
    487 			qunlock(&c->lk);
    488 			werrstr(ELabelMismatch);
    489 			return nil;
    490 		}
    491 		heapDel(b);
    492 		b->ref++;
    493 		break;
    494 	}
    495 
    496 	if(b == nil){
    497 		b = cacheBumpBlock(c);
    498 
    499 		b->part = part;
    500 		b->addr = addr;
    501 		localToGlobal(addr, b->score);
    502 
    503 		/* chain onto correct hash */
    504 		b->next = c->heads[h];
    505 		c->heads[h] = b;
    506 		if(b->next != nil)
    507 			b->next->prev = &b->next;
    508 		b->prev = &c->heads[h];
    509 	}
    510 
    511 	qunlock(&c->lk);
    512 
    513 	/*
    514 	 * BUG: what if the epoch changes right here?
    515 	 * In the worst case, we could end up in some weird
    516 	 * lock loop, because the block we want no longer exists,
    517 	 * and instead we're trying to lock a block we have no
    518 	 * business grabbing.
    519 	 *
    520 	 * For now, I'm not going to worry about it.
    521 	 */
    522 
    523 if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
    524 	bwatchLock(b);
    525 	qlock(&b->lk);
    526 	b->nlock = 1;
    527 
    528 	if(part == PartData && b->iostate == BioEmpty){
    529 		if(!readLabel(c, &b->l, addr)){
    530 			blockPut(b);
    531 			return nil;
    532 		}
    533 		blockSetIOState(b, BioLabel);
    534 	}
    535 	if(epoch && b->l.epoch != epoch){
    536 		blockPut(b);
    537 fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
    538 		werrstr(ELabelMismatch);
    539 		return nil;
    540 	}
    541 
    542 	b->pc = getcallerpc(&c);
    543 	for(;;){
    544 		switch(b->iostate){
    545 		default:
    546 			abort();
    547 		case BioLabel:
    548 			if(mode == OOverWrite)
    549 				/*
    550 				 * leave iostate as BioLabel because data
    551 				 * hasn't been read.
    552 				 */
    553 				return b;
    554 			/* fall through */
    555 		case BioEmpty:
    556 			diskRead(c->disk, b);
    557 			rsleep(&b->ioready);
    558 			break;
    559 		case BioClean:
    560 		case BioDirty:
    561 			return b;
    562 		case BioReading:
    563 		case BioWriting:
    564 			rsleep(&b->ioready);
    565 			break;
    566 		case BioReadError:
    567 			blockSetIOState(b, BioEmpty);
    568 			blockPut(b);
    569 			werrstr("error reading block 0x%.8ux", addr);
    570 			return nil;
    571 		}
    572 	}
    573 	/* NOT REACHED */
    574 }
    575 
    576 Block *
    577 cacheLocal(Cache *c, int part, u32int addr, int mode)
    578 {
    579 	return _cacheLocal(c, part, addr, mode, 0);
    580 }
    581 
    582 /*
    583  * fetch a local (on-disk) block from the memory cache.
    584  * if it's not there, load it, bumping some other block.
    585  * check tag and type.
    586  */
    587 Block *
    588 cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
    589 {
    590 	Block *b;
    591 
    592 	b = _cacheLocal(c, PartData, addr, mode, epoch);
    593 	if(b == nil)
    594 		return nil;
    595 	if(b->l.type != type || b->l.tag != tag){
    596 		fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
    597 			argv0, addr, b->l.type, type, b->l.tag, tag);
    598 		werrstr(ELabelMismatch);
    599 		blockPut(b);
    600 		return nil;
    601 	}
    602 	b->pc = getcallerpc(&c);
    603 	return b;
    604 }
    605 
    606 /*
    607  * fetch a global (Venti) block from the memory cache.
    608  * if it's not there, load it, bumping some other block.
    609  * check tag and type if it's really a local block in disguise.
    610  */
    611 Block *
    612 cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
    613 {
    614 	int n;
    615 	Block *b;
    616 	ulong h;
    617 	u32int addr;
    618 
    619 	addr = globalToLocal(score);
    620 	if(addr != NilBlock){
    621 		b = cacheLocalData(c, addr, type, tag, mode, 0);
    622 		if(b)
    623 			b->pc = getcallerpc(&c);
    624 		return b;
    625 	}
    626 
    627 	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;
    628 
    629 	/*
    630 	 * look for the block in the cache
    631 	 */
    632 	qlock(&c->lk);
    633 	for(b = c->heads[h]; b != nil; b = b->next){
    634 		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
    635 			continue;
    636 		heapDel(b);
    637 		b->ref++;
    638 		break;
    639 	}
    640 
    641 	if(b == nil){
    642 if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);
    643 
    644 		b = cacheBumpBlock(c);
    645 
    646 		b->part = PartVenti;
    647 		b->addr = NilBlock;
    648 		b->l.type = type;
    649 		memmove(b->score, score, VtScoreSize);
    650 
    651 		/* chain onto correct hash */
    652 		b->next = c->heads[h];
    653 		c->heads[h] = b;
    654 		if(b->next != nil)
    655 			b->next->prev = &b->next;
    656 		b->prev = &c->heads[h];
    657 	}
    658 	qunlock(&c->lk);
    659 
    660 	bwatchLock(b);
    661 	qlock(&b->lk);
    662 	b->nlock = 1;
    663 	b->pc = getcallerpc(&c);
    664 
    665 	switch(b->iostate){
    666 	default:
    667 		abort();
    668 	case BioEmpty:
    669 		n = vtread(c->z, score, vtType[type], b->data, c->size);
    670 		if(n < 0 || vtsha1check(score, b->data, n) < 0){
    671 			blockSetIOState(b, BioVentiError);
    672 			blockPut(b);
    673 			werrstr(
    674 			"venti error reading block %V or wrong score: %r",
    675 				score);
    676 			return nil;
    677 		}
    678 		vtzeroextend(vtType[type], b->data, n, c->size);
    679 		blockSetIOState(b, BioClean);
    680 		return b;
    681 	case BioClean:
    682 		return b;
    683 	case BioVentiError:
    684 		blockPut(b);
    685 		werrstr("venti i/o error or wrong score, block %V", score);
    686 		return nil;
    687 	case BioReadError:
    688 		blockPut(b);
    689 		werrstr("error reading block %V", b->score);
    690 		return nil;
    691 	}
    692 	/* NOT REACHED */
    693 }
    694 
    695 /*
    696  * allocate a new on-disk block and load it into the memory cache.
    697  * BUG: if the disk is full, should we flush some of it to Venti?
    698  */
    699 static u32int lastAlloc;
    700 
    701 Block *
    702 cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
    703 {
    704 	FreeList *fl;
    705 	u32int addr;
    706 	Block *b;
    707 	int n, nwrap;
    708 	Label lab;
    709 
    710 	n = c->size / LabelSize;
    711 	fl = c->fl;
    712 
    713 	qlock(&fl->lk);
    714 	addr = fl->last;
    715 	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
    716 	if(b == nil){
    717 		fprint(2, "%s: cacheAllocBlock: xxx %r\n", argv0);
    718 		qunlock(&fl->lk);
    719 		return nil;
    720 	}
    721 	nwrap = 0;
    722 	for(;;){
    723 		if(++addr >= fl->end){
    724 			addr = 0;
    725 			if(++nwrap >= 2){
    726 				blockPut(b);
    727 				werrstr("disk is full");
    728 				/*
    729 				 * try to avoid a continuous spew of console
    730 				 * messages.
    731 				 */
    732 				if (fl->last != 0)
    733 					fprint(2, "%s: cacheAllocBlock: xxx1 %r\n",
    734 						argv0);
    735 				fl->last = 0;
    736 				qunlock(&fl->lk);
    737 				return nil;
    738 			}
    739 		}
    740 		if(addr%n == 0){
    741 			blockPut(b);
    742 			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
    743 			if(b == nil){
    744 				fl->last = addr;
    745 				fprint(2, "%s: cacheAllocBlock: xxx2 %r\n", argv0);
    746 				qunlock(&fl->lk);
    747 				return nil;
    748 			}
    749 		}
    750 		if(!labelUnpack(&lab, b->data, addr%n))
    751 			continue;
    752 		if(lab.state == BsFree)
    753 			goto Found;
    754 		if(lab.state&BsClosed)
    755 		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
    756 			goto Found;
    757 	}
    758 Found:
    759 	blockPut(b);
    760 	b = cacheLocal(c, PartData, addr, OOverWrite);
    761 	if(b == nil){
    762 		fprint(2, "%s: cacheAllocBlock: xxx3 %r\n", argv0);
    763 		return nil;
    764 	}
    765 assert(b->iostate == BioLabel || b->iostate == BioClean);
    766 	fl->last = addr;
    767 	lab.type = type;
    768 	lab.tag = tag;
    769 	lab.state = BsAlloc;
    770 	lab.epoch = epoch;
    771 	lab.epochClose = ~(u32int)0;
    772 	if(!blockSetLabel(b, &lab, 1)){
    773 		fprint(2, "%s: cacheAllocBlock: xxx4 %r\n", argv0);
    774 		blockPut(b);
    775 		return nil;
    776 	}
    777 	vtzeroextend(vtType[type], b->data, 0, c->size);
    778 if(0)diskWrite(c->disk, b);
    779 
    780 if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
    781 	lastAlloc = addr;
    782 	fl->nused++;
    783 	qunlock(&fl->lk);
    784 	b->pc = getcallerpc(&c);
    785 	return b;
    786 }
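
/*
 * The reuse test in the loop above, restated as a predicate for
 * clarity (a sketch; cacheAllocBlock inlines this logic): a label
 * can be recycled if the block was never allocated, or if it has
 * been closed and no snapshot still in use can reach it.
 */
static int
labelIsReusable(Label *lab, u32int epochLow)
{
	if(lab->state == BsFree)
		return 1;
	if((lab->state & BsClosed)
	&& (lab->epochClose <= epochLow || lab->epoch == lab->epochClose))
		return 1;
	return 0;
}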
    787 
    788 int
    789 cacheDirty(Cache *c)
    790 {
    791 	return c->ndirty;
    792 }
    793 
    794 void
    795 cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
    796 {
    797 	int n;
    798 	u32int addr, nused;
    799 	Block *b;
    800 	Label lab;
    801 	FreeList *fl;
    802 
    803 	fl = c->fl;
    804 	n = c->size / LabelSize;
    805 	*bsize = c->size;
    806 	qlock(&fl->lk);
    807 	if(fl->epochLow == epochLow){
    808 		*used = fl->nused;
    809 		*total = fl->end;
    810 		qunlock(&fl->lk);
    811 		return;
    812 	}
    813 	b = nil;
    814 	nused = 0;
    815 	for(addr=0; addr<fl->end; addr++){
    816 		if(addr%n == 0){
    817 			blockPut(b);
    818 			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
    819 			if(b == nil){
    820 				fprint(2, "%s: flCountUsed: loading %ux: %r\n",
    821 					argv0, addr/n);
    822 				break;
    823 			}
    824 		}
    825 		if(!labelUnpack(&lab, b->data, addr%n))
    826 			continue;
    827 		if(lab.state == BsFree)
    828 			continue;
    829 		if(lab.state&BsClosed)
    830 		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
    831 			continue;
    832 		nused++;
    833 	}
    834 	blockPut(b);
    835 	if(addr == fl->end){
    836 		fl->nused = nused;
    837 		fl->epochLow = epochLow;
    838 	}
    839 	*used = nused;
    840 	*total = fl->end;
    841 	qunlock(&fl->lk);
    842 	return;
    843 }
    844 
    845 static FreeList *
    846 flAlloc(u32int end)
    847 {
    848 	FreeList *fl;
    849 
    850 	fl = vtmallocz(sizeof(*fl));
    851 	fl->last = 0;
    852 	fl->end = end;
    853 	return fl;
    854 }
    855 
    856 static void
    857 flFree(FreeList *fl)
    858 {
    859 	vtfree(fl);
    860 }
    861 
    862 u32int
    863 cacheLocalSize(Cache *c, int part)
    864 {
    865 	return diskSize(c->disk, part);
    866 }
    867 
    868 /*
    869  * The thread that has locked b may refer to it by
    870  * multiple names.  Nlock counts the number of
    871  * references the locking thread holds.  It will call
    872  * blockPut once per reference.
    873  */
    874 void
    875 blockDupLock(Block *b)
    876 {
    877 	assert(b->nlock > 0);
    878 	b->nlock++;
    879 }
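
/*
 * Sketch of the intended pairing (helper is hypothetical):
 *
 *	b = cacheLocal(c, PartData, addr, OReadWrite);
 *	blockDupLock(b);	// hand a second name to helper
 *	helper(b);		// helper calls blockPut(b) when done
 *	blockPut(b);		// drop our own name
 */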
    880 
    881 /*
    882  * we're done with the block.
    883  * unlock it.  can't use it after calling this.
    884  */
    885 void
    886 blockPut(Block* b)
    887 {
    888 	Cache *c;
    889 
    890 	if(b == nil)
    891 		return;
    892 
     893 if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));
    894 
    895 	if(b->iostate == BioDirty)
    896 		bwatchDependency(b);
    897 
    898 	if(--b->nlock > 0)
    899 		return;
    900 
    901 	/*
    902 	 * b->nlock should probably stay at zero while
    903 	 * the block is unlocked, but diskThread and rsleep
    904 	 * conspire to assume that they can just qlock(&b->lk); blockPut(b),
    905 	 * so we have to keep b->nlock set to 1 even
    906 	 * when the block is unlocked.
    907 	 */
    908 	assert(b->nlock == 0);
    909 	b->nlock = 1;
    910 //	b->pc = 0;
    911 
    912 	bwatchUnlock(b);
    913 	qunlock(&b->lk);
    914 	c = b->c;
    915 	qlock(&c->lk);
    916 
    917 	if(--b->ref > 0){
    918 		qunlock(&c->lk);
    919 		return;
    920 	}
    921 
    922 	assert(b->ref == 0);
    923 	switch(b->iostate){
    924 	default:
    925 		b->used = c->now++;
    926 		heapIns(b);
    927 		break;
    928 	case BioEmpty:
    929 	case BioLabel:
    930 		if(c->nheap == 0)
    931 			b->used = c->now++;
    932 		else
    933 			b->used = c->heap[0]->used;
    934 		heapIns(b);
    935 		break;
    936 	case BioDirty:
    937 		break;
    938 	}
    939 	qunlock(&c->lk);
    940 }
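
/*
 * Note the heap placement above: Empty and Label blocks hold no data
 * worth caching, so they are given the current heap minimum's
 * timestamp and become the first eviction candidates.  Dirty blocks
 * stay off the heap entirely, so they cannot be evicted; a later
 * blockPut reinserts them once a write has returned them to Clean.
 */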
    941 
    942 /*
    943  * set the label associated with a block.
    944  */
    945 Block*
    946 _blockSetLabel(Block *b, Label *l)
    947 {
    948 	int lpb;
    949 	Block *bb;
    950 	u32int a;
    951 	Cache *c;
    952 
    953 	c = b->c;
    954 
    955 	assert(b->part == PartData);
    956 	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
    957 	lpb = c->size / LabelSize;
    958 	a = b->addr / lpb;
    959 	bb = cacheLocal(c, PartLabel, a, OReadWrite);
    960 	if(bb == nil){
    961 		blockPut(b);
    962 		return nil;
    963 	}
    964 	b->l = *l;
    965 	labelPack(l, bb->data, b->addr%lpb);
    966 	blockDirty(bb);
    967 	return bb;
    968 }
    969 
    970 int
    971 blockSetLabel(Block *b, Label *l, int allocating)
    972 {
    973 	Block *lb;
    974 
    975 	lb = _blockSetLabel(b, l);
    976 	if(lb == nil)
    977 		return 0;
    978 
    979 	/*
     980 	 * If we're allocating the block, make sure the label (lb)
    981 	 * goes to disk before the data block (b) itself.  This is to help
    982 	 * the blocks that in turn depend on b.
    983 	 *
    984 	 * Suppose bx depends on (must be written out after) b.
    985 	 * Once we write b we'll think it's safe to write bx.
    986 	 * Bx can't get at b unless it has a valid label, though.
    987 	 *
    988 	 * Allocation is the only case in which having a current label
    989 	 * is vital because:
    990 	 *
    991 	 *	- l.type is set at allocation and never changes.
    992 	 *	- l.tag is set at allocation and never changes.
    993 	 *	- l.state is not checked when we load blocks.
    994 	 *	- the archiver cares deeply about l.state being
    995 	 *		BaActive vs. BaCopied, but that's handled
    996 	 *		by direct calls to _blockSetLabel.
    997 	 */
    998 
    999 	if(allocating)
   1000 		blockDependency(b, lb, -1, nil, nil);
   1001 	blockPut(lb);
   1002 	return 1;
   1003 }
   1004 
   1005 /*
   1006  * Record that bb must be written out before b.
    1007  * If index is given, we're about to overwrite the score/entry
   1008  * at that index in the block.  Save the old value so we
   1009  * can write a safer ``old'' version of the block if pressed.
   1010  */
   1011 void
   1012 blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
   1013 {
   1014 	BList *p;
   1015 
   1016 	if(bb->iostate == BioClean)
   1017 		return;
   1018 
   1019 	/*
   1020 	 * Dependencies for blocks containing Entry structures
   1021 	 * or scores must always be explained.  The problem with
   1022 	 * only explaining some of them is this.  Suppose we have two
   1023 	 * dependencies for the same field, the first explained
   1024 	 * and the second not.  We try to write the block when the first
   1025 	 * dependency is not written but the second is.  We will roll back
   1026 	 * the first change even though the second trumps it.
   1027 	 */
   1028 	if(index == -1 && bb->part == PartData)
   1029 		assert(b->l.type == BtData);
   1030 
   1031 	if(bb->iostate != BioDirty){
   1032 		fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
   1033 			argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
   1034 		abort();
   1035 	}
   1036 
   1037 	p = blistAlloc(bb);
   1038 	if(p == nil)
   1039 		return;
   1040 
   1041 	assert(bb->iostate == BioDirty);
   1042 if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);
   1043 
   1044 	p->part = bb->part;
   1045 	p->addr = bb->addr;
   1046 	p->type = bb->l.type;
   1047 	p->vers = bb->vers;
   1048 	p->index = index;
   1049 	if(p->index >= 0){
   1050 		/*
   1051 		 * This test would just be b->l.type==BtDir except
   1052 		 * we need to exclude the super block.
   1053 		 */
   1054 		if(b->l.type == BtDir && b->part == PartData)
   1055 			entryPack(e, p->old.entry, 0);
   1056 		else
   1057 			memmove(p->old.score, score, VtScoreSize);
   1058 	}
   1059 	p->next = b->prior;
   1060 	b->prior = p;
   1061 }
   1062 
   1063 /*
   1064  * Mark an in-memory block as dirty.  If there are too many
   1065  * dirty blocks, start writing some out to disk.
   1066  *
   1067  * If there were way too many dirty blocks, we used to
   1068  * try to do some flushing ourselves, but it's just too dangerous --
   1069  * it implies that the callers cannot have any of our priors locked,
   1070  * but this is hard to avoid in some cases.
   1071  */
   1072 int
   1073 blockDirty(Block *b)
   1074 {
   1075 	Cache *c;
   1076 
   1077 	c = b->c;
   1078 
   1079 	assert(b->part != PartVenti);
   1080 
   1081 	if(b->iostate == BioDirty)
   1082 		return 1;
   1083 	assert(b->iostate == BioClean || b->iostate == BioLabel);
   1084 
   1085 	qlock(&c->lk);
   1086 	b->iostate = BioDirty;
   1087 	c->ndirty++;
   1088 	if(c->ndirty > (c->maxdirty>>1))
   1089 		rwakeup(&c->flush);
   1090 	qunlock(&c->lk);
   1091 
   1092 	return 1;
   1093 }
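
/*
 * The wakeup fires at half of maxdirty so that the flush thread
 * starts writing well before the dirty limit is reached, instead
 * of waiting until cacheBumpBlock is already stalled for victims.
 */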
   1094 
   1095 /*
   1096  * We've decided to write out b.  Maybe b has some pointers to blocks
   1097  * that haven't yet been written to disk.  If so, construct a slightly out-of-date
   1098  * copy of b that is safe to write out.  (diskThread will make sure the block
   1099  * remains marked as dirty.)
   1100  */
   1101 uchar *
   1102 blockRollback(Block *b, uchar *buf)
   1103 {
   1104 	u32int addr;
   1105 	BList *p;
   1106 	Super super;
   1107 
   1108 	/* easy case */
   1109 	if(b->prior == nil)
   1110 		return b->data;
   1111 
   1112 	memmove(buf, b->data, b->c->size);
   1113 	for(p=b->prior; p; p=p->next){
   1114 		/*
   1115 		 * we know p->index >= 0 because blockWrite has vetted this block for us.
   1116 		 */
   1117 		assert(p->index >= 0);
   1118 		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
   1119 		if(b->part == PartSuper){
   1120 			assert(p->index == 0);
   1121 			superUnpack(&super, buf);
   1122 			addr = globalToLocal(p->old.score);
   1123 			if(addr == NilBlock){
   1124 				fprint(2, "%s: rolling back super block: "
   1125 					"bad replacement addr %V\n",
   1126 					argv0, p->old.score);
   1127 				abort();
   1128 			}
   1129 			super.active = addr;
   1130 			superPack(&super, buf);
   1131 			continue;
   1132 		}
   1133 		if(b->l.type == BtDir)
   1134 			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
   1135 		else
   1136 			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
   1137 	}
   1138 	return buf;
   1139 }
   1140 
   1141 /*
   1142  * Try to write block b.
   1143  * If b depends on other blocks:
   1144  *
   1145  *	If the block has been written out, remove the dependency.
   1146  *	If the dependency is replaced by a more recent dependency,
   1147  *		throw it out.
   1148  *	If we know how to write out an old version of b that doesn't
   1149  *		depend on it, do that.
   1150  *
   1151  *	Otherwise, bail.
   1152  */
   1153 int
   1154 blockWrite(Block *b, int waitlock)
   1155 {
   1156 	uchar *dmap;
   1157 	Cache *c;
   1158 	BList *p, **pp;
   1159 	Block *bb;
   1160 	int lockfail;
   1161 
   1162 	c = b->c;
   1163 
   1164 	if(b->iostate != BioDirty)
   1165 		return 1;
   1166 
   1167 	dmap = b->dmap;
   1168 	memset(dmap, 0, c->ndmap);
   1169 	pp = &b->prior;
   1170 	for(p=*pp; p; p=*pp){
   1171 		if(p->index >= 0){
   1172 			/* more recent dependency has succeeded; this one can go */
   1173 			if(dmap[p->index/8] & (1<<(p->index%8)))
   1174 				goto ignblock;
   1175 		}
   1176 
   1177 		lockfail = 0;
   1178 		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, waitlock,
   1179 			&lockfail);
   1180 		if(bb == nil){
   1181 			if(lockfail)
   1182 				return 0;
   1183 			/* block not in cache => was written already */
   1184 			dmap[p->index/8] |= 1<<(p->index%8);
   1185 			goto ignblock;
   1186 		}
   1187 
   1188 		/*
   1189 		 * same version of block is still in cache.
   1190 		 *
   1191 		 * the assertion is true because the block still has version p->vers,
   1192 		 * which means it hasn't been written out since we last saw it.
   1193 		 */
   1194 		if(bb->iostate != BioDirty){
   1195 			fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
   1196 				argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
   1197 			/* probably BioWriting if it happens? */
   1198 			if(bb->iostate == BioClean)
   1199 				goto ignblock;
   1200 		}
   1201 
   1202 		blockPut(bb);
   1203 
   1204 		if(p->index < 0){
   1205 			/*
   1206 			 * We don't know how to temporarily undo
   1207 			 * b's dependency on bb, so just don't write b yet.
   1208 			 */
   1209 			if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
   1210 				argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
   1211 			return 0;
   1212 		}
   1213 		/* keep walking down the list */
   1214 		pp = &p->next;
   1215 		continue;
   1216 
   1217 ignblock:
   1218 		*pp = p->next;
   1219 		blistFree(c, p);
   1220 		continue;
   1221 	}
   1222 
   1223 	/*
   1224 	 * DiskWrite must never be called with a double-locked block.
   1225 	 * This call to diskWrite is okay because blockWrite is only called
   1226 	 * from the cache flush thread, which never double-locks a block.
   1227 	 */
   1228 	diskWrite(c->disk, b);
   1229 	return 1;
   1230 }
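
/*
 * dmap, used above, is a small bitmap with one bit per entry index
 * in b.  The prior list is newest-first, so once the dependency for
 * an index is known to be on disk, any older prior for the same
 * index can be dropped: the newer value supersedes the rollback it
 * would have asked for.
 */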
   1231 
   1232 /*
   1233  * Change the I/O state of block b.
   1234  * Just an assignment except for magic in
   1235  * switch statement (read comments there).
   1236  */
   1237 void
   1238 blockSetIOState(Block *b, int iostate)
   1239 {
   1240 	int dowakeup;
   1241 	Cache *c;
   1242 	BList *p, *q;
   1243 
   1244 if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));
   1245 
   1246 	c = b->c;
   1247 
   1248 	dowakeup = 0;
   1249 	switch(iostate){
   1250 	default:
   1251 		abort();
   1252 	case BioEmpty:
   1253 		assert(!b->uhead);
   1254 		break;
   1255 	case BioLabel:
   1256 		assert(!b->uhead);
   1257 		break;
   1258 	case BioClean:
   1259 		bwatchDependency(b);
   1260 		/*
   1261 		 * If b->prior is set, it means a write just finished.
   1262 		 * The prior list isn't needed anymore.
   1263 		 */
   1264 		for(p=b->prior; p; p=q){
   1265 			q = p->next;
   1266 			blistFree(c, p);
   1267 		}
   1268 		b->prior = nil;
   1269 		/*
   1270 		 * Freeing a block or just finished a write.
   1271 		 * Move the blocks from the per-block unlink
   1272 		 * queue to the cache unlink queue.
   1273 		 */
   1274 		if(b->iostate == BioDirty || b->iostate == BioWriting){
   1275 			qlock(&c->lk);
   1276 			c->ndirty--;
   1277 			b->iostate = iostate;	/* change here to keep in sync with ndirty */
   1278 			b->vers = c->vers++;
   1279 			if(b->uhead){
   1280 				/* add unlink blocks to unlink queue */
   1281 				if(c->uhead == nil){
   1282 					c->uhead = b->uhead;
   1283 					rwakeup(&c->unlink);
   1284 				}else
   1285 					c->utail->next = b->uhead;
   1286 				c->utail = b->utail;
   1287 				b->uhead = nil;
   1288 			}
   1289 			qunlock(&c->lk);
   1290 		}
   1291 		assert(!b->uhead);
   1292 		dowakeup = 1;
   1293 		break;
   1294 	case BioDirty:
   1295 		/*
   1296 		 * Wrote out an old version of the block (see blockRollback).
   1297 		 * Bump a version count, leave it dirty.
   1298 		 */
   1299 		if(b->iostate == BioWriting){
   1300 			qlock(&c->lk);
   1301 			b->vers = c->vers++;
   1302 			qunlock(&c->lk);
   1303 			dowakeup = 1;
   1304 		}
   1305 		break;
   1306 	case BioReading:
   1307 	case BioWriting:
   1308 		/*
   1309 		 * Adding block to disk queue.  Bump reference count.
   1310 		 * diskThread decs the count later by calling blockPut.
   1311 		 * This is here because we need to lock c->lk to
   1312 		 * manipulate the ref count.
   1313 		 */
   1314 		qlock(&c->lk);
   1315 		b->ref++;
   1316 		qunlock(&c->lk);
   1317 		break;
   1318 	case BioReadError:
   1319 	case BioVentiError:
   1320 		/*
   1321 		 * Oops.
   1322 		 */
   1323 		dowakeup = 1;
   1324 		break;
   1325 	}
   1326 	b->iostate = iostate;
   1327 	/*
   1328 	 * Now that the state has changed, we can wake the waiters.
   1329 	 */
   1330 	if(dowakeup)
   1331 		rwakeupall(&b->ioready);
   1332 }
   1333 
   1334 /*
   1335  * The active file system is a tree of blocks.
   1336  * When we add snapshots to the mix, the entire file system
   1337  * becomes a dag and thus requires a bit more care.
   1338  *
   1339  * The life of the file system is divided into epochs.  A snapshot
   1340  * ends one epoch and begins the next.  Each file system block
   1341  * is marked with the epoch in which it was created (b.epoch).
   1342  * When the block is unlinked from the file system (closed), it is marked
   1343  * with the epoch in which it was removed (b.epochClose).
   1344  * Once we have discarded or archived all snapshots up to
   1345  * b.epochClose, we can reclaim the block.
   1346  *
   1347  * If a block was created in a past epoch but is not yet closed,
   1348  * it is treated as copy-on-write.  Of course, in order to insert the
   1349  * new pointer into the tree, the parent must be made writable,
   1350  * and so on up the tree.  The recursion stops because the root
   1351  * block is always writable.
   1352  *
   1353  * If blocks are never closed, they will never be reused, and
   1354  * we will run out of disk space.  But marking a block as closed
   1355  * requires some care about dependencies and write orderings.
   1356  *
   1357  * (1) If a block p points at a copy-on-write block b and we
   1358  * copy b to create bb, then p must be written out after bb and
   1359  * lbb (bb's label block).
   1360  *
   1361  * (2) We have to mark b as closed, but only after we switch
   1362  * the pointer, so lb must be written out after p.  In fact, we
   1363  * can't even update the in-memory copy, or the cache might
   1364  * mistakenly give out b for reuse before p gets written.
   1365  *
   1366  * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
   1367  * The caller is expected to record a "p after bb" dependency
   1368  * to finish (1), and also expected to call blockRemoveLink
   1369  * to arrange for (2) to happen once p is written.
   1370  *
   1371  * Until (2) happens, some pieces of the code (e.g., the archiver)
   1372  * still need to know whether a block has been copied, so we
   1373  * set the BsCopied bit in the label and force that to disk *before*
   1374  * the copy gets written out.
   1375  */
   1376 Block*
   1377 blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
   1378 {
   1379 	Block *bb, *lb;
   1380 	Label l;
   1381 
   1382 	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
   1383 		fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
   1384 			argv0, b->addr, &b->l, elo, ehi);
   1385 
   1386 	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
   1387 	if(bb == nil){
   1388 		blockPut(b);
   1389 		return nil;
   1390 	}
   1391 
   1392 	/*
   1393 	 * Update label so we know the block has been copied.
   1394 	 * (It will be marked closed once it has been unlinked from
   1395 	 * the tree.)  This must follow cacheAllocBlock since we
   1396 	 * can't be holding onto lb when we call cacheAllocBlock.
   1397 	 */
   1398 	if((b->l.state&BsCopied)==0)
   1399 	if(b->part == PartData){	/* not the superblock */
   1400 		l = b->l;
   1401 		l.state |= BsCopied;
   1402 		lb = _blockSetLabel(b, &l);
   1403 		if(lb == nil){
   1404 			/* can't set label => can't copy block */
   1405 			blockPut(b);
   1406 			l.type = BtMax;
   1407 			l.state = BsFree;
   1408 			l.epoch = 0;
   1409 			l.epochClose = 0;
   1410 			l.tag = 0;
   1411 			blockSetLabel(bb, &l, 0);
   1412 			blockPut(bb);
   1413 			return nil;
   1414 		}
   1415 		blockDependency(bb, lb, -1, nil, nil);
   1416 		blockPut(lb);
   1417 	}
   1418 
   1419 	memmove(bb->data, b->data, b->c->size);
   1420 	blockDirty(bb);
   1421 	blockPut(b);
   1422 	return bb;
   1423 }
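
/*
 * A minimal sketch of the copy-on-write decision implied by the
 * comment above (illustrative; the actual test lives in the callers
 * that walk the tree, not in this file):
 */
static int
blockNeedsCopy(Block *b, u32int ehi)
{
	if(b->l.epoch == ehi)
		return 0;	/* created in the current epoch: write in place */
	return 1;	/* older epoch: a snapshot may still reach it */
}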
   1424 
   1425 /*
   1426  * Block b once pointed at the block bb at addr/type/tag, but no longer does.
   1427  * If recurse is set, we are unlinking all of bb's children as well.
   1428  *
   1429  * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
   1430  * the relevant information to b's list of unlinked blocks.  Once b is written,
   1431  * the list will be queued for processing.
   1432  *
   1433  * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
   1434  */
   1435 void
   1436 blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
   1437 {
   1438 	BList *p, **pp, bl;
   1439 
   1440 	/* remove bb from prior list */
   1441 	for(pp=&b->prior; (p=*pp)!=nil; ){
   1442 		if(p->part == PartData && p->addr == addr){
   1443 			*pp = p->next;
   1444 			blistFree(b->c, p);
   1445 		}else
   1446 			pp = &p->next;
   1447 	}
   1448 
   1449 	bl.part = PartData;
   1450 	bl.addr = addr;
   1451 	bl.type = type;
   1452 	bl.tag = tag;
   1453 	if(b->l.epoch == 0)
   1454 		assert(b->part == PartSuper);
   1455 	bl.epoch = b->l.epoch;
   1456 	bl.next = nil;
   1457 	bl.recurse = recurse;
   1458 
   1459 	if(b->part == PartSuper && b->iostate == BioClean)
   1460 		p = nil;
   1461 	else
   1462 		p = blistAlloc(b);
   1463 	if(p == nil){
   1464 		/*
   1465 		 * b has already been written to disk.
   1466 		 */
   1467 		doRemoveLink(b->c, &bl);
   1468 		return;
   1469 	}
   1470 
   1471 	/* Uhead is only processed when the block goes from Dirty -> Clean */
   1472 	assert(b->iostate == BioDirty);
   1473 
   1474 	*p = bl;
   1475 	if(b->uhead == nil)
   1476 		b->uhead = p;
   1477 	else
   1478 		b->utail->next = p;
   1479 	b->utail = p;
   1480 }
   1481 
   1482 /*
   1483  * Process removal of a single block and perhaps its children.
   1484  */
   1485 static void
   1486 doRemoveLink(Cache *c, BList *p)
   1487 {
   1488 	int i, n, recurse;
   1489 	u32int a;
   1490 	Block *b;
   1491 	Label l;
   1492 	BList bl;
   1493 
   1494 	recurse = (p->recurse && p->type != BtData && p->type != BtDir);
   1495 
   1496 	/*
   1497 	 * We're not really going to overwrite b, but if we're not
   1498 	 * going to look at its contents, there is no point in reading
   1499 	 * them from the disk.
   1500 	 */
   1501 	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
   1502 	if(b == nil)
   1503 		return;
   1504 
   1505 	/*
   1506 	 * When we're unlinking from the superblock, close with the next epoch.
   1507 	 */
   1508 	if(p->epoch == 0)
   1509 		p->epoch = b->l.epoch+1;
   1510 
   1511 	/* sanity check */
   1512 	if(b->l.epoch > p->epoch){
   1513 		fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
   1514 			argv0, b->l.epoch, p->epoch);
   1515 		blockPut(b);
   1516 		return;
   1517 	}
   1518 
   1519 	if(recurse){
   1520 		n = c->size / VtScoreSize;
   1521 		for(i=0; i<n; i++){
   1522 			a = globalToLocal(b->data + i*VtScoreSize);
   1523 			if(a == NilBlock || !readLabel(c, &l, a))
   1524 				continue;
   1525 			if(l.state&BsClosed)
   1526 				continue;
   1527 			/*
   1528 			 * If stack space becomes an issue...
   1529 			p->addr = a;
   1530 			p->type = l.type;
   1531 			p->tag = l.tag;
   1532 			doRemoveLink(c, p);
   1533 			 */
   1534 
   1535 			bl.part = PartData;
   1536 			bl.addr = a;
   1537 			bl.type = l.type;
   1538 			bl.tag = l.tag;
   1539 			bl.epoch = p->epoch;
   1540 			bl.next = nil;
   1541 			bl.recurse = 1;
   1542 			/* give up the block lock - share with others */
   1543 			blockPut(b);
   1544 			doRemoveLink(c, &bl);
   1545 			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
   1546 			if(b == nil){
   1547 				fprint(2, "%s: warning: lost block in doRemoveLink\n",
   1548 					argv0);
   1549 				return;
   1550 			}
   1551 		}
   1552 	}
   1553 
   1554 	l = b->l;
   1555 	l.state |= BsClosed;
   1556 	l.epochClose = p->epoch;
   1557 	if(l.epochClose == l.epoch){
   1558 		qlock(&c->fl->lk);
   1559 		if(l.epoch == c->fl->epochLow)
   1560 			c->fl->nused--;
   1561 		blockSetLabel(b, &l, 0);
   1562 		qunlock(&c->fl->lk);
   1563 	}else
   1564 		blockSetLabel(b, &l, 0);
   1565 	blockPut(b);
   1566 }
   1567 
   1568 /*
   1569  * Allocate a BList so that we can record a dependency
   1570  * or queue a removal related to block b.
   1571  * If we can't find a BList, we write out b and return nil.
   1572  */
   1573 static BList *
   1574 blistAlloc(Block *b)
   1575 {
   1576 	Cache *c;
   1577 	BList *p;
   1578 
   1579 	if(b->iostate != BioDirty){
   1580 		/*
   1581 		 * should not happen anymore -
    1582 		 * blockDirty used to flush but no longer does.
   1583 		 */
   1584 		assert(b->iostate == BioClean);
   1585 		fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
   1586 		return nil;
   1587 	}
   1588 
   1589 	c = b->c;
   1590 	qlock(&c->lk);
   1591 	if(c->blfree == nil){
   1592 		/*
   1593 		 * No free BLists.  What are our options?
   1594 		 */
   1595 
   1596 		/* Block has no priors? Just write it. */
   1597 		if(b->prior == nil){
   1598 			qunlock(&c->lk);
   1599 			diskWriteAndWait(c->disk, b);
   1600 			return nil;
   1601 		}
   1602 
   1603 		/*
   1604 		 * Wake the flush thread, which will hopefully free up
   1605 		 * some BLists for us.  We used to flush a block from
   1606 		 * our own prior list and reclaim that BList, but this is
   1607 		 * a no-no: some of the blocks on our prior list may
   1608 		 * be locked by our caller.  Or maybe their label blocks
   1609 		 * are locked by our caller.  In any event, it's too hard
   1610 		 * to make sure we can do I/O for ourselves.  Instead,
   1611 		 * we assume the flush thread will find something.
   1612 		 * (The flush thread never blocks waiting for a block,
   1613 		 * so it can't deadlock like we can.)
   1614 		 */
   1615 		while(c->blfree == nil){
   1616 			rwakeup(&c->flush);
   1617 			rsleep(&c->blrend);
   1618 			if(c->blfree == nil)
   1619 				fprint(2, "%s: flushing for blists\n", argv0);
   1620 		}
   1621 	}
   1622 
   1623 	p = c->blfree;
   1624 	c->blfree = p->next;
   1625 	qunlock(&c->lk);
   1626 	return p;
   1627 }
   1628 
   1629 static void
   1630 blistFree(Cache *c, BList *bl)
   1631 {
   1632 	qlock(&c->lk);
   1633 	bl->next = c->blfree;
   1634 	c->blfree = bl;
   1635 	rwakeup(&c->blrend);
   1636 	qunlock(&c->lk);
   1637 }
   1638 
   1639 char*
   1640 bsStr(int state)
   1641 {
   1642 	static char s[100];
   1643 
   1644 	if(state == BsFree)
   1645 		return "Free";
   1646 	if(state == BsBad)
   1647 		return "Bad";
   1648 
   1649 	sprint(s, "%x", state);
   1650 	if(!(state&BsAlloc))
   1651 		strcat(s, ",Free");	/* should not happen */
   1652 	if(state&BsCopied)
   1653 		strcat(s, ",Copied");
   1654 	if(state&BsVenti)
   1655 		strcat(s, ",Venti");
   1656 	if(state&BsClosed)
   1657 		strcat(s, ",Closed");
   1658 	return s;
   1659 }
   1660 
   1661 char *
   1662 bioStr(int iostate)
   1663 {
   1664 	switch(iostate){
   1665 	default:
   1666 		return "Unknown!!";
   1667 	case BioEmpty:
   1668 		return "Empty";
   1669 	case BioLabel:
   1670 		return "Label";
   1671 	case BioClean:
   1672 		return "Clean";
   1673 	case BioDirty:
   1674 		return "Dirty";
   1675 	case BioReading:
   1676 		return "Reading";
   1677 	case BioWriting:
   1678 		return "Writing";
   1679 	case BioReadError:
   1680 		return "ReadError";
   1681 	case BioVentiError:
   1682 		return "VentiError";
   1683 	case BioMax:
   1684 		return "Max";
   1685 	}
   1686 }
   1687 
   1688 static char *bttab[] = {
   1689 	"BtData",
   1690 	"BtData+1",
   1691 	"BtData+2",
   1692 	"BtData+3",
   1693 	"BtData+4",
   1694 	"BtData+5",
   1695 	"BtData+6",
   1696 	"BtData+7",
   1697 	"BtDir",
   1698 	"BtDir+1",
   1699 	"BtDir+2",
   1700 	"BtDir+3",
   1701 	"BtDir+4",
   1702 	"BtDir+5",
   1703 	"BtDir+6",
   1704 	"BtDir+7",
   1705 };
   1706 
   1707 char*
   1708 btStr(int type)
   1709 {
   1710 	if(type < nelem(bttab))
   1711 		return bttab[type];
   1712 	return "unknown";
   1713 }
   1714 
   1715 int
   1716 labelFmt(Fmt *f)
   1717 {
   1718 	Label *l;
   1719 
   1720 	l = va_arg(f->args, Label*);
   1721 	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
   1722 		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
   1723 }
   1724 
   1725 int
   1726 scoreFmt(Fmt *f)
   1727 {
   1728 	uchar *v;
   1729 	int i;
   1730 	u32int addr;
   1731 
   1732 	v = va_arg(f->args, uchar*);
   1733 	if(v == nil){
   1734 		fmtprint(f, "*");
   1735 	}else if((addr = globalToLocal(v)) != NilBlock)
   1736 		fmtprint(f, "0x%.8ux", addr);
   1737 	else{
   1738 		for(i = 0; i < VtScoreSize; i++)
   1739 			fmtprint(f, "%2.2ux", v[i]);
   1740 	}
   1741 
   1742 	return 0;
   1743 }
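
/*
 * labelFmt and scoreFmt are print verbs for the fmt library; fossil
 * installs them at startup, before any of the %L and %V prints in
 * this file can run:
 *
 *	fmtinstall('L', labelFmt);	// %L: Label
 *	fmtinstall('V', scoreFmt);	// %V: Venti score or local address
 */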
   1744 
   1745 static int
   1746 upHeap(int i, Block *b)
   1747 {
   1748 	Block *bb;
   1749 	u32int now;
   1750 	int p;
   1751 	Cache *c;
   1752 
   1753 	c = b->c;
   1754 	now = c->now;
   1755 	for(; i != 0; i = p){
   1756 		p = (i - 1) >> 1;
   1757 		bb = c->heap[p];
   1758 		if(b->used - now >= bb->used - now)
   1759 			break;
   1760 		c->heap[i] = bb;
   1761 		bb->heap = i;
   1762 	}
   1763 	c->heap[i] = b;
   1764 	b->heap = i;
   1765 
   1766 	return i;
   1767 }
   1768 
   1769 static int
   1770 downHeap(int i, Block *b)
   1771 {
   1772 	Block *bb;
   1773 	u32int now;
   1774 	int k;
   1775 	Cache *c;
   1776 
   1777 	c = b->c;
   1778 	now = c->now;
   1779 	for(; ; i = k){
   1780 		k = (i << 1) + 1;
   1781 		if(k >= c->nheap)
   1782 			break;
   1783 		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
   1784 			k++;
   1785 		bb = c->heap[k];
   1786 		if(b->used - now <= bb->used - now)
   1787 			break;
   1788 		c->heap[i] = bb;
   1789 		bb->heap = i;
   1790 	}
   1791 	c->heap[i] = b;
   1792 	b->heap = i;
   1793 	return i;
   1794 }
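
/*
 * The "x->used - now" subtractions in upHeap and downHeap keep the
 * unsigned comparisons correct even after c->now wraps around.  A
 * worked example (a sketch; nothing calls heapClockCheck):
 */
static void
heapClockCheck(void)
{
	u32int now, older, newer;

	now = 5;			/* the counter has just wrapped */
	older = 0xfffffffeU;		/* stamped shortly before the wrap */
	newer = 3;			/* stamped after the wrap */
	assert(older - now < newer - now);	/* the older block still sorts first */
}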
   1795 
   1796 /*
   1797  * Delete a block from the heap.
   1798  * Called with c->lk held.
   1799  */
   1800 static void
   1801 heapDel(Block *b)
   1802 {
   1803 	int i, si;
   1804 	Cache *c;
   1805 
   1806 	c = b->c;
   1807 
   1808 	si = b->heap;
   1809 	if(si == BadHeap)
   1810 		return;
   1811 	b->heap = BadHeap;
   1812 	c->nheap--;
   1813 	if(si == c->nheap)
   1814 		return;
   1815 	b = c->heap[c->nheap];
   1816 	i = upHeap(si, b);
   1817 	if(i == si)
   1818 		downHeap(i, b);
   1819 }
   1820 
   1821 /*
   1822  * Insert a block into the heap.
   1823  * Called with c->lk held.
   1824  */
   1825 static void
   1826 heapIns(Block *b)
   1827 {
   1828 	assert(b->heap == BadHeap);
   1829 	upHeap(b->c->nheap++, b);
   1830 	rwakeup(&b->c->heapwait);
   1831 }
   1832 
   1833 /*
   1834  * Get just the label for a block.
   1835  */
   1836 int
   1837 readLabel(Cache *c, Label *l, u32int addr)
   1838 {
   1839 	int lpb;
   1840 	Block *b;
   1841 	u32int a;
   1842 
   1843 	lpb = c->size / LabelSize;
   1844 	a = addr / lpb;
   1845 	b = cacheLocal(c, PartLabel, a, OReadOnly);
   1846 	if(b == nil){
   1847 		blockPut(b);
   1848 		return 0;
   1849 	}
   1850 
   1851 	if(!labelUnpack(l, b->data, addr%lpb)){
   1852 		blockPut(b);
   1853 		return 0;
   1854 	}
   1855 	blockPut(b);
   1856 	return 1;
   1857 }
   1858 
   1859 /*
   1860  * Process unlink queue.
   1861  * Called with c->lk held.
   1862  */
   1863 static void
   1864 unlinkBody(Cache *c)
   1865 {
   1866 	BList *p;
   1867 
   1868 	while(c->uhead != nil){
   1869 		p = c->uhead;
   1870 		c->uhead = p->next;
   1871 		qunlock(&c->lk);
   1872 		doRemoveLink(c, p);
   1873 		qlock(&c->lk);
   1874 		p->next = c->blfree;
   1875 		c->blfree = p;
   1876 	}
   1877 }
   1878 
   1879 /*
   1880  * Occasionally unlink the blocks on the cache unlink queue.
   1881  */
   1882 static void
   1883 unlinkThread(void *a)
   1884 {
   1885 	Cache *c = a;
   1886 
   1887 	threadsetname("unlink");
   1888 
   1889 	qlock(&c->lk);
   1890 	for(;;){
   1891 		while(c->uhead == nil && c->die.l == nil)
   1892 			rsleep(&c->unlink);
   1893 		if(c->die.l != nil)
   1894 			break;
   1895 		unlinkBody(c);
   1896 	}
   1897 	c->ref--;
   1898 	rwakeup(&c->die);
   1899 	qunlock(&c->lk);
   1900 }
   1901 
   1902 static int
   1903 baddrCmp(const void *a0, const void *a1)
   1904 {
   1905 	BAddr *b0, *b1;
   1906 	b0 = (BAddr*)a0;
   1907 	b1 = (BAddr*)a1;
   1908 
   1909 	if(b0->part < b1->part)
   1910 		return -1;
   1911 	if(b0->part > b1->part)
   1912 		return 1;
   1913 	if(b0->addr < b1->addr)
   1914 		return -1;
   1915 	if(b0->addr > b1->addr)
   1916 		return 1;
   1917 	return 0;
   1918 }
   1919 
   1920 /*
   1921  * Scan the block list for dirty blocks; add them to the list c->baddr.
   1922  */
   1923 static void
   1924 flushFill(Cache *c)
   1925 {
   1926 	int i, ndirty;
   1927 	BAddr *p;
   1928 	Block *b;
   1929 
   1930 	qlock(&c->lk);
   1931 	if(c->ndirty == 0){
   1932 		qunlock(&c->lk);
   1933 		return;
   1934 	}
   1935 
   1936 	p = c->baddr;
   1937 	ndirty = 0;
   1938 	for(i=0; i<c->nblocks; i++){
   1939 		b = c->blocks + i;
   1940 		if(b->part == PartError)
   1941 			continue;
   1942 		if(b->iostate == BioDirty || b->iostate == BioWriting)
   1943 			ndirty++;
   1944 		if(b->iostate != BioDirty)
   1945 			continue;
   1946 		p->part = b->part;
   1947 		p->addr = b->addr;
   1948 		p->vers = b->vers;
   1949 		p++;
   1950 	}
   1951 	if(ndirty != c->ndirty){
   1952 		fprint(2, "%s: ndirty mismatch expected %d found %d\n",
   1953 			argv0, c->ndirty, ndirty);
   1954 		c->ndirty = ndirty;
   1955 	}
   1956 	qunlock(&c->lk);
   1957 
   1958 	c->bw = p - c->baddr;
   1959 	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
   1960 }
   1961 
   1962 /*
   1963  * This is not thread safe, i.e. it can't be called from multiple threads.
   1964  *
    1965  * That's okay here, because it is only called from the flushThread,
    1966  * and from cacheFree, but only after cacheFree has killed off
    1967  * the flushThread.
   1968  */
   1969 static int
   1970 cacheFlushBlock(Cache *c)
   1971 {
   1972 	Block *b;
   1973 	BAddr *p;
   1974 	int lockfail, nfail;
   1975 
   1976 	nfail = 0;
   1977 	for(;;){
   1978 		if(c->br == c->be){
   1979 			if(c->bw == 0 || c->bw == c->be)
   1980 				flushFill(c);
   1981 			c->br = 0;
   1982 			c->be = c->bw;
   1983 			c->bw = 0;
   1984 			c->nflush = 0;
   1985 		}
   1986 
   1987 		if(c->br == c->be)
   1988 			return 0;
   1989 		p = c->baddr + c->br;
   1990 		c->br++;
   1991 		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, Nowaitlock,
   1992 			&lockfail);
   1993 
   1994 		if(b && blockWrite(b, Nowaitlock)){
   1995 			c->nflush++;
   1996 			blockPut(b);
   1997 			return 1;
   1998 		}
   1999 		if(b)
   2000 			blockPut(b);
   2001 
   2002 		/*
   2003 		 * Why didn't we write the block?
   2004 		 */
   2005 
   2006 		/* Block already written out */
   2007 		if(b == nil && !lockfail)
   2008 			continue;
   2009 
   2010 		/* Failed to acquire lock; sleep if happens a lot. */
   2011 		if(lockfail && ++nfail > 100){
   2012 			sleep(500);
   2013 			nfail = 0;
   2014 		}
   2015 		/* Requeue block. */
   2016 		if(c->bw < c->be)
   2017 			c->baddr[c->bw++] = *p;
   2018 	}
   2019 }
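
/*
 * The cursors above: c->baddr[c->br..c->be) is the sorted batch being
 * flushed now, and c->baddr[0..c->bw) accumulates blocks that had to
 * be requeued (still blocked by dependencies, or lock contention);
 * they become the next batch once the current one is exhausted.
 */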
   2020 
   2021 /*
   2022  * Occasionally flush dirty blocks from memory to the disk.
   2023  */
   2024 static void
   2025 flushThread(void *a)
   2026 {
   2027 	Cache *c = a;
   2028 	int i;
   2029 
   2030 	threadsetname("flush");
   2031 	qlock(&c->lk);
   2032 	while(c->die.l == nil){
   2033 		rsleep(&c->flush);
   2034 		qunlock(&c->lk);
   2035 		for(i=0; i<FlushSize; i++)
   2036 			if(!cacheFlushBlock(c)){
   2037 				/*
   2038 				 * If i==0, could be someone is waking us repeatedly
   2039 				 * to flush the cache but there's no work to do.
   2040 				 * Pause a little.
   2041 				 */
   2042 				if(i==0){
   2043 					// fprint(2, "%s: flushthread found "
   2044 					//	"nothing to flush - %d dirty\n",
   2045 					//	argv0, c->ndirty);
   2046 					sleep(250);
   2047 				}
   2048 				break;
   2049 			}
   2050 		if(i==0 && c->ndirty){
   2051 			/*
   2052 			 * All the blocks are being written right now -- there's nothing to do.
   2053 			 * We might be spinning with cacheFlush though -- he'll just keep
   2054 			 * kicking us until c->ndirty goes down.  Probably we should sleep
   2055 			 * on something that the diskThread can kick, but for now we'll
   2056 			 * just pause for a little while waiting for disks to finish.
   2057 			 */
   2058 			sleep(100);
   2059 		}
   2060 		qlock(&c->lk);
   2061 		rwakeupall(&c->flushwait);
   2062 	}
   2063 	c->ref--;
   2064 	rwakeup(&c->die);
   2065 	qunlock(&c->lk);
   2066 }
   2067 
   2068 /*
   2069  * Flush the cache.
   2070  */
   2071 void
   2072 cacheFlush(Cache *c, int wait)
   2073 {
   2074 	qlock(&c->lk);
   2075 	if(wait){
   2076 		while(c->ndirty){
   2077 		//	consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
   2078 		//		c->ndirty, c->uhead);
   2079 			rwakeup(&c->flush);
   2080 			rsleep(&c->flushwait);
   2081 		}
    2082 	//	consPrint("cacheFlush: done (%d dirty, uhead %p)\n", c->ndirty, c->uhead);
   2083 	}else if(c->ndirty)
   2084 		rwakeup(&c->flush);
   2085 	qunlock(&c->lk);
   2086 }
   2087 
   2088 /*
   2089  * Kick the flushThread every 30 seconds.
   2090  */
   2091 static void
   2092 cacheSync(void *v)
   2093 {
   2094 	Cache *c;
   2095 
   2096 	c = v;
   2097 	cacheFlush(c, 0);
   2098 }