cache.c (43774B)
#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */

struct Cache
{
	QLock	lk;
	int	ref;
	int	mode;

	Disk	*disk;
	int	size;		/* block size */
	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
	VtConn	*z;
	u32int	now;		/* ticks for usage timestamps */
	Block	**heads;	/* hash table for finding address */
	int	nheap;		/* number of available victims */
	Block	**heap;		/* heap for locating victims */
	long	nblocks;	/* number of blocks allocated */
	Block	*blocks;	/* array of block descriptors */
	u8int	*mem;		/* memory for all block data & blists */

	BList	*blfree;
	Rendez	blrend;

	int	ndirty;		/* number of dirty blocks in the cache */
	int	maxdirty;	/* max number of dirty blocks */
	u32int	vers;

	long	hashSize;

	FreeList *fl;

	Rendez	die;		/* daemon threads should die when QLock != nil */

	Rendez	flush;
	Rendez	flushwait;
	Rendez	heapwait;
	BAddr	*baddr;
	int	bw, br, be;
	int	nflush;

	Periodic *sync;

	/* unlink daemon */
	BList	*uhead;
	BList	*utail;
	Rendez	unlink;

	/* block counts */
	int	nused;
	int	ndisk;
};

struct BList {
	int	part;
	u32int	addr;
	uchar	type;
	u32int	tag;
	u32int	epoch;
	u32int	vers;

	int	recurse;	/* for block unlink */

	/* for roll back */
	int	index;		/* -1 indicates not valid */
	union {
		uchar	score[VtScoreSize];
		uchar	entry[VtEntrySize];
	} old;
	BList	*next;
};

struct BAddr {
	int	part;
	u32int	addr;
	u32int	vers;
};

struct FreeList {
	QLock	lk;
	u32int	last;		/* last block allocated */
	u32int	end;		/* end of data partition */
	u32int	nused;		/* number of used blocks */
	u32int	epochLow;	/* low epoch when last updated nused */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);
static BList *blistAlloc(Block*);
static void blistFree(Cache*, BList*);
static void doRemoveLink(Cache*, BList*);

/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,	/* BtData | 0 */
	VtDataType+1,	/* BtData | 1 */
	VtDataType+2,	/* BtData | 2 */
	VtDataType+3,	/* BtData | 3 */
	VtDataType+4,	/* BtData | 4 */
	VtDataType+5,	/* BtData | 5 */
	VtDataType+6,	/* BtData | 6 */
	VtDataType+7,	/* BtData | 7 */
	VtDirType,	/* BtDir | 0 */
	VtDirType+1,	/* BtDir | 1 */
	VtDirType+2,	/* BtDir | 2 */
	VtDirType+3,	/* BtDir | 3 */
	VtDirType+4,	/* BtDir | 4 */
	VtDirType+5,	/* BtDir | 5 */
	VtDirType+6,	/* BtDir | 6 */
	VtDirType+7,	/* BtDir | 7 */
};

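/*
 * Rough reading of the table above: the low bits of BtData|n and
 * BtDir|n appear to give a block's height in its tree (0 for leaves,
 * 1 for the pointer blocks directly above them, and so on), and each
 * local type maps onto the Venti type with the same height, so tree
 * shape is preserved when blocks are archived to Venti.
 */
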
/*
 * Allocate the memory cache.
 */
Cache *
cacheAlloc(Disk *disk, VtConn *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtmallocz(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
	bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtmallocz(c->hashSize*sizeof(Block*));
	c->heap = vtmallocz(nblocks*sizeof(Block*));
	c->blocks = vtmallocz(nblocks*sizeof(Block));
	c->mem = vtmallocz(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtmallocz(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready.l = &b->lk;
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend.l = &c->lk;

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink.l = &c->lk;
	c->flush.l = &c->lk;
	c->flushwait.l = &c->lk;
	c->heapwait.l = &c->lk;
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		proccreate(unlinkThread, c, STACK);
		proccreate(flushThread, c, STACK);
	}
	cacheCheck(c);

	return c;
}

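/*
 * A rough picture of the single arena allocated into c->mem above
 * (the layout follows directly from the loops in cacheAlloc):
 *
 *	nblocks blocks of c->size bytes of block data
 *	nbl (= 4*nblocks) free BList records
 *	nblocks dirty maps of c->ndmap bytes each
 *
 * For example (hypothetical numbers): with nblocks=1000 and an
 * 8192-byte block size, c->ndmap is (8192/20 + 7)/8 = 52, so the
 * arena is 1000*8192 + 4000*sizeof(BList) + 1000*52 bytes.
 */
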
/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	qlock(&c->lk);
	c->die.l = &c->lk;
	periodicKill(c->sync);
	rwakeup(&c->flush);
	rwakeup(&c->unlink);
	while(c->ref > 1)
		rsleep(&c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		qunlock(&c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		qlock(&c->lk);
	} while(c->uhead || c->ndirty);
	qunlock(&c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
	}
	flFree(c->fl);
	vtfree(c->baddr);
	vtfree(c->heads);
	vtfree(c->heap);
	vtfree(c->blocks);
	vtfree(c->mem);
	diskFree(c->disk);
	/* don't close vtSession */
	vtfree(c);
}

static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
			i, b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate), b->pc);
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			sysfatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			sysfatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			sysfatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			sysfatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			sysfatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
	if(c->nheap + refed != c->nblocks){
		fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
		cacheDump(c);
	}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
			if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
	if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
}

/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	int printed;
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	printed = 0;
	if(c->nheap == 0){
		while(c->nheap == 0){
			rwakeup(&c->flush);
			rsleep(&c->heapwait);
			if(c->nheap == 0){
				printed = 1;
				fprint(2, "%s: entire cache is busy, %d dirty "
					"-- waking flush thread\n",
					argv0, c->ndirty);
			}
		}
		if(printed)
			fprint(2, "%s: cache is okay again, %d dirty\n",
				argv0, c->ndirty);
	}

	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}

	if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}

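/*
 * A note on the hash chains manipulated above and in _cacheLocal and
 * cacheGlobal below: b->prev is not a pointer to the previous block
 * but to the pointer that points at b (either c->heads[h] or the
 * previous block's next field), so unlinking a block is simply
 * *(b->prev) = b->next, with no special case for the head of a chain.
 */
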
/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	qlock(&c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		qunlock(&c->lk);
		return nil;
	}
	if(!waitlock && !canqlock(&b->lk)){
		*lockfailure = 1;
		qunlock(&c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	qunlock(&c->lk);

	bwatchLock(b);
	if(waitlock)
		qlock(&b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			rsleep(&b->ioready);
			break;
		case BioVentiError:
			blockPut(b);
			werrstr("venti i/o error block 0x%.8ux", addr);
			return nil;
		case BioReadError:
			blockPut(b);
			werrstr("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

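/*
 * The locking pattern in _cacheLocalLookup above is the one used
 * throughout this file (as I read it): c->lk protects the hash
 * chains, the heap, and b->ref; the per-block b->lk protects the
 * block's contents.  A lookup takes c->lk, bumps b->ref, takes the
 * block off the heap, releases c->lk, and only then acquires b->lk
 * (or try-locks it with canqlock), so no thread ever sleeps on a
 * block lock while holding the cache lock.
 */
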
/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	qlock(&c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
			fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
			qunlock(&c->lk);
			werrstr(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	qunlock(&c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

	if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
	bwatchLock(b);
	qlock(&b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
		fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
		werrstr(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioLabel:
			if(mode == OOverWrite)
				/*
				 * leave iostate as BioLabel because data
				 * hasn't been read.
				 */
				return b;
			/* fall through */
		case BioEmpty:
			diskRead(c->disk, b);
			rsleep(&b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			rsleep(&b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			werrstr("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
			argv0, addr, b->l.type, type, b->l.tag, tag);
		werrstr(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}

/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}

	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	qlock(&c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	qunlock(&c->lk);

	bwatchLock(b);
	qlock(&b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtread(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || vtsha1check(score, b->data, n) < 0){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			werrstr(
			"venti error reading block %V or wrong score: %r",
				score);
			return nil;
		}
		vtzeroextend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
		blockPut(b);
		werrstr("venti i/o error or wrong score, block %V", score);
		return nil;
	case BioReadError:
		blockPut(b);
		werrstr("error reading block %V", b->score);
		return nil;
	}
	/* NOT REACHED */
}

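/*
 * Note on the BioEmpty case in cacheGlobal above, tying back to the
 * comment at the top of the file: Venti stores blocks zero truncated,
 * so a read may return fewer than c->size bytes; vtzeroextend pads
 * the tail with zeros so every cached block is exactly c->size bytes,
 * and vtsha1check verifies that the data actually matches the
 * requested score before it is trusted.
 */
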
/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
 */
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

	qlock(&fl->lk);
	addr = fl->last;
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx %r\n", argv0);
		qunlock(&fl->lk);
		return nil;
	}
	nwrap = 0;
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			if(++nwrap >= 2){
				blockPut(b);
				werrstr("disk is full");
				/*
				 * try to avoid a continuous spew of console
				 * messages.
				 */
				if (fl->last != 0)
					fprint(2, "%s: cacheAllocBlock: xxx1 %r\n",
						argv0);
				fl->last = 0;
				qunlock(&fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "%s: cacheAllocBlock: xxx2 %r\n", argv0);
				qunlock(&fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx3 %r\n", argv0);
		return nil;
	}
	assert(b->iostate == BioLabel || b->iostate == BioClean);
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab, 1)){
		fprint(2, "%s: cacheAllocBlock: xxx4 %r\n", argv0);
		blockPut(b);
		return nil;
	}
	vtzeroextend(vtType[type], b->data, 0, c->size);
	if(0)diskWrite(c->disk, b);

	if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	qunlock(&fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}

int
cacheDirty(Cache *c)
{
	return c->ndirty;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	qlock(&fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		qunlock(&fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "%s: flCountUsed: loading %ux: %r\n",
					argv0, addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	qunlock(&fl->lk);
	return;
}

static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtmallocz(sizeof(*fl));
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtfree(fl);
}

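/*
 * The label arithmetic used in cacheAllocBlock and cacheCountUsed
 * above (and readLabel below), spelled out: with n = c->size/LabelSize
 * labels packed per label block, data block addr has its label in
 * label block addr/n at offset addr%n.  For example, with an 8K block
 * and (assuming) a 14-byte Label, n is 585, so the label for data
 * block 1000 lives in label block 1 at index 415.
 */
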
u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}

/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;

	if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and rsleep
	 * conspire to assume that they can just qlock(&b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	qunlock(&b->lk);
	c = b->c;
	qlock(&c->lk);

	if(--b->ref > 0){
		qunlock(&c->lk);
		return;
	}

	assert(b->ref == 0);
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	qunlock(&c->lk);
}

/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;

	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}

int
blockSetLabel(Block *b, Label *l, int allocating)
{
	Block *lb;

	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (bl)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *		BaActive vs. BaCopied, but that's handled
	 *		by direct calls to _blockSetLabel.
	 */

	if(allocating)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}

/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/entry
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
			argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
	if(p == nil)
		return;

	assert(bb->iostate == BioDirty);
	if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.
 *
 * If there were way too many dirty blocks, we used to
 * try to do some flushing ourselves, but it's just too dangerous --
 * it implies that the callers cannot have any of our priors locked,
 * but this is hard to avoid in some cases.
 */
int
blockDirty(Block *b)
{
	Cache *c;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean || b->iostate == BioLabel);

	qlock(&c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		rwakeup(&c->flush);
	qunlock(&c->lk);

	return 1;
}

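/*
 * A concrete (hypothetical) picture of the prior/rollback machinery
 * around blockDependency above and blockRollback below: suppose a
 * pointer block p is updated so that entry i now names a freshly
 * allocated child b.  blockDependency(p, b, i, oldscore, nil) puts a
 * BList on p->prior remembering index i and the old score.  If the
 * flush thread wants to write p while b is still dirty, blockRollback
 * copies p's data and puts the old score back at index i, so the
 * on-disk p never points at a child that has not reached disk yet.
 */
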
/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "%s: rolling back super block: "
					"bad replacement addr %V\n",
					argv0, p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}

/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
int
blockWrite(Block *b, int waitlock)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, waitlock,
			&lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
				argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
			if(bb->iostate == BioClean)
				goto ignblock;
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

	ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	/*
	 * DiskWrite must never be called with a double-locked block.
	 * This call to diskWrite is okay because blockWrite is only called
	 * from the cache flush thread, which never double-locks a block.
	 */
	diskWrite(c->disk, b);
	return 1;
}

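/*
 * Rough sizing note for the dmap bitmap used in blockWrite above: it
 * needs one bit per possible dependency index in a block, and the
 * densest case is a pointer block full of scores, so cacheAlloc sizes
 * it as (c->size/20 + 7)/8 bytes -- the 20 presumably being
 * VtScoreSize.  For an 8K block that is 52 bytes, covering up to 409
 * score slots.
 */
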
/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

	if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			qlock(&c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					rwakeup(&c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			qunlock(&c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			qlock(&c->lk);
			b->vers = c->vers++;
			qunlock(&c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		qlock(&c->lk);
		b->ref++;
		qunlock(&c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		rwakeupall(&b->ioready);
}

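/*
 * For orientation, the usual iostate transitions (a summary inferred
 * from the code above, not an exhaustive list):
 *
 *	BioEmpty -> BioLabel		label read in, data still unread
 *	BioEmpty/BioLabel -> BioReading	queued for diskRead
 *	BioReading -> BioClean		read completed
 *	BioClean/BioLabel -> BioDirty	blockDirty
 *	BioDirty -> BioWriting		queued for diskWrite
 *	BioWriting -> BioClean		write completed
 *	BioWriting -> BioDirty		wrote a rolled-back copy; still dirty
 *	-> BioReadError/BioVentiError	failed disk or Venti I/O
 */
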
/*
 * The active file system is a tree of blocks.
 * When we add snapshots to the mix, the entire file system
 * becomes a dag and thus requires a bit more care.
 *
 * The life of the file system is divided into epochs.  A snapshot
 * ends one epoch and begins the next.  Each file system block
 * is marked with the epoch in which it was created (b.epoch).
 * When the block is unlinked from the file system (closed), it is marked
 * with the epoch in which it was removed (b.epochClose).
 * Once we have discarded or archived all snapshots up to
 * b.epochClose, we can reclaim the block.
 *
 * If a block was created in a past epoch but is not yet closed,
 * it is treated as copy-on-write.  Of course, in order to insert the
 * new pointer into the tree, the parent must be made writable,
 * and so on up the tree.  The recursion stops because the root
 * block is always writable.
 *
 * If blocks are never closed, they will never be reused, and
 * we will run out of disk space.  But marking a block as closed
 * requires some care about dependencies and write orderings.
 *
 * (1) If a block p points at a copy-on-write block b and we
 * copy b to create bb, then p must be written out after bb and
 * lbb (bb's label block).
 *
 * (2) We have to mark b as closed, but only after we switch
 * the pointer, so lb must be written out after p.  In fact, we
 * can't even update the in-memory copy, or the cache might
 * mistakenly give out b for reuse before p gets written.
 *
 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
 * The caller is expected to record a "p after bb" dependency
 * to finish (1), and also expected to call blockRemoveLink
 * to arrange for (2) to happen once p is written.
 *
 * Until (2) happens, some pieces of the code (e.g., the archiver)
 * still need to know whether a block has been copied, so we
 * set the BsCopied bit in the label and force that to disk *before*
 * the copy gets written out.
 */
Block*
blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
{
	Block *bb, *lb;
	Label l;

	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
		fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
			argv0, b->addr, &b->l, elo, ehi);

	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
	if(bb == nil){
		blockPut(b);
		return nil;
	}

	/*
	 * Update label so we know the block has been copied.
	 * (It will be marked closed once it has been unlinked from
	 * the tree.)  This must follow cacheAllocBlock since we
	 * can't be holding onto lb when we call cacheAllocBlock.
	 */
	if((b->l.state&BsCopied)==0)
	if(b->part == PartData){	/* not the superblock */
		l = b->l;
		l.state |= BsCopied;
		lb = _blockSetLabel(b, &l);
		if(lb == nil){
			/* can't set label => can't copy block */
			blockPut(b);
			l.type = BtMax;
			l.state = BsFree;
			l.epoch = 0;
			l.epochClose = 0;
			l.tag = 0;
			blockSetLabel(bb, &l, 0);
			blockPut(bb);
			return nil;
		}
		blockDependency(bb, lb, -1, nil, nil);
		blockPut(lb);
	}

	memmove(bb->data, b->data, b->c->size);
	blockDirty(bb);
	blockPut(b);
	return bb;
}

/*
 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
 * If recurse is set, we are unlinking all of bb's children as well.
 *
 * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
 * the relevant information to b's list of unlinked blocks.  Once b is written,
 * the list will be queued for processing.
 *
 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
 */
void
blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
{
	BList *p, **pp, bl;

	/* remove bb from prior list */
	for(pp=&b->prior; (p=*pp)!=nil; ){
		if(p->part == PartData && p->addr == addr){
			*pp = p->next;
			blistFree(b->c, p);
		}else
			pp = &p->next;
	}

	bl.part = PartData;
	bl.addr = addr;
	bl.type = type;
	bl.tag = tag;
	if(b->l.epoch == 0)
		assert(b->part == PartSuper);
	bl.epoch = b->l.epoch;
	bl.next = nil;
	bl.recurse = recurse;

	if(b->part == PartSuper && b->iostate == BioClean)
		p = nil;
	else
		p = blistAlloc(b);
	if(p == nil){
		/*
		 * b has already been written to disk.
		 */
		doRemoveLink(b->c, &bl);
		return;
	}

	/* Uhead is only processed when the block goes from Dirty -> Clean */
	assert(b->iostate == BioDirty);

	*p = bl;
	if(b->uhead == nil)
		b->uhead = p;
	else
		b->utail->next = p;
	b->utail = p;
}

/*
 * Process removal of a single block and perhaps its children.
 */
static void
doRemoveLink(Cache *c, BList *p)
{
	int i, n, recurse;
	u32int a;
	Block *b;
	Label l;
	BList bl;

	recurse = (p->recurse && p->type != BtData && p->type != BtDir);

	/*
	 * We're not really going to overwrite b, but if we're not
	 * going to look at its contents, there is no point in reading
	 * them from the disk.
	 */
	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
	if(b == nil)
		return;

	/*
	 * When we're unlinking from the superblock, close with the next epoch.
	 */
	if(p->epoch == 0)
		p->epoch = b->l.epoch+1;

	/* sanity check */
	if(b->l.epoch > p->epoch){
		fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
			argv0, b->l.epoch, p->epoch);
		blockPut(b);
		return;
	}

	if(recurse){
		n = c->size / VtScoreSize;
		for(i=0; i<n; i++){
			a = globalToLocal(b->data + i*VtScoreSize);
			if(a == NilBlock || !readLabel(c, &l, a))
				continue;
			if(l.state&BsClosed)
				continue;
			/*
			 * If stack space becomes an issue...
			p->addr = a;
			p->type = l.type;
			p->tag = l.tag;
			doRemoveLink(c, p);
			 */

			bl.part = PartData;
			bl.addr = a;
			bl.type = l.type;
			bl.tag = l.tag;
			bl.epoch = p->epoch;
			bl.next = nil;
			bl.recurse = 1;
			/* give up the block lock - share with others */
			blockPut(b);
			doRemoveLink(c, &bl);
			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
			if(b == nil){
				fprint(2, "%s: warning: lost block in doRemoveLink\n",
					argv0);
				return;
			}
		}
	}

	l = b->l;
	l.state |= BsClosed;
	l.epochClose = p->epoch;
	if(l.epochClose == l.epoch){
		qlock(&c->fl->lk);
		if(l.epoch == c->fl->epochLow)
			c->fl->nused--;
		blockSetLabel(b, &l, 0);
		qunlock(&c->fl->lk);
	}else
		blockSetLabel(b, &l, 0);
	blockPut(b);
}

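/*
 * Why the special case on l.epochClose == l.epoch above (my reading
 * of it): a block created and closed in the same epoch never became
 * visible in any snapshot, so cacheAllocBlock will treat it as
 * immediately reusable; decrementing fl->nused under fl->lk (when the
 * epoch matches the cached epochLow) keeps the cached free-space
 * estimate in step with that.
 */
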
/*
 * Allocate a BList so that we can record a dependency
 * or queue a removal related to block b.
 * If we can't find a BList, we write out b and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	if(b->iostate != BioDirty){
		/*
		 * should not happen anymore -
		 * blockDirty used to flush but no longer does.
		 */
		assert(b->iostate == BioClean);
		fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
		return nil;
	}

	c = b->c;
	qlock(&c->lk);
	if(c->blfree == nil){
		/*
		 * No free BLists.  What are our options?
		 */

		/* Block has no priors? Just write it. */
		if(b->prior == nil){
			qunlock(&c->lk);
			diskWriteAndWait(c->disk, b);
			return nil;
		}

		/*
		 * Wake the flush thread, which will hopefully free up
		 * some BLists for us.  We used to flush a block from
		 * our own prior list and reclaim that BList, but this is
		 * a no-no: some of the blocks on our prior list may
		 * be locked by our caller.  Or maybe their label blocks
		 * are locked by our caller.  In any event, it's too hard
		 * to make sure we can do I/O for ourselves.  Instead,
		 * we assume the flush thread will find something.
		 * (The flush thread never blocks waiting for a block,
		 * so it can't deadlock like we can.)
		 */
		while(c->blfree == nil){
			rwakeup(&c->flush);
			rsleep(&c->blrend);
			if(c->blfree == nil)
				fprint(2, "%s: flushing for blists\n", argv0);
		}
	}

	p = c->blfree;
	c->blfree = p->next;
	qunlock(&c->lk);
	return p;
}

static void
blistFree(Cache *c, BList *bl)
{
	qlock(&c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	rwakeup(&c->blrend);
	qunlock(&c->lk);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}

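/*
 * The heap routines below order blocks by age of last use, but the
 * comparisons are written as (b->used - now) on unsigned values
 * rather than b->used < bb->used.  A sketch of why (assuming the
 * usual unsigned wraparound arithmetic): c->now is a monotonically
 * increasing tick that may wrap, and subtracting the current tick
 * first makes the comparison measure distance back in time, so the
 * ordering stays correct across the wrap.
 */
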
static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}

static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}

/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
	rwakeup(&b->c->heapwait);
}

/*
 * Get just the label for a block.
 */
int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		qunlock(&c->lk);
		doRemoveLink(c, p);
		qlock(&c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	threadsetname("unlink");

	qlock(&c->lk);
	for(;;){
		while(c->uhead == nil && c->die.l == nil)
			rsleep(&c->unlink);
		if(c->die.l != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	rwakeup(&c->die);
	qunlock(&c->lk);
}

static int
baddrCmp(const void *a0, const void *a1)
{
	BAddr *b0, *b1;
	b0 = (BAddr*)a0;
	b1 = (BAddr*)a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	qlock(&c->lk);
	if(c->ndirty == 0){
		qunlock(&c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "%s: ndirty mismatch expected %d found %d\n",
			argv0, c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	qunlock(&c->lk);

	c->bw = p - c->baddr;
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}

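/*
 * How the flush batch above is consumed (a summary of the scheme, as
 * far as the code shows it): flushFill snapshots the addresses of all
 * dirty blocks into c->baddr and sorts them by partition and address,
 * so cacheFlushBlock below hands them to the disk in roughly
 * ascending order.  c->br and c->be index the current batch, and
 * c->bw collects requeued entries for the next pass.
 */
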
/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * It's okay how we use it, because it only gets called in
 * the flushThread.  And cacheFree, but only after
 * cacheFree has killed off the flushThread.
 */
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
	for(;;){
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;
		p = c->baddr + c->br;
		c->br++;
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, Nowaitlock,
			&lockfail);

		if(b && blockWrite(b, Nowaitlock)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100){
			sleep(500);
			nfail = 0;
		}
		/* Requeue block. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
}

/*
 * Occasionally flush dirty blocks from memory to the disk.
 */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	threadsetname("flush");
	qlock(&c->lk);
	while(c->die.l == nil){
		rsleep(&c->flush);
		qunlock(&c->lk);
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0){
					// fprint(2, "%s: flushthread found "
					//	"nothing to flush - %d dirty\n",
					//	argv0, c->ndirty);
					sleep(250);
				}
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		qlock(&c->lk);
		rwakeupall(&c->flushwait);
	}
	c->ref--;
	rwakeup(&c->die);
	qunlock(&c->lk);
}

/*
 * Flush the cache.
 */
void
cacheFlush(Cache *c, int wait)
{
	qlock(&c->lk);
	if(wait){
		while(c->ndirty){
			// consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
			//	c->ndirty, c->uhead);
			rwakeup(&c->flush);
			rsleep(&c->flushwait);
		}
		// consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else if(c->ndirty)
		rwakeup(&c->flush);
	qunlock(&c->lk);
}

/*
 * Kick the flushThread every 30 seconds.
 */
static void
cacheSync(void *v)
{
	Cache *c;

	c = v;
	cacheFlush(c, 0);
}