dcache.c (15772B)
/*
 * Disk cache.
 *
 * Caches raw disk blocks. Getdblock() gets a block, putdblock() puts it back.
 * Getdblock has a mode parameter that determines i/o and access to a block:
 * if mode is OREAD or ORDWR, it is read from disk if not already in memory.
 * If mode is ORDWR or OWRITE, it is locked for exclusive use before being returned.
 * It is *not* marked dirty -- once changes have been made, they should be noted
 * by using dirtydblock() before putdblock().
 *
 * There is a global cache lock as well as a lock on each block.
 * Within a thread, the cache lock can be acquired while holding a block lock,
 * but not vice versa; and a block cannot be locked if you already hold the lock
 * on another block.
 *
 * The flush proc writes out dirty blocks in batches, one batch per dirty tag.
 * For example, the DirtyArena blocks are all written to disk before any of the
 * DirtyArenaCib blocks.
 *
 * This code used to be in charge of flushing the dirty index blocks out to
 * disk, but updating the index turned out to benefit from extra care.
 * Now cached index blocks are never marked dirty. The index.c code takes
 * care of updating them behind our back, and uses _getdblock to update any
 * cached copies of the blocks as it changes them on disk.
 */

#include "stdinc.h"
#include "dat.h"
#include "fns.h"

typedef struct DCache DCache;

enum
{
	HashLog		= 9,
	HashSize	= 1<<HashLog,
	HashMask	= HashSize - 1,
};

struct DCache
{
	QLock	lock;
	RWLock	dirtylock;	/* must be held to inspect or set b->dirty */
	Rendez	full;
	Round	round;
	DBlock	*free;		/* list of available lumps */
	u32int	now;		/* ticks for usage timestamps */
	int	size;		/* max. size of any block; allocated to each block */
	DBlock	**heads;	/* hash table for finding address */
	int	nheap;		/* number of available victims */
	DBlock	**heap;		/* heap for locating victims */
	int	nblocks;	/* number of blocks allocated */
	DBlock	*blocks;	/* array of block descriptors */
	DBlock	**write;	/* array of block pointers to be written */
	u8int	*mem;		/* memory for all block descriptors */
	int	ndirty;		/* number of dirty blocks */
	int	maxdirty;	/* max. number of dirty blocks */
};

typedef struct Ra Ra;
struct Ra
{
	Part	*part;
	u64int	addr;
};

static DCache	dcache;

static int	downheap(int i, DBlock *b);
static int	upheap(int i, DBlock *b);
static DBlock	*bumpdblock(void);
static void	delheap(DBlock *db);
static void	fixheap(int i, DBlock *b);
static void	flushproc(void*);
static void	writeproc(void*);

void
initdcache(u32int mem)
{
	DBlock *b, *last;
	u32int nblocks, blocksize;
	int i;
	u8int *p;

	if(mem < maxblocksize * 2)
		sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2);
	if(maxblocksize == 0)
		sysfatal("no max. block size given for disk cache");
	blocksize = maxblocksize;
	nblocks = mem / blocksize;
	dcache.full.l = &dcache.lock;
	dcache.nblocks = nblocks;
	dcache.maxdirty = (nblocks * 2) / 3;
	trace(TraceProc, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n",
		nblocks, blocksize, dcache.maxdirty);
	dcache.size = blocksize;
	dcache.heads = MKNZ(DBlock*, HashSize);
	dcache.heap = MKNZ(DBlock*, nblocks);
	dcache.blocks = MKNZ(DBlock, nblocks);
	dcache.write = MKNZ(DBlock*, nblocks);
	dcache.mem = MKNZ(u8int, (nblocks+1+128) * blocksize);

	last = nil;
	p = (u8int*)(((uintptr)dcache.mem+blocksize-1)&~(uintptr)(blocksize-1));
	for(i = 0; i < nblocks; i++){
		b = &dcache.blocks[i];
		b->data = &p[i * blocksize];
		b->heap = TWID32;
		b->writedonechan = chancreate(sizeof(void*), 1);
		b->next = last;
		last = b;
	}

	dcache.free = last;
	dcache.nheap = 0;
	setstat(StatDcacheSize, nblocks);
	initround(&dcache.round, "dcache", 120*1000);

	vtproc(flushproc, nil);
	vtproc(delaykickroundproc, &dcache.round);
}
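
/*
 * Illustrative sketch (not called anywhere) of the calling convention
 * described in the comment at the top of this file; exampleupdate and
 * its arguments are hypothetical.  A caller that wants to modify a
 * block gets it ORDWR, changes the cached data, notes the change with
 * dirtydblock(), and only then puts the block back:
 */
static void
exampleupdate(Part *part, u64int addr)
{
	DBlock *b;

	b = getdblock(part, addr, ORDWR);	/* read from disk if needed; wlocked */
	if(b == nil)
		return;
	b->data[0] ^= 1;			/* modify the cached copy */
	dirtydblock(b, DirtyArena);		/* note the change... */
	putdblock(b);				/* ...before releasing the block */
}
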
static u32int
pbhash(u64int addr)
{
	u32int h;

#define hashit(c) ((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask)
	h = (addr >> 32) ^ addr;
	return hashit(h);
}

DBlock*
getdblock(Part *part, u64int addr, int mode)
{
	DBlock *b;

	b = _getdblock(part, addr, mode, 1);
	if(mode == OREAD || mode == ORDWR)
		addstat(StatDcacheRead, 1);
	if(mode == OWRITE || mode == ORDWR)
		addstat(StatDcacheWrite, 1);
	return b;
}

DBlock*
_getdblock(Part *part, u64int addr, int mode, int load)
{
	DBlock *b;
	u32int h, size, ms;

	ms = 0;
	trace(TraceBlock, "getdblock enter %s 0x%llux", part->name, addr);
	size = part->blocksize;
	if(size > dcache.size){
		seterr(EAdmin, "block size %d too big for cache with size %d", size, dcache.size);
		if(load)
			addstat(StatDcacheLookup, 1);
		return nil;
	}
	h = pbhash(addr);

	/*
	 * look for the block in the cache
	 */
	qlock(&dcache.lock);
again:
	for(b = dcache.heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr){
			if(load)
				addstat2(StatDcacheHit, 1, StatDcacheLookup, 1);
			goto found;
		}
	}

	/*
	 * missed: locate the block with the oldest second-to-last use,
	 * remove it from the heap, and fix up the heap.
	 */
	if(!load){
		qunlock(&dcache.lock);
		return nil;
	}

	/*
	 * Only start the timer here, on a cache miss - calling msec() on
	 * every plain cache hit would make cache hits system-call bound.
	 */
	ms = msec();
	addstat2(StatDcacheLookup, 1, StatDcacheMiss, 1);

	b = bumpdblock();
	if(b == nil){
		trace(TraceBlock, "all disk cache blocks in use");
		addstat(StatDcacheStall, 1);
		rsleep(&dcache.full);
		addstat(StatDcacheStall, -1);
		goto again;
	}

	assert(!b->dirty);

	/*
	 * the new block has no last use, so assume it happens sometime in the middle
	 * ZZZ this is not reasonable
	 */
	b->used = (b->used2 + dcache.now) / 2;

	/*
	 * rechain the block on the correct hash chain
	 */
	b->next = dcache.heads[h];
	dcache.heads[h] = b;
	if(b->next != nil)
		b->next->prev = b;
	b->prev = nil;

	b->addr = addr;
	b->part = part;
	b->size = 0;

found:
	b->ref++;
	b->used2 = b->used;
	b->used = dcache.now++;
	if(b->heap != TWID32)
		fixheap(b->heap, b);

	if((mode == ORDWR || mode == OWRITE) && part->writechan == nil){
		trace(TraceBlock, "getdblock allocwriteproc %s", part->name);
		part->writechan = chancreate(sizeof(DBlock*), dcache.nblocks);
		vtproc(writeproc, part);
	}
	qunlock(&dcache.lock);

	trace(TraceBlock, "getdblock lock");
	addstat(StatDblockStall, 1);
	if(mode == OREAD)
		rlock(&b->lock);
	else
		wlock(&b->lock);
	addstat(StatDblockStall, -1);
	trace(TraceBlock, "getdblock locked");

	if(b->size != size){
		if(mode == OREAD){
			addstat(StatDblockStall, 1);
			runlock(&b->lock);
			wlock(&b->lock);
			addstat(StatDblockStall, -1);
		}
		if(b->size < size){
			if(mode == OWRITE)
				memset(&b->data[b->size], 0, size - b->size);
			else{
				trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr);
				diskaccess(0);
				if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
					b->mode = ORDWR;	/* so putdblock wunlocks */
					putdblock(b);
					return nil;
				}
				trace(TraceBlock, "getdblock readpartdone");
				addstat(StatApartRead, 1);
				addstat(StatApartReadBytes, size-b->size);
			}
		}
		b->size = size;
		if(mode == OREAD){
			addstat(StatDblockStall, 1);
			wunlock(&b->lock);
			rlock(&b->lock);
			addstat(StatDblockStall, -1);
		}
	}

	b->mode = mode;
	trace(TraceBlock, "getdblock exit");
	if(ms)
		addstat(StatDcacheLookupTime, msec() - ms);
	return b;
}
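
/*
 * Sketch of the "peek" mode that index.c relies on (hypothetical
 * helper, not called anywhere): passing load=0 to _getdblock returns
 * nil on a cache miss instead of reading the block in, so a caller
 * that has already written new bytes to disk can patch just the
 * cached copy, if one exists, without marking it dirty.
 */
static void
exampleupdatecached(Part *part, u64int addr, u8int *buf, u32int n)
{
	DBlock *b;

	if((b = _getdblock(part, addr, ORDWR, 0)) == nil)
		return;			/* not cached; nothing to patch */
	memmove(b->data, buf, n);	/* assumes n <= b->size */
	putdblock(b);			/* cached copy now matches disk */
}
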
void
putdblock(DBlock *b)
{
	if(b == nil)
		return;

	trace(TraceBlock, "putdblock %s 0x%llux", b->part->name, b->addr);

	if(b->mode == OREAD)
		runlock(&b->lock);
	else
		wunlock(&b->lock);

	qlock(&dcache.lock);
	if(--b->ref == 0 && !b->dirty){
		if(b->heap == TWID32)
			upheap(dcache.nheap++, b);
		rwakeupall(&dcache.full);
	}
	qunlock(&dcache.lock);
}

void
dirtydblock(DBlock *b, int dirty)
{
	int odirty;

	trace(TraceBlock, "dirtydblock enter %s 0x%llux %d from 0x%lux",
		b->part->name, b->addr, dirty, getcallerpc(&b));
	assert(b->ref != 0);
	assert(b->mode==ORDWR || b->mode==OWRITE);

	odirty = b->dirty;
	if(b->dirty)
		assert(b->dirty == dirty);
	else
		b->dirty = dirty;

	qlock(&dcache.lock);
	if(!odirty){
		dcache.ndirty++;
		setstat(StatDcacheDirty, dcache.ndirty);
		if(dcache.ndirty >= dcache.maxdirty)
			kickround(&dcache.round, 0);
		else
			delaykickround(&dcache.round);
	}
	qunlock(&dcache.lock);
}
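
/*
 * Sketch of the tagging rule dirtydblock enforces (illustrative only;
 * `b' is a hypothetical block held ORDWR): between flushes a block can
 * carry only one dirty tag, so marking it again with the same tag is
 * harmless, but a different tag would trip the assertion above.
 */
static void
exampleretag(DBlock *b)
{
	dirtydblock(b, DirtyArena);	/* block was clean: tag it */
	dirtydblock(b, DirtyArena);	/* same tag again: no-op */
	/* dirtydblock(b, DirtyArenaCib) would fail assert(b->dirty == dirty) */
}
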
static void
unchain(DBlock *b)
{
	ulong h;

	/*
	 * unchain the block
	 */
	if(b->prev == nil){
		h = pbhash(b->addr);
		if(dcache.heads[h] != b)
			sysfatal("bad hash chains in disk cache");
		dcache.heads[h] = b->next;
	}else
		b->prev->next = b->next;
	if(b->next != nil)
		b->next->prev = b->prev;
}

/*
 * remove some block from use and update the free list and counters
 */
static DBlock*
bumpdblock(void)
{
	DBlock *b;

	trace(TraceBlock, "bumpdblock enter");
	b = dcache.free;
	if(b != nil){
		dcache.free = b->next;
		return b;
	}

	if(dcache.ndirty >= dcache.maxdirty)
		kickdcache();

	/*
	 * remove blocks until we find one that is unused.
	 * referenced blocks are left in the heap even though
	 * they can't be scavenged; this is simply a speed optimization.
	 */
	for(;;){
		if(dcache.nheap == 0){
			kickdcache();
			trace(TraceBlock, "bumpdblock gotnothing");
			return nil;
		}
		b = dcache.heap[0];
		delheap(b);
		if(!b->ref && !b->dirty)
			break;
	}

	trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr);

	unchain(b);
	return b;
}

void
emptydcache(void)
{
	DBlock *b;

	qlock(&dcache.lock);
	while(dcache.nheap > 0){
		b = dcache.heap[0];
		delheap(b);
		if(!b->ref && !b->dirty){
			unchain(b);
			b->next = dcache.free;
			dcache.free = b;
		}
	}
	qunlock(&dcache.lock);
}

/*
 * delete an arbitrary block from the heap
 */
static void
delheap(DBlock *db)
{
	if(db->heap == TWID32)
		return;
	fixheap(db->heap, dcache.heap[--dcache.nheap]);
	db->heap = TWID32;
}

/*
 * push an element up or down to its correct new location
 */
static void
fixheap(int i, DBlock *b)
{
	if(upheap(i, b) == i)
		downheap(i, b);
}

static int
upheap(int i, DBlock *b)
{
	DBlock *bb;
	u32int now;
	int p;

	now = dcache.now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = dcache.heap[p];
		if(b->used2 - now >= bb->used2 - now)
			break;
		dcache.heap[i] = bb;
		bb->heap = i;
	}

	dcache.heap[i] = b;
	b->heap = i;
	return i;
}

static int
downheap(int i, DBlock *b)
{
	DBlock *bb;
	u32int now;
	int k;

	now = dcache.now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= dcache.nheap)
			break;
		if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now)
			k++;
		bb = dcache.heap[k];
		if(b->used2 - now <= bb->used2 - now)
			break;
		dcache.heap[i] = bb;
		bb->heap = i;
	}

	dcache.heap[i] = b;
	b->heap = i;
	return i;
}
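
/*
 * The heap comparisons above use (used2 - now) rather than raw used2
 * values: subtracting the current tick first keeps the unsigned
 * comparison meaningful even after dcache.now wraps around.  A minimal
 * illustration (hypothetical helper, not called anywhere):
 */
static int
exampleisolder(u32int a, u32int b, u32int now)
{
	/* true if timestamp a is further in the past than b */
	return a - now < b - now;
}
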
static void
findblock(DBlock *bb)
{
	DBlock *b, *last;
	int h;

	last = nil;
	h = pbhash(bb->addr);
	for(b = dcache.heads[h]; b != nil; b = b->next){
		if(last != b->prev)
			sysfatal("bad prev link");
		if(b == bb)
			return;
		last = b;
	}
	sysfatal("block missing from hash table");
}

void
checkdcache(void)
{
	DBlock *b;
	u32int size, now;
	int i, k, refed, nfree;

	qlock(&dcache.lock);
	size = dcache.size;
	now = dcache.now;
	for(i = 0; i < dcache.nheap; i++){
		if(dcache.heap[i]->heap != i)
			sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap);
		if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now)
			sysfatal("dc: bad heap ordering");
		k = (i << 1) + 1;
		if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
			sysfatal("dc: bad heap ordering");
		k++;
		if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
			sysfatal("dc: bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < dcache.nblocks; i++){
		b = &dcache.blocks[i];
		if(b->data != &dcache.mem[i * size])
			sysfatal("dc: mis-blocked at %d", i);
		if(b->ref && b->heap == TWID32)
			refed++;
		if(b->addr)
			findblock(b);
		if(b->heap != TWID32
		&& dcache.heap[b->heap] != b)
			sysfatal("dc: spurious heap value");
	}

	nfree = 0;
	for(b = dcache.free; b != nil; b = b->next){
		if(b->addr != 0 || b->heap != TWID32)
			sysfatal("dc: bad free list");
		nfree++;
	}

	if(dcache.nheap + nfree + refed != dcache.nblocks)
		sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks);
	qunlock(&dcache.lock);
}

void
flushdcache(void)
{
	trace(TraceProc, "flushdcache enter");
	kickround(&dcache.round, 1);
	trace(TraceProc, "flushdcache exit");
}

void
kickdcache(void)
{
	kickround(&dcache.round, 0);
}

static int
parallelwrites(DBlock **b, DBlock **eb, int dirty)
{
	DBlock **p, **q;
	Part *part;

	for(p=b; p<eb && (*p)->dirty == dirty; p++){
		assert(b<=p && p<eb);
		sendp((*p)->part->writechan, *p);
	}
	q = p;
	for(p=b; p<q; p++){
		assert(b<=p && p<eb);
		recvp((*p)->writedonechan);
	}

	/*
	 * Flush the partitions that have been written to.
	 */
	part = nil;
	for(p=b; p<q; p++){
		if(part != (*p)->part){
			part = (*p)->part;
			flushpart(part);	/* what if it fails? */
		}
	}

	return p-b;
}

/*
 * Sort first by dirty flag, then by partition, then by address in partition.
 */
static int
writeblockcmp(const void *va, const void *vb)
{
	DBlock *a, *b;

	a = *(DBlock**)va;
	b = *(DBlock**)vb;

	if(a->dirty != b->dirty)
		return a->dirty - b->dirty;
	if(a->part != b->part){
		if(a->part < b->part)
			return -1;
		if(a->part > b->part)
			return 1;
	}
	if(a->addr < b->addr)
		return -1;
	return 1;
}
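
/*
 * Because writeblockcmp sorts by dirty tag first, the sorted write[]
 * array in flushproc below is a sequence of contiguous stages, e.g.
 * (schematic, hypothetical addresses):
 *
 *	write[0..3]	DirtyArena	addrs 0x1000, 0x2000, ...
 *	write[4..5]	DirtyArenaCib	addrs 0x0400, ...
 *
 * Each call parallelwrites(write+i, write+n, j) consumes exactly the
 * stage with tag j, so all DirtyArena blocks reach disk before any
 * DirtyArenaCib block, as the comment at the top of the file promises.
 */
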
static void
flushproc(void *v)
{
	int i, j, n;
	ulong t0;
	DBlock *b, **write;

	USED(v);
	threadsetname("flushproc");
	for(;;){
		waitforkick(&dcache.round);

		trace(TraceWork, "start");
		t0 = nsec()/1000;
		trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0);

		write = dcache.write;
		n = 0;
		for(i=0; i<dcache.nblocks; i++){
			b = &dcache.blocks[i];
			if(b->dirty)
				write[n++] = b;
		}

		qsort(write, n, sizeof(write[0]), writeblockcmp);

		/* Write each stage of blocks out. */
		trace(TraceProc, "writeblocks t=%lud", (ulong)(nsec()/1000)-t0);
		i = 0;
		for(j=1; j<DirtyMax; j++){
			trace(TraceProc, "writeblocks.%d t=%lud",
				j, (ulong)(nsec()/1000)-t0);
			i += parallelwrites(write+i, write+n, j);
		}
		if(i != n){
			fprint(2, "in flushproc i=%d n=%d\n", i, n);
			for(i=0; i<n; i++)
				fprint(2, "\tblock %d: dirty=%d\n",
					i, write[i]->dirty);
			abort();
		}

		/*
		 * b->dirty is protected by b->lock while ndirty is protected
		 * by dcache.lock, so the --ndirty below is the delayed one
		 * from clearing b->dirty in the write proc. It may happen
		 * that some other proc has come along and redirtied b since
		 * the write. That's okay, it just means that ndirty may be
		 * one too high until we catch up and do the decrement.
		 */
		trace(TraceProc, "undirty.%d t=%lud", j, (ulong)(nsec()/1000)-t0);
		qlock(&dcache.lock);
		for(i=0; i<n; i++){
			b = write[i];
			--dcache.ndirty;
			if(b->ref == 0 && b->heap == TWID32){
				upheap(dcache.nheap++, b);
				rwakeupall(&dcache.full);
			}
		}
		setstat(StatDcacheDirty, dcache.ndirty);
		qunlock(&dcache.lock);
		addstat(StatDcacheFlush, 1);
		trace(TraceWork, "finish");
	}
}

static void
writeproc(void *v)
{
	DBlock *b;
	Part *p;

	p = v;

	threadsetname("writeproc:%s", p->name);
	for(;;){
		b = recvp(p->writechan);
		trace(TraceWork, "start");
		assert(b->part == p);
		trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr);
		wlock(&b->lock);
		trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr);
		diskaccess(0);
		if(writepart(p, b->addr, b->data, b->size) < 0)
			fprint(2, "%s: writeproc: part %s addr 0x%llux: write error: %r\n",
				argv0, p->name, b->addr);
		addstat(StatApartWrite, 1);
		addstat(StatApartWriteBytes, b->size);
		b->dirty = 0;
		wunlock(&b->lock);
		trace(TraceProc, "finish %s 0x%llux", p->name, b->addr);
		trace(TraceWork, "finish");
		sendp(b->writedonechan, b);
	}
}