root/kern/vfs_bio.c


DEFINITIONS

This source file includes the following definitions.
  1. size2cqueue
  2. bremfree
  3. buf_init
  4. buf_stub
  5. buf_get
  6. buf_put
  7. bufinit
  8. bio_doread
  9. bread
  10. breadn
  11. bread_cluster_callback
  12. bread_cluster
  13. bwrite
  14. bdwrite
  15. bawrite
  16. buf_dirty
  17. buf_undirty
  18. brelse
  19. incore
  20. getblk
  21. geteblk
  22. getnewbuf
  23. buf_daemon
  24. biowait
  25. biodone
  26. vfs_bufstats
  27. vfs_bufstats

    1 /*      $OpenBSD: vfs_bio.c,v 1.99 2007/08/07 04:32:45 beck Exp $       */
    2 /*      $NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $  */
    3 
    4 /*-
    5  * Copyright (c) 1994 Christopher G. Demetriou
    6  * Copyright (c) 1982, 1986, 1989, 1993
    7  *      The Regents of the University of California.  All rights reserved.
    8  * (c) UNIX System Laboratories, Inc.
    9  * All or some portions of this file are derived from material licensed
   10  * to the University of California by American Telephone and Telegraph
   11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   12  * the permission of UNIX System Laboratories, Inc.
   13  *
   14  * Redistribution and use in source and binary forms, with or without
   15  * modification, are permitted provided that the following conditions
   16  * are met:
   17  * 1. Redistributions of source code must retain the above copyright
   18  *    notice, this list of conditions and the following disclaimer.
   19  * 2. Redistributions in binary form must reproduce the above copyright
   20  *    notice, this list of conditions and the following disclaimer in the
   21  *    documentation and/or other materials provided with the distribution.
   22  * 3. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      @(#)vfs_bio.c   8.6 (Berkeley) 1/11/94
   39  */
   40 
   41 /*
   42  * Some references:
   43  *      Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
   44  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
    45  *              UNIX Operating System (Addison Wesley, 1989)
   46  */
   47 
   48 #include <sys/param.h>
   49 #include <sys/systm.h>
   50 #include <sys/proc.h>
   51 #include <sys/buf.h>
   52 #include <sys/vnode.h>
   53 #include <sys/mount.h>
   54 #include <sys/malloc.h>
   55 #include <sys/pool.h>
   56 #include <sys/resourcevar.h>
   57 #include <sys/conf.h>
   58 #include <sys/kernel.h>
   59 
   60 #include <uvm/uvm_extern.h>
   61 
   62 #include <miscfs/specfs/specdev.h>
   63 
   64 /*
   65  * Definitions for the buffer hash lists.
   66  */
   67 #define BUFHASH(dvp, lbn)       \
   68         (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
   69 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
   70 u_long  bufhash;
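/*
 * Note on BUFHASH() above: it divides the vnode pointer by the size of a
 * vnode, adds the logical block number, and masks the result with bufhash,
 * the power-of-two mask set up by hashinit() in bufinit() below.
 */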
   71 
   72 /*
   73  * Insq/Remq for the buffer hash lists.
   74  */
   75 #define binshash(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_hash)
   76 #define bremhash(bp)            LIST_REMOVE(bp, b_hash)
   77 
   78 /*
   79  * Definitions for the buffer free lists.
   80  */
   81 #define BQUEUES         6               /* number of free buffer queues */
   82 
   83 #define BQ_DIRTY        0               /* LRU queue with dirty buffers */
   84 
   85 
   86 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
   87 int bqpages[BQUEUES];           /* pages allocated, per queue */
   88 int bqpagelow;
   89 int needbuffer;
   90 struct bio_ops bioops;
   91 
   92 /*
   93  * Buffer pool for I/O buffers.
   94  */
   95 struct pool bufpool;
   96 struct vm_map *buf_map;
   97 struct bufhead bufhead = LIST_HEAD_INITIALIZER(bufhead);
   98 struct buf *buf_get(size_t);
   99 struct buf *buf_stub(struct vnode *, daddr64_t);
  100 void buf_put(struct buf *);
  101 
  102 /*
  103  * Insq/Remq for the buffer free lists.
  104  */
  105 #define binsheadfree(bp, dp)    TAILQ_INSERT_HEAD(dp, bp, b_freelist)
  106 #define binstailfree(bp, dp)    TAILQ_INSERT_TAIL(dp, bp, b_freelist)
  107 
  108 struct buf *bio_doread(struct vnode *, daddr64_t, int, int);
  109 struct buf *getnewbuf(size_t, int, int, int *);
  110 void buf_init(struct buf *, int);
  111 void bread_cluster_callback(struct buf *);
  112 
  113 /*
  114  * We keep a few counters to monitor the utilization of the buffer cache
  115  *
  116  *  numbufpages   - number of pages totally allocated.
  117  *  numdirtypages - number of pages on BQ_DIRTY queue.
  118  *  lodirtypages  - low water mark for buffer cleaning daemon.
  119  *  hidirtypages  - high water mark for buffer cleaning daemon.
  120  *  numfreepages  - number of pages on BQ_CLEAN and BQ_DIRTY queues. unused.
  121  *  numcleanpages - number of pages on BQ_CLEAN queue.
  122  *                  Used to track the need to speed up the cleaner and
  123  *                  as a reserve for special processes like syncer.
  124  *  maxcleanpages - the highest page count on BQ_CLEAN.
  125  */
  126 long numbufpages;
  127 long numdirtypages;
  128 long lodirtypages;
  129 long hidirtypages;
  130 long numfreepages;
  131 long numcleanpages;
  132 long locleanpages;
  133 long hicleanpages;
  134 long maxcleanpages;
  135 
  136 struct proc *cleanerproc;
  137 int bd_req;                     /* Sleep point for cleaner daemon. */
  138 
  139 int size2cqueue(int *size);
  140 
  141 int
  142 size2cqueue(int *size)
  143 {
  144         int i = 0, q;
  145         int s = *size;
  146         s -= 1;
  147         while (s > 0) {
  148                 s = s >> 1;
  149                 i++;
  150         }
  151         if (i < PAGE_SHIFT) {
  152                 i = PAGE_SHIFT; /* < 4096 -> 4096 */
  153         }
  154         *size = 1 << i;
  155         q = (i + 1 - PAGE_SHIFT); /* XXX 4096 is queue 1 */
  156         if (q >= BQUEUES)
  157                 panic("queue %d > BQUEUES %d", q, BQUEUES);
  158         if (q == 0)
  159                 panic("can't return dirty q");
  160         return(q);
  161 }
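/*
 * Worked example (assuming PAGE_SHIFT is 12, i.e. 4096-byte pages): a
 * request for 6144 bytes is rounded up to the next power of two, so *size
 * becomes 8192 and the function returns queue 2; any size of 4096 bytes or
 * less maps to 4096 and queue 1.  Queue 0 is BQ_DIRTY and is never returned
 * here, which is why a result of 0 panics.
 */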
  162 
  163 void
  164 bremfree(struct buf *bp)
  165 {
  166         struct bqueues *dp = NULL;
  167         int queue;
  168 
  169         /*
  170          * We only calculate the head of the freelist when removing
  171          * the last element of the list as that is the only time that
  172          * it is needed (e.g. to reset the tail pointer).
  173          *
  174          * NB: This makes an assumption about how tailq's are implemented.
  175          */
  176         if (TAILQ_NEXT(bp, b_freelist) == NULL) {
  177                 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
  178                         if (dp->tqh_last == &TAILQ_NEXT(bp, b_freelist))
  179                                 break;
  180                 if (dp == &bufqueues[BQUEUES])
  181                         panic("bremfree: lost tail");
  182         }
  183         numfreepages -= btoc(bp->b_bufsize);
  184         if (!ISSET(bp->b_flags, B_DELWRI)) {
  185                 int qs = bp->b_bufsize;
  186                 queue = size2cqueue(&qs);
  187                 numcleanpages -= btoc(bp->b_bufsize);
  188                 bqpages[queue] -= btoc(bp->b_bufsize);
  189         } else
  190                 numdirtypages -= btoc(bp->b_bufsize);
  191         TAILQ_REMOVE(dp, bp, b_freelist);
  192 }
  193 
  194 void
  195 buf_init(struct buf *bp, int size)
  196 {
  197         int npages, queue;
  198 
  199         splassert(IPL_BIO);
  200 
  201         npages = btoc(size);
  202         bzero((char *)bp, sizeof *bp);
  203         bp->b_vnbufs.le_next = NOLIST;
  204         bp->b_freelist.tqe_next = NOLIST;
  205         bp->b_synctime = time_uptime + 300;
  206         bp->b_dev = NODEV;
  207         queue = size2cqueue(&size);
  208         LIST_INIT(&bp->b_dep);
  209         numbufpages += npages;
  210         numfreepages += npages;
  211         numcleanpages += npages;
  212         bqpages[queue] += npages;
  213         if (maxcleanpages < numcleanpages)
  214                 maxcleanpages = numcleanpages;
  215 }
  216 
  217 /*
  218  * This is a non-sleeping expanded equivalent of getblk() that allocates only
  219  * the buffer structure, and not its contents.
  220  */
  221 struct buf *
  222 buf_stub(struct vnode *vp, daddr64_t lblkno)
  223 {
  224         struct buf *bp;
  225         int s;
  226 
  227         s = splbio();
  228         bp = pool_get(&bufpool, PR_NOWAIT);
  229         splx(s);
  230 
  231         if (bp == NULL)
  232                 return (NULL);
  233 
  234         bzero((char *)bp, sizeof *bp);
  235         bp->b_vnbufs.le_next = NOLIST;
  236         bp->b_freelist.tqe_next = NOLIST;
  237         bp->b_synctime = time_uptime + 300;
  238         bp->b_dev = NODEV;
  239         bp->b_bufsize = 0;
  240         bp->b_data = NULL;
  241         bp->b_flags = B_BUSY;
  242         bp->b_dev = NODEV;
  243         bp->b_blkno = bp->b_lblkno = lblkno;
  244         bp->b_iodone = NULL;
  245         bp->b_error = 0;
  246         bp->b_resid = 0;
  247         bp->b_bcount = 0;
  248         bp->b_dirtyoff = bp->b_dirtyend = 0;
  249         bp->b_validoff = bp->b_validend = 0;
  250 
  251         LIST_INIT(&bp->b_dep);
  252 
  253         s = splbio();
  254         LIST_INSERT_HEAD(&bufhead, bp, b_list);
  255         bgetvp(vp, bp);
  256         splx(s);
  257 
  258         return (bp);
  259 }
  260 
  261 struct buf *
  262 buf_get(size_t size)
  263 {
  264         struct bqueues *dp;
  265         struct buf *bp;
  266         int npages;
  267         int queue, qs;
  268         void *data;
  269 
  270         splassert(IPL_BIO);
  271 
  272         KASSERT(size > 0);
  273 
  274         size = round_page(size);
  275         qs = size;
  276         queue = size2cqueue(&qs);
  277         npages = btoc(qs);
  278 
  279         if (numbufpages + npages > bufpages)
  280                 return (NULL);
  281 
  282         bp = pool_get(&bufpool, PR_WAITOK);
  283 
  284         data = (void *)uvm_km_alloc(buf_map, qs);
  285         if (data == NULL) {
  286                 pool_put(&bufpool, bp);
  287                 return (NULL);
  288         }
  289         buf_init(bp, qs);
  290         bp->b_flags = B_INVAL;
  291         bp->b_bufsize = qs;
  292         bp->b_data = data;
  293         dp = &bufqueues[queue];
  294         binsheadfree(bp, dp);
  295         binshash(bp, &invalhash);
  296         LIST_INSERT_HEAD(&bufhead, bp, b_list);
  297 
  298         return (bp);
  299 }
  300 
  301 void
  302 buf_put(struct buf *bp)
  303 {
  304         splassert(IPL_BIO);
  305 #ifdef DIAGNOSTIC
  306         if (bp->b_data != NULL)
  307                 KASSERT(bp->b_bufsize > 0);
  308 #endif
  309 #ifdef QUEUE_MACRO_DEBUG
  310         if (bp->b_freelist.tqe_next != NOLIST &&
  311             bp->b_freelist.tqe_next != (void *)-1)
  312                 panic("buf_put: still on the free list");
  313 
  314         if (bp->b_vnbufs.le_next != NOLIST &&
  315             bp->b_vnbufs.le_next != (void *)-1)
  316                 panic("buf_put: still on the vnode list");
  317 #endif
  318 #ifdef DIAGNOSTIC
  319         if (!LIST_EMPTY(&bp->b_dep))
  320                 panic("buf_put: b_dep is not empty");
  321 #endif
  322         LIST_REMOVE(bp, b_list);
  323 
  324         if (bp->b_data != NULL) {
  325                 bremhash(bp);
  326                 numbufpages -= btoc(bp->b_bufsize);
  327                 uvm_km_free(buf_map, (vaddr_t)bp->b_data, bp->b_bufsize);
  328         }
  329 
  330         pool_put(&bufpool, bp);
  331 }
  332 
  333 /*
  334  * Initialize buffers and hash links for buffers.
  335  */
  336 void
  337 bufinit(void)
  338 {
  339         vaddr_t minaddr, maxaddr;
  340         struct bqueues *dp;
  341 
  342         pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);
  343         pool_setipl(&bufpool, IPL_BIO);
  344         for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
  345                 TAILQ_INIT(dp);
  346         minaddr = vm_map_min(kernel_map);
  347         buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
  348             ptoa(bufpages), 0, FALSE, NULL);
  349 
  350         /* 
  351          * XXX don't starve any one queue below 5% of the total number
  352          * of buffer cache pages.
  353          */
  354         bqpagelow = bufpages / 20; 
  355 
  356         bufhashtbl = hashinit(bufpages / 4, M_CACHE, M_WAITOK, &bufhash);
  357         hidirtypages = (bufpages / 4) * 3;
  358         lodirtypages = bufpages / 2;
  359 
  360         /*
  361          * Reserve 5% of bufpages for syncer's needs,
  362          * but not more than 25% and if possible
  363  * not less than 2 * MAXBSIZE. The locleanpages
  364  * value must not be too small.
  365          */
  366         hicleanpages = bufpages / 2;
  367         locleanpages = hicleanpages / 2;
  368         if (locleanpages < btoc(2 * MAXBSIZE))
  369                 locleanpages = btoc(2 * MAXBSIZE);
  370         if (locleanpages > bufpages / 4)
  371                 locleanpages = bufpages / 4;
  372 
  373         maxcleanpages = locleanpages;
  374 }
  375 
  376 struct buf *
  377 bio_doread(struct vnode *vp, daddr64_t blkno, int size, int async)
  378 {
  379         struct buf *bp;
  380 
  381         bp = getblk(vp, blkno, size, 0, 0);
  382 
  383         /*
  384          * If buffer does not have valid data, start a read.
  385          * Note that if buffer is B_INVAL, getblk() won't return it.
  386          * Therefore, it's valid if its I/O has completed or been delayed.
  387          */
  388         if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
  389                 SET(bp->b_flags, B_READ | async);
  390                 VOP_STRATEGY(bp);
  391 
  392                 /* Pay for the read. */
  393                 curproc->p_stats->p_ru.ru_inblock++;            /* XXX */
  394         } else if (async) {
  395                 brelse(bp);
  396         }
  397 
  398         return (bp);
  399 }
  400 
  401 /*
  402  * Read a disk block.
  403  * This algorithm is described in Bach (p.54).
  404  */
  405 int
  406 bread(struct vnode *vp, daddr64_t blkno, int size, struct ucred *cred,
  407     struct buf **bpp)
  408 {
  409         struct buf *bp;
  410 
  411         /* Get buffer for block. */
  412         bp = *bpp = bio_doread(vp, blkno, size, 0);
  413 
  414         /* Wait for the read to complete, and return result. */
  415         return (biowait(bp));
  416 }
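/*
 * Typical use (an illustrative sketch, not code from this file; vp, lblkno
 * and blksize stand for the caller's vnode, logical block and block size):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, blksize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... read from bp->b_data ...
 *	brelse(bp);
 *
 * bread() always hands back a buffer in *bpp, so the caller releases it
 * with brelse() on both the success and the error path.
 */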
  417 
  418 /*
  419  * Read-ahead multiple disk blocks. The first is sync, the rest async.
  420  * Trivial modification to the breada algorithm presented in Bach (p.55).
  421  */
  422 int
  423 breadn(struct vnode *vp, daddr64_t blkno, int size, daddr64_t rablks[],
  424     int rasizes[], int nrablks, struct ucred *cred, struct buf **bpp)
  425 {
  426         struct buf *bp;
  427         int i;
  428 
  429         bp = *bpp = bio_doread(vp, blkno, size, 0);
  430 
  431         /*
  432          * For each of the read-ahead blocks, start a read, if necessary.
  433          */
  434         for (i = 0; i < nrablks; i++) {
  435                 /* If it's in the cache, just go on to next one. */
  436                 if (incore(vp, rablks[i]))
  437                         continue;
  438 
  439                 /* Get a buffer for the read-ahead block */
  440                 (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
  441         }
  442 
  443         /* Otherwise, we had to start a read for it; wait until it's valid. */
  444         return (biowait(bp));
  445 }
  446 
  447 /*
  448  * Called from interrupt context.
  449  */
  450 void
  451 bread_cluster_callback(struct buf *bp)
  452 {
  453         int i;
  454         struct buf **xbpp;
  455 
  456         xbpp = (struct buf **)bp->b_saveaddr;
  457 
  458         for (i = 0; xbpp[i] != 0; i++) {
  459                 if (ISSET(bp->b_flags, B_ERROR))
  460                         SET(xbpp[i]->b_flags, B_INVAL | B_ERROR);
  461                 biodone(xbpp[i]);
  462         }
  463 
  464         free(xbpp, M_TEMP);
  465         bp->b_data = NULL;
  466         buf_put(bp);
  467 }
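/*
 * How the cluster read below works: bread_cluster() starts a normal read of
 * the requested block, then asks VOP_BMAP() where the following blocks live
 * on disk.  It allocates one large carrier buffer with getnewbuf() and one
 * buf_stub() per read-ahead block, pointing each stub's b_data at the
 * matching "size"-byte slice of the carrier's data.  A single asynchronous
 * VOP_STRATEGY() on the carrier fills all slices at once; the callback
 * above then biodone()s every stub and frees the carrier via buf_put()
 * with b_data cleared, so the data pages stay with the stubs.
 */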
  468 
  469 int
  470 bread_cluster(struct vnode *vp, daddr64_t blkno, int size, struct buf **rbpp)
  471 {
  472         struct buf *bp, **xbpp;
  473         int howmany, i, maxra, inc;
  474         daddr64_t sblkno;
  475         size_t spill;
  476 
  477         *rbpp = bio_doread(vp, blkno, size, 0);
  478 
  479         if (size != round_page(size))
  480                 return (biowait(*rbpp));
  481 
  482         if (VOP_BMAP(vp, blkno + 1, NULL, &sblkno, &maxra))
  483                 return (biowait(*rbpp));
  484 
  485         maxra++; 
  486         if (sblkno == -1 || maxra < 2)
  487                 return (biowait(*rbpp));
  488 
  489         howmany = MAXPHYS / size;
  490         if (howmany > maxra)
  491                 howmany = maxra;
  492 
  493         xbpp = malloc((howmany + 1) * sizeof(struct buf *), M_TEMP, M_NOWAIT);
  494         if (xbpp == NULL)
  495                 return (biowait(*rbpp));
  496 
  497         for (i = 0; i < howmany; i++) {
  498                 if (incore(vp, blkno + i + 1)) {
  499                         for (--i; i >= 0; i--) {
  500                                 SET(xbpp[i]->b_flags, B_INVAL);
  501                                 brelse(xbpp[i]);
  502                         }
  503                         free(xbpp, M_TEMP);
  504                         return (biowait(*rbpp));
  505                 }
  506                 xbpp[i] = buf_stub(vp, blkno + i + 1);
  507                 if (xbpp[i] == NULL) {
  508                         for (--i; i >= 0; i--) {
  509                                 SET(xbpp[i]->b_flags, B_INVAL);
  510                                 brelse(xbpp[i]);
  511                         }
  512                         free(xbpp, M_TEMP);
  513                         return (biowait(*rbpp));
  514                 }
  515         }
  516 
  517         xbpp[howmany] = 0;
  518 
  519         bp = getnewbuf(howmany * size, 0, 0, NULL);
  520         if (bp == NULL) {
  521                 for (i = 0; i < howmany; i++) {
  522                         SET(xbpp[i]->b_flags, B_INVAL);
  523                         brelse(xbpp[i]);
  524                 }
  525                 free(xbpp, M_TEMP);
  526                 return (biowait(*rbpp));
  527         }
  528 
  529         inc = btodb(size);
  530 
  531         for (i = 0; i < howmany; i++) {
  532                 SET(xbpp[i]->b_flags, B_READ | B_ASYNC);
  533                 binshash(xbpp[i], BUFHASH(vp, xbpp[i]->b_lblkno));
  534                 xbpp[i]->b_blkno = sblkno + (i * inc);
  535                 xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
  536                 xbpp[i]->b_data = bp->b_data + (i * size);
  537         }
  538 
  539         bp->b_blkno = sblkno;
  540         bp->b_lblkno = blkno + 1;
  541         SET(bp->b_flags, B_READ | B_ASYNC | B_CALL);
  542         bp->b_saveaddr = (void *)xbpp;
  543         bp->b_iodone = bread_cluster_callback;
  544         bp->b_vp = vp;
  545         spill = bp->b_bufsize - bp->b_bcount;
  546         if (spill) {
  547                 uvm_km_free(buf_map, (vaddr_t) bp->b_data + bp->b_bcount,
  548                     spill);
  549                 numbufpages -= atop(spill);
  550         }
  551         VOP_STRATEGY(bp);
  552         curproc->p_stats->p_ru.ru_inblock++;
  553 
  554         return (biowait(*rbpp));
  555 }
  556 
  557 /*
  558  * Block write.  Described in Bach (p.56)
  559  */
  560 int
  561 bwrite(struct buf *bp)
  562 {
  563         int rv, async, wasdelayed, s;
  564         struct vnode *vp;
  565         struct mount *mp;
  566 
  567         vp = bp->b_vp;
  568         if (vp != NULL)
  569                 mp = vp->v_type == VBLK? vp->v_specmountpoint : vp->v_mount;
  570         else
  571                 mp = NULL;
  572 
  573         /*
  574          * Remember buffer type, to switch on it later.  If the write was
  575          * synchronous, but the file system was mounted with MNT_ASYNC,
  576          * convert it to a delayed write.
  577          * XXX note that this relies on delayed tape writes being converted
  578          * to async, not sync writes (which is safe, but ugly).
  579          */
  580         async = ISSET(bp->b_flags, B_ASYNC);
  581         if (!async && mp && ISSET(mp->mnt_flag, MNT_ASYNC)) {
  582                 bdwrite(bp);
  583                 return (0);
  584         }
  585 
  586         /*
  587          * Collect statistics on synchronous and asynchronous writes.
  588          * Writes to block devices are charged to their associated
  589          * filesystem (if any).
  590          */
  591         if (mp != NULL) {
  592                 if (async)
  593                         mp->mnt_stat.f_asyncwrites++;
  594                 else
  595                         mp->mnt_stat.f_syncwrites++;
  596         }
  597 
  598         wasdelayed = ISSET(bp->b_flags, B_DELWRI);
  599         CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
  600 
  601         s = splbio();
  602 
  603         /*
  604          * If not synchronous, pay for the I/O operation and make
  605          * sure the buf is on the correct vnode queue.  We have
  606          * to do this now, because if we don't, the vnode may not
  607          * be properly notified that its I/O has completed.
  608          */
  609         if (wasdelayed) {
  610                 reassignbuf(bp);
  611         } else
  612                 curproc->p_stats->p_ru.ru_oublock++;
  613         
  614 
  615         /* Initiate disk write.  Make sure the appropriate party is charged. */
  616         bp->b_vp->v_numoutput++;
  617         splx(s);
  618         SET(bp->b_flags, B_WRITEINPROG);
  619         VOP_STRATEGY(bp);
  620 
  621         if (async)
  622                 return (0);
  623 
  624         /*
  625          * If I/O was synchronous, wait for it to complete.
  626          */
  627         rv = biowait(bp);
  628 
  629         /* Release the buffer. */
  630         brelse(bp);
  631 
  632         return (rv);
  633 }
  634 
  635 
  636 /*
  637  * Delayed write.
  638  *
  639  * The buffer is marked dirty, but is not queued for I/O.
  640  * This routine should be used when the buffer is expected
  641  * to be modified again soon, typically a small write that
  642  * partially fills a buffer.
  643  *
  644  * NB: magnetic tapes cannot be delayed; they must be
  645  * written in the order that the writes are requested.
  646  *
  647  * Described in Leffler, et al. (pp. 208-213).
  648  */
  649 void
  650 bdwrite(struct buf *bp)
  651 {
  652         int s;
  653 
  654         /*
  655          * If the block hasn't been seen before:
  656          *      (1) Mark it as having been seen,
  657          *      (2) Charge for the write.
  658          *      (3) Make sure it's on its vnode's correct block list,
  659          *      (4) If a buffer is rewritten, move it to end of dirty list
  660          */
  661         if (!ISSET(bp->b_flags, B_DELWRI)) {
  662                 SET(bp->b_flags, B_DELWRI);
  663                 bp->b_synctime = time_uptime + 35;
  664                 s = splbio();
  665                 reassignbuf(bp);
  666                 splx(s);
  667                 curproc->p_stats->p_ru.ru_oublock++;    /* XXX */
  668         } else {
  669                 /*
  670                  * see if this buffer has slacked through the syncer
  671                  * and enforce an async write upon it.
  672                  */
  673                 if (bp->b_synctime < time_uptime) {
  674                         bawrite(bp);
  675                         return;
  676                 }
  677         }
  678 
  679         /* If this is a tape block, write the block now. */
  680         if (major(bp->b_dev) < nblkdev &&
  681             bdevsw[major(bp->b_dev)].d_type == D_TAPE) {
  682                 bawrite(bp);
  683                 return;
  684         }
  685 
  686         /* Otherwise, the "write" is done, so mark and release the buffer. */
  687         CLR(bp->b_flags, B_NEEDCOMMIT);
  688         SET(bp->b_flags, B_DONE);
  689         brelse(bp);
  690 }
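/*
 * Illustrative use (a sketch, not code from this file): a filesystem
 * appending a few bytes typically bread()s the last block, copies the new
 * data into bp->b_data, and bdwrite()s it so further small appends can hit
 * the same cached block; once a block has been completely filled it is
 * usually pushed out with bawrite()/bwrite() instead.
 */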
  691 
  692 /*
  693  * Asynchronous block write; just an asynchronous bwrite().
  694  */
  695 void
  696 bawrite(struct buf *bp)
  697 {
  698 
  699         SET(bp->b_flags, B_ASYNC);
  700         VOP_BWRITE(bp);
  701 }
  702 
  703 /*
  704  * Must be called at splbio()
  705  */
  706 void
  707 buf_dirty(struct buf *bp)
  708 {
  709         splassert(IPL_BIO);
  710 
  711         if (ISSET(bp->b_flags, B_DELWRI) == 0) {
  712                 SET(bp->b_flags, B_DELWRI);
  713                 bp->b_synctime = time_uptime + 35;
  714                 reassignbuf(bp);
  715         }
  716 }
  717 
  718 /*
  719  * Must be called at splbio()
  720  */
  721 void
  722 buf_undirty(struct buf *bp)
  723 {
  724         splassert(IPL_BIO);
  725 
  726         if (ISSET(bp->b_flags, B_DELWRI)) {
  727                 CLR(bp->b_flags, B_DELWRI);
  728                 reassignbuf(bp);
  729         }
  730 }
  731 
  732 /*
  733  * Release a buffer on to the free lists.
  734  * Described in Bach (p. 46).
  735  */
  736 void
  737 brelse(struct buf *bp)
  738 {
  739         struct bqueues *bufq;
  740         int s;
  741 
  742         /* Block disk interrupts. */
  743         s = splbio();
  744 
  745         if (bp->b_data != NULL)
  746                 KASSERT(bp->b_bufsize > 0);
  747 
  748         /*
  749          * Determine which queue the buffer should be on, then put it there.
  750          */
  751 
  752         /* If it's not cacheable, or an error, mark it invalid. */
  753         if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
  754                 SET(bp->b_flags, B_INVAL);
  755 
  756         if (ISSET(bp->b_flags, B_INVAL)) {
  757                 int queue, qs;
  758 
  759                 /*
  760                  * If the buffer is invalid, place it in the clean queue, so it
  761                  * can be reused.
  762                  */
  763                 if (LIST_FIRST(&bp->b_dep) != NULL)
  764                         buf_deallocate(bp);
  765 
  766                 if (ISSET(bp->b_flags, B_DELWRI)) {
  767                         CLR(bp->b_flags, B_DELWRI);
  768                 }
  769 
  770                 if (bp->b_vp)
  771                         brelvp(bp);
  772 
  773                 /*
  774                  * If the buffer has no associated data, place it back in the
  775                  * pool.
  776                  */
  777                 if (bp->b_data == NULL) {
  778                         buf_put(bp);
  779                         splx(s);
  780                         return;
  781                 }
  782 
  783                 qs = bp->b_bufsize;
  784                 queue = size2cqueue(&qs);
  785                 numcleanpages += btoc(bp->b_bufsize);
  786                 bqpages[queue] += btoc(bp->b_bufsize);
  787                 if (maxcleanpages < numcleanpages)
  788                         maxcleanpages = numcleanpages;
  789                 binsheadfree(bp, &bufqueues[queue]);
  790         } else {
  791                 /*
  792                  * It has valid data.  Put it on the end of the appropriate
  793                  * queue, so that it'll stick around for as long as possible.
  794                  */
  795                 int queue, qs;
  796                 numfreepages += btoc(bp->b_bufsize);
  797                 qs = bp->b_bufsize;
  798                 queue = size2cqueue(&qs);
  799 
  800                 if (!ISSET(bp->b_flags, B_DELWRI)) {
  801                         numcleanpages += btoc(bp->b_bufsize);
  802                         bqpages[queue] += btoc(bp->b_bufsize);
  803                         if (maxcleanpages < numcleanpages)
  804                                 maxcleanpages = numcleanpages;
  805                         bufq = &bufqueues[queue];
  806                 } else {
  807                         numdirtypages += btoc(bp->b_bufsize);
  808                         bufq = &bufqueues[BQ_DIRTY];
  809                 }
  810                 if (ISSET(bp->b_flags, B_AGE)) {
  811                         binsheadfree(bp, bufq);
  812                         bp->b_synctime = time_uptime + 30;
  813                 } else {
  814                         binstailfree(bp, bufq);
  815                         bp->b_synctime = time_uptime + 300;
  816                 }
  817         }
  818 
  819         /* Unlock the buffer. */
  820         CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE | B_DEFERRED));
  821 
  822         /* Wake up any processes waiting for any buffer to become free. */
  823         if (needbuffer) {
  824                 needbuffer--;
  825                 wakeup_one(&needbuffer);
  826         }
  827 
  828         /* Wake up any processes waiting for _this_ buffer to become free. */
  829         if (ISSET(bp->b_flags, B_WANTED)) {
  830                 CLR(bp->b_flags, B_WANTED);
  831                 wakeup(bp);
  832         }
  833 
  834         splx(s);
  835 }
  836 
  837 /*
  838  * Determine if a block is in the cache. Just look on what would be its hash
  839  * chain. If it's there, return a pointer to it, unless it's marked invalid.
  840  */
  841 struct buf *
  842 incore(struct vnode *vp, daddr64_t blkno)
  843 {
  844         struct buf *bp;
  845 
  846         /* Search hash chain */
  847         LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
  848                 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
  849                     !ISSET(bp->b_flags, B_INVAL))
  850                         return (bp);
  851         }
  852 
  853         return (NULL);
  854 }
  855 
  856 /*
  857  * Get a block of requested size that is associated with
  858  * a given vnode and block offset. If it is found in the
  859  * block cache, mark it as having been found, make it busy
  860  * and return it. Otherwise, return an empty block of the
  861  * correct size. It is up to the caller to ensure that the
  862  * cached blocks are of the correct size.
  863  */
  864 struct buf *
  865 getblk(struct vnode *vp, daddr64_t blkno, int size, int slpflag, int slptimeo)
  866 {
  867         struct bufhashhdr *bh;
  868         struct buf *bp, *nb = NULL;
  869         int s, error;
  870 
  871         /*
  872          * XXX
  873          * The following is an inlined version of 'incore()', but with
  874          * the 'invalid' test moved to after the 'busy' test.  It's
  875          * necessary because there are some cases in which the NFS
  876          * code sets B_INVAL prior to writing data to the server, but
  877          * in which the buffers actually contain valid data.  In this
  878          * case, we can't allow the system to allocate a new buffer for
  879          * the block until the write is finished.
  880          */
  881         bh = BUFHASH(vp, blkno);
  882 start:
  883         LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
  884                 if (bp->b_lblkno != blkno || bp->b_vp != vp)
  885                         continue;
  886 
  887                 s = splbio();
  888                 if (ISSET(bp->b_flags, B_BUSY)) {
  889                         if (nb != NULL) {
  890                                 SET(nb->b_flags, B_INVAL);
  891                                 binshash(nb, &invalhash);
  892                                 brelse(nb);
  893                                 nb = NULL;
  894                         }
  895                         SET(bp->b_flags, B_WANTED);
  896                         error = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
  897                             slptimeo);
  898                         splx(s);
  899                         if (error)
  900                                 return (NULL);
  901                         goto start;
  902                 }
  903 
  904                 if (!ISSET(bp->b_flags, B_INVAL)) {
  905                         SET(bp->b_flags, (B_BUSY | B_CACHE));
  906                         bremfree(bp);
  907                         splx(s);
  908                         break;
  909                 }
  910                 splx(s);
  911         }
  912         if (nb && bp) {
  913                 SET(nb->b_flags, B_INVAL);
  914                 binshash(nb, &invalhash);
  915                 brelse(nb);
  916                 nb = NULL;
  917         }
  918         if (bp == NULL && nb == NULL) {
  919                 nb = getnewbuf(size, slpflag, slptimeo, &error);
  920                 if (nb == NULL) {
  921                         if (error == ERESTART || error == EINTR)
  922                                 return (NULL);
  923                 }
  924                 goto start;
  925         }
  926         if (nb) {
  927                 bp = nb;
  928                 binshash(bp, bh);
  929                 bp->b_blkno = bp->b_lblkno = blkno;
  930                 s = splbio();
  931                 bgetvp(vp, bp);
  932                 splx(s);
  933         }
  934         return (bp);
  935 }
  936 
  937 /*
  938  * Get an empty, disassociated buffer of given size.
  939  */
  940 struct buf *
  941 geteblk(int size)
  942 {
  943         struct buf *bp;
  944 
  945         while ((bp = getnewbuf(size, 0, 0, NULL)) == NULL)
  946                 ;
  947         SET(bp->b_flags, B_INVAL);
  948         binshash(bp, &invalhash);
  949 
  950         return (bp);
  951 }
  952 
  953 /*
  954  * Find a buffer which is available for use.
  955  */
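/*
 * Strategy, in order: (1) allocate a fresh buffer with buf_get() while the
 * cache is still below bufpages; (2) otherwise reuse a clean buffer from
 * the requested size queue or any larger one; (3) otherwise free up to 20
 * buffers from every queue holding more than bqpagelow pages and retry;
 * (4) otherwise sleep on needbuffer until brelse() frees something.
 */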
  956 struct buf *
  957 getnewbuf(size_t size, int slpflag, int slptimeo, int *ep)
  958 {
  959         struct buf *bp;
  960         int s, error, queue, qs;
  961 
  962 #if 0           /* we would really like this but sblock update kills it */
  963         KASSERT(curproc != syncerproc && curproc != cleanerproc);
  964 #endif
  965 
  966         s = splbio();
  967         /*
  968          * Wake up cleaner if we're getting low on pages.
  969          */
  970         if (numdirtypages >= hidirtypages || numcleanpages <= locleanpages)
  971                 wakeup(&bd_req);
  972 
  973         /* we just ask. it can say no.. */
  974 getsome:
  975         qs = size;
  976         queue = size2cqueue(&qs);
  977         bp = buf_get(qs); /* XXX use qs instead and no need in buf_get? */
  978         if (bp == NULL) {
  979                 /*
  980                  * No free ones, try to reuse a clean one of the same or
  981                  * larger size.
  982                  */
  983                 do {
  984                         bp = TAILQ_FIRST(&bufqueues[queue]);
  985                         queue++;
  986                 } while (bp == NULL && queue < BQUEUES);
  987         }
  988         if (bp == NULL) {
  989                 /* we couldn't reuse a free one, nothing of the right size */
  990                 /* XXX free 20 buffers per q - ugly hack  should really
  991                  * reuse big ones without truncating. fix later 
  992                  */
  993                 int q, gotsome = 0;
  994                 int freemax = 20; 
  995                 for (q = 1; q < BQUEUES; q++) {
  996                         int i = freemax;
  997                         while (bqpages[q] > bqpagelow
  998                             && (bp = TAILQ_FIRST(&bufqueues[q]))
  999                             && i--) {
 1000                                 gotsome++;
 1001                                 bremfree(bp);
 1002                                 if (LIST_FIRST(&bp->b_dep) != NULL)
 1003                                         buf_deallocate(bp);
 1004 
 1005                                 if (ISSET(bp->b_flags, B_DELWRI)) {
 1006                                         CLR(bp->b_flags, B_DELWRI);
 1007                                 }
 1008 
 1009                                 if (bp->b_vp)
 1010                                         brelvp(bp);
 1011 
 1012                                 buf_put(bp);
 1013                         }
 1014                 }
 1015                 if (gotsome)
 1016                         goto getsome;
 1017         }
 1018         if (bp == NULL) {
 1019                 /* wait for a free buffer of any kind */
 1020                 needbuffer++;
 1021                 error = tsleep(&needbuffer, slpflag | (PRIBIO + 1),
 1022                     "getnewbuf", slptimeo);
 1023                 if (ep != NULL) {
 1024                         *ep = error;
 1025                         if (error) {
 1026                                 splx(s);
 1027                                 return (NULL);
 1028                         }
 1029                 }
 1030                 goto getsome;
 1031         }
 1032 
 1033         bremfree(bp);
 1034         /* Buffer is no longer on free lists. */
 1035         SET(bp->b_flags, B_BUSY);
 1036 
 1037 #ifdef DIAGNOSTIC
 1038         if (ISSET(bp->b_flags, B_DELWRI))
 1039                 panic("Dirty buffer on BQ_CLEAN");
 1040 #endif
 1041 
 1042         /* disassociate us from our vnode, if we had one... */
 1043         if (bp->b_vp)
 1044                 brelvp(bp);
 1045 
 1046         splx(s);
 1047 
 1048 #ifdef DIAGNOSTIC
 1049         /* CLEAN buffers must have no dependencies */ 
 1050         if (LIST_FIRST(&bp->b_dep) != NULL)
 1051                 panic("BQ_CLEAN has buffer with dependencies");
 1052 #endif
 1053 
 1054         /* clear out various other fields */
 1055         bp->b_flags = B_BUSY;
 1056         bp->b_dev = NODEV;
 1057         bp->b_blkno = bp->b_lblkno = 0;
 1058         bp->b_iodone = NULL;
 1059         bp->b_error = 0;
 1060         bp->b_resid = 0;
 1061         bp->b_bcount = size;
 1062         bp->b_dirtyoff = bp->b_dirtyend = 0;
 1063         bp->b_validoff = bp->b_validend = 0;
 1064 
 1065         bremhash(bp);
 1066         return (bp);
 1067 }
 1068 
 1069 /*
 1070  * Buffer cleaning daemon.
 1071  */
 1072 void
 1073 buf_daemon(struct proc *p)
 1074 {
 1075         struct timeval starttime, timediff;
 1076         struct buf *bp;
 1077         int s;
 1078 
 1079         cleanerproc = curproc;
 1080 
 1081         s = splbio();
 1082         for (;;) {
 1083                 if (!numdirtypages ||
 1084                     (numdirtypages < hidirtypages && !needbuffer))
 1085                         tsleep(&bd_req, PRIBIO - 7, "cleaner", 0);
 1086 
 1087                 getmicrouptime(&starttime);
 1088 
 1089                 while ((bp = TAILQ_FIRST(&bufqueues[BQ_DIRTY]))) {
 1090                         struct timeval tv;
 1091 
 1092                         if (numdirtypages < lodirtypages && !needbuffer)
 1093                                 break;
 1094 
 1095                         bremfree(bp);
 1096                         SET(bp->b_flags, B_BUSY);
 1097                         splx(s);
 1098 
 1099                         if (ISSET(bp->b_flags, B_INVAL)) {
 1100                                 brelse(bp);
 1101                                 s = splbio();
 1102                                 continue;
 1103                         }
 1104 #ifdef DIAGNOSTIC
 1105                         if (!ISSET(bp->b_flags, B_DELWRI))
 1106                                 panic("Clean buffer on BQ_DIRTY");
 1107 #endif
 1108                         if (LIST_FIRST(&bp->b_dep) != NULL &&
 1109                             !ISSET(bp->b_flags, B_DEFERRED) &&
 1110                             buf_countdeps(bp, 0, 0)) {
 1111                                 SET(bp->b_flags, B_DEFERRED);
 1112                                 s = splbio();
 1113                                 numfreepages += btoc(bp->b_bufsize);
 1114                                 numdirtypages += btoc(bp->b_bufsize);
 1115                                 binstailfree(bp, &bufqueues[BQ_DIRTY]);
 1116                                 CLR(bp->b_flags, B_BUSY);
 1117                                 continue;
 1118                         }
 1119 
 1120                         bawrite(bp);
 1121 
 1122                         /* Never allow processing to run for more than 1 sec */
 1123                         getmicrouptime(&tv);
 1124                         timersub(&tv, &starttime, &timediff);
 1125                         if (timediff.tv_sec)
 1126                                 break;
 1127 
 1128                         s = splbio();
 1129                 }
 1130         }
 1131 }
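/*
 * Note on the loop above: the cleaner sleeps on bd_req and is woken (from
 * getnewbuf() in this file) once numdirtypages reaches hidirtypages or
 * numcleanpages drops to locleanpages.  Each pass pushes dirty buffers out
 * with bawrite() until numdirtypages falls below lodirtypages and no
 * process is waiting on needbuffer, and voluntarily stops after about one
 * second of work.
 */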
 1132 
 1133 /*
 1134  * Wait for operations on the buffer to complete.
 1135  * When they do, extract and return the I/O's error value.
 1136  */
 1137 int
 1138 biowait(struct buf *bp)
 1139 {
 1140         int s;
 1141 
 1142         s = splbio();
 1143         while (!ISSET(bp->b_flags, B_DONE))
 1144                 tsleep(bp, PRIBIO + 1, "biowait", 0);
 1145         splx(s);
 1146 
 1147         /* check for interruption of I/O (e.g. via NFS), then errors. */
 1148         if (ISSET(bp->b_flags, B_EINTR)) {
 1149                 CLR(bp->b_flags, B_EINTR);
 1150                 return (EINTR);
 1151         }
 1152 
 1153         if (ISSET(bp->b_flags, B_ERROR))
 1154                 return (bp->b_error ? bp->b_error : EIO);
 1155         else
 1156                 return (0);
 1157 }
 1158 
 1159 /*
 1160  * Mark I/O complete on a buffer.
 1161  *
 1162  * If a callback has been requested, e.g. the pageout
 1163  * daemon, do so. Otherwise, awaken waiting processes.
 1164  *
 1165  * [ Leffler, et al., says on p.247:
 1166  *      "This routine wakes up the blocked process, frees the buffer
 1167  *      for an asynchronous write, or, for a request by the pagedaemon
 1168  *      process, invokes a procedure specified in the buffer structure" ]
 1169  *
 1170  * In real life, the pagedaemon (or other system processes) wants
 1171  * to do async stuff too, and doesn't want the buffer brelse()'d.
 1172  * (for swap pager, that puts swap buffers on the free lists (!!!),
 1173  * for the vn device, that puts malloc'd buffers on the free lists!)
 1174  *
 1175  * Must be called at splbio().
 1176  */
 1177 void
 1178 biodone(struct buf *bp)
 1179 {
 1180         splassert(IPL_BIO);
 1181 
 1182         if (ISSET(bp->b_flags, B_DONE))
 1183                 panic("biodone already");
 1184         SET(bp->b_flags, B_DONE);               /* note that it's done */
 1185 
 1186         if (LIST_FIRST(&bp->b_dep) != NULL)
 1187                 buf_complete(bp);
 1188 
 1189         if (!ISSET(bp->b_flags, B_READ)) {
 1190                 CLR(bp->b_flags, B_WRITEINPROG);
 1191                 vwakeup(bp->b_vp);
 1192         }
 1193 
 1194         if (ISSET(bp->b_flags, B_CALL)) {       /* if necessary, call out */
 1195                 CLR(bp->b_flags, B_CALL);       /* but note callout done */
 1196                 (*bp->b_iodone)(bp);
 1197         } else {
 1198                 if (ISSET(bp->b_flags, B_ASYNC)) {/* if async, release it */
 1199                         brelse(bp);
 1200                 } else {                        /* or just wakeup the buffer */
 1201                         CLR(bp->b_flags, B_WANTED);
 1202                         wakeup(bp);
 1203                 }
 1204         }
 1205 }
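/*
 * Example of the B_CALL path (as used by bread_cluster() above): the caller
 * points b_iodone at its completion routine and sets B_READ | B_ASYNC |
 * B_CALL before VOP_STRATEGY(); when the transfer completes, the driver
 * calls biodone() from interrupt context, which clears B_CALL and invokes
 * the routine at splbio() instead of doing the brelse()/wakeup() handling
 * itself.
 */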
 1206 
 1207 #if 1
 1208 void
 1209 vfs_bufstats(void) {
 1210         return;
 1211 }
 1212 /* #ifdef DDB */
 1213 #else
 1214 /*
 1215  * Print out statistics on the current allocation of the buffer pool.
 1216  * Can be enabled to print out on every ``sync'' by setting "syncprt"
 1217  * in vfs_syscalls.c using sysctl.
 1218  */
 1219 void
 1220 vfs_bufstats(void)
 1221 {
 1222         int s, i, j, count;
 1223         struct buf *bp;
 1224         struct bqueues *dp;
 1225         int counts[MAXBSIZE/PAGE_SIZE+1];
 1226         int totals[BQUEUES];
 1227         long ptotals[BQUEUES];
 1228         long pages;
 1229         static char *bname[BQUEUES] = { "CLEAN", "DIRTY", "EMPTY" };
 1230 
 1231         s = splbio();
 1232         for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
 1233                 count = 0;
 1234                 pages = 0;
 1235                 for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
 1236                         counts[j] = 0;
 1237                 TAILQ_FOREACH(bp, dp, b_freelist) {
 1238                         counts[bp->b_bufsize/PAGE_SIZE]++;
 1239                         count++;
 1240                         pages += btoc(bp->b_bufsize);
 1241                 }
 1242                 totals[i] = count;
 1243                 ptotals[i] = pages;
 1244                 printf("%s: total-%d(%d pages)", bname[i], count, pages);
 1245                 for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
 1246                         if (counts[j] != 0)
 1247                                 printf(", %d-%d", j * PAGE_SIZE, counts[j]);
 1248                 printf("\n");
 1249         }
 1250         if ((ptotals[BQ_CLEAN] + ptotals[BQ_DIRTY]) != numfreepages)
 1251                 printf("numfreepages counter wrong: %ld != %ld\n",
 1252                     numfreepages, ptotals[BQ_CLEAN] + ptotals[BQ_DIRTY]);
 1253         if (ptotals[BQ_CLEAN] != numcleanpages)
 1254                 printf("numcleanpages counter wrong: %ld != %ld\n",
 1255                     numcleanpages, ptotals[BQ_CLEAN]);
 1256         else
 1257                 printf("numcleanpages: %ld\n", numcleanpages);
 1258         if (numdirtypages != ptotals[BQ_DIRTY])
 1259                 printf("numdirtypages counter wrong: %ld != %ld\n",
 1260                     numdirtypages, ptotals[BQ_DIRTY]);
 1261         else
 1262                 printf("numdirtypages: %ld\n", numdirtypages);
 1263 
 1264         printf("syncer eating up to %ld pages from %ld reserved\n",
 1265             maxcleanpages - hicleanpages, locleanpages);
 1266         splx(s);
 1267 }
 1268 #endif /* DEBUG */
