root/kern/vfs_sync.c


DEFINITIONS

This source file includes the following definitions.
  1. vn_initialize_syncerd
  2. vn_syncer_add_to_worklist
  3. sched_sync
  4. speedup_syncer
  5. vfs_allocate_syncvnode
  6. sync_fsync
  7. sync_inactive
  8. sync_print

/*	$OpenBSD: vfs_sync.c,v 1.43 2007/06/01 23:47:56 deraadt Exp $	*/

/*
 *  Portions of this code are:
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Syncer daemon
 */

#include <sys/queue.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/malloc.h>

#include <sys/kernel.h>
#include <sys/sched.h>

#ifdef FFS_SOFTUPDATES
int   softdep_process_worklist(struct mount *);
#endif

/*
 * The workitem queue.
 */
#define SYNCER_MAXDELAY 32              /* maximum sync delay time */
#define SYNCER_DEFAULT 30               /* default sync delay time */
int syncer_maxdelay = SYNCER_MAXDELAY;  /* maximum delay time */
time_t syncdelay = SYNCER_DEFAULT;      /* time to delay syncing vnodes */

int rushjob = 0;                        /* number of slots to run ASAP */
int stat_rush_requests = 0;             /* number of rush requests */

static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

struct proc *syncerproc;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed. To realize this,
 * we append vnodes to a "workitem" queue. When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, mounted block devices
 * are delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed. Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process). The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *      syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */

void
vn_initialize_syncerd(void)
{
        syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, M_WAITOK,
            &syncer_mask);
        syncer_maxdelay = syncer_mask + 1;
}
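
/*
 * Editor's sketch (not part of the original file): hashinit() rounds
 * syncer_maxdelay up to a power of two and hands back that size minus
 * one in syncer_mask, which is why a slot can be picked with a cheap
 * AND instead of a modulo.  A minimal illustration of the arithmetic
 * the comment above describes, using the names declared in this file:
 */
#if 0
static int
example_slot_for_delay(int delay)
{
        /* "15 seconds from now" == 15 entries past the current slot. */
        return ((syncer_delayno + delay) & syncer_mask);
}
#endif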

/*
 * Add an item to the syncer work queue.
 */
void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
        int s, slot;

        if (delay > syncer_maxdelay - 2)
                delay = syncer_maxdelay - 2;
        slot = (syncer_delayno + delay) & syncer_mask;

        s = splbio();
        if (vp->v_bioflag & VBIOONSYNCLIST)
                LIST_REMOVE(vp, v_synclist);

        vp->v_bioflag |= VBIOONSYNCLIST;
        LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
        splx(s);
}
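
/*
 * Editor's sketch (not part of the original file): callers pick the
 * delay according to how critical the data is, per the workitem queue
 * comment above (file data at syncdelay, mounted block devices at half
 * that, directories at a third).  A hypothetical condensed form of how
 * a caller such as reassignbuf() might schedule a dirty vnode:
 */
#if 0
static void
example_schedule(struct vnode *vp)
{
        int delay = syncdelay;                  /* plain file data */

        if (vp->v_type == VDIR)
                delay = syncdelay / 3;          /* directories: more critical */
        else if (vp->v_type == VBLK && vp->v_specmountpoint != NULL)
                delay = syncdelay / 2;          /* mounted block devices */
        vn_syncer_add_to_worklist(vp, delay);
}
#endif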

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(struct proc *p)
{
        struct synclist *slp;
        struct vnode *vp;
        long starttime;
        int s;

        syncerproc = curproc;

        for (;;) {
                starttime = time_second;

                /*
                 * Push files whose dirty time has expired.
                 */
                s = splbio();
                slp = &syncer_workitem_pending[syncer_delayno];

                syncer_delayno += 1;
                if (syncer_delayno == syncer_maxdelay)
                        syncer_delayno = 0;

                while ((vp = LIST_FIRST(slp)) != NULL) {
                        if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT, p)) {
                                /*
                                 * If we fail to get the lock, we move this
                                 * vnode one second ahead in time.
                                 * XXX - no good, but the best we can do.
                                 */
                                vn_syncer_add_to_worklist(vp, 1);
                                continue;
                        }
                        splx(s);
                        (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
                        vput(vp);
                        s = splbio();
                        if (LIST_FIRST(slp) == vp) {
                                /*
                                 * Note: disk vps can remain on the
                                 * worklist too with no dirty blocks, but
                                 * since sync_fsync() moves them to a
                                 * different slot we are safe.
                                 */
#ifdef DIAGNOSTIC
                                if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
                                    vp->v_type != VBLK) {
                                        vprint("fsync failed", vp);
                                        if (vp->v_mount != NULL)
                                                printf("mounted on: %s\n",
                                                    vp->v_mount->mnt_stat.f_mntonname);
                                        panic("sched_sync: fsync failed");
                                }
#endif /* DIAGNOSTIC */
                                /*
                                 * Put us back on the worklist.  The worklist
                                 * routine will remove us from our current
                                 * position and then add us back in at a later
                                 * position.
                                 */
                                vn_syncer_add_to_worklist(vp, syncdelay);
                        }
                }

                splx(s);

#ifdef FFS_SOFTUPDATES
                /*
                 * Do soft update processing.
                 */
                softdep_process_worklist(NULL);
#endif

                /*
                 * The variable rushjob allows the kernel to speed up the
                 * processing of the filesystem syncer process. A rushjob
                 * value of N tells the filesystem syncer to process the next
                 * N seconds worth of work on its queue ASAP. Currently rushjob
                 * is used by the soft update code to speed up the filesystem
                 * syncer process when the incore state is getting so far
                 * ahead of the disk that the kernel memory pool is being
                 * threatened with exhaustion.
                 */
                if (rushjob > 0) {
                        rushjob -= 1;
                        continue;
                }
                /*
                 * If it has taken us less than a second to process the
                 * current work, then wait. Otherwise start right over
                 * again. We can still lose time if any single round
                 * takes more than two seconds, but it does not really
                 * matter as we are just trying to generally pace the
                 * filesystem activity.
                 */
                if (time_second == starttime)
                        tsleep(&lbolt, PPAUSE, "syncer", 0);
        }
}
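
/*
 * Editor's sketch (not part of the original file): when rushjob > 0
 * the loop above skips the tsleep() and immediately processes the next
 * slot, so a rushjob of N drains N extra seconds worth of work back to
 * back.  The pacing skeleton, reduced to its essentials (the worker
 * name is hypothetical):
 */
#if 0
static void
example_pacing(void)
{
        for (;;) {
                process_one_slot();             /* one second of queued work */
                if (rushjob > 0) {
                        rushjob -= 1;           /* burn a credit, no sleep */
                        continue;
                }
                /* Pace to roughly one slot per second. */
                tsleep(&lbolt, PPAUSE, "syncer", 0);
        }
}
#endif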

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer(void)
{
        int s;

        SCHED_LOCK(s);
        if (syncerproc && syncerproc->p_wchan == &lbolt)
                setrunnable(syncerproc);
        SCHED_UNLOCK(s);
        if (rushjob < syncdelay / 2) {
                rushjob += 1;
                stat_rush_requests += 1;
                return 1;
        }
        return 0;
}
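
/*
 * Editor's sketch (not part of the original file): the soft update
 * code is the main caller, requesting extra passes when its in-core
 * worklist outruns the disk.  A hypothetical condensed form of such a
 * caller (names are illustrative, not from this file):
 */
#if 0
static void
example_maybe_speedup(int worklist_len, int threshold)
{
        /* Ask for an extra one-second pass; capped at syncdelay / 2. */
        if (worklist_len > threshold)
                (void) speedup_syncer();
}
#endif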

/*
 * Routine to create and manage a filesystem syncer vnode.
 */
#define sync_close nullop
int   sync_fsync(void *);
int   sync_inactive(void *);
#define sync_reclaim nullop
#define sync_lock vop_generic_lock
#define sync_unlock vop_generic_unlock
int   sync_print(void *);
#define sync_islocked vop_generic_islocked

int (**sync_vnodeop_p)(void *);
struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
      { &vop_default_desc, vn_default_error },
      { &vop_close_desc, sync_close },                /* close */
      { &vop_fsync_desc, sync_fsync },                /* fsync */
      { &vop_inactive_desc, sync_inactive },          /* inactive */
      { &vop_reclaim_desc, sync_reclaim },            /* reclaim */
      { &vop_lock_desc, sync_lock },                  /* lock */
      { &vop_unlock_desc, sync_unlock },              /* unlock */
      { &vop_print_desc, sync_print },                /* print */
      { &vop_islocked_desc, sync_islocked },          /* islocked */
      { (struct vnodeop_desc*)NULL, (int(*)(void *))NULL }
};
struct vnodeopv_desc sync_vnodeop_opv_desc = {
        &sync_vnodeop_p, sync_vnodeop_entries
};
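
/*
 * Editor's sketch (not part of the original file): the opv_desc above
 * is consumed at boot when the kernel builds its vnode operation
 * tables, filling in sync_vnodeop_p so VOP_FSYNC() and friends on a
 * syncer vnode dispatch to the sync_* handlers below.  Roughly what a
 * VOP_FSYNC() call boils down to (a sketch, assuming the vnode's v_op
 * table and the vdesc_offset field of struct vnodeop_desc):
 */
#if 0
static int
example_dispatch_fsync(struct vnode *vp, struct ucred *cred, int waitfor,
    struct proc *p)
{
        struct vop_fsync_args a;

        a.a_desc = &vop_fsync_desc;
        a.a_vp = vp;
        a.a_cred = cred;
        a.a_waitfor = waitfor;
        a.a_p = p;
        /* Index the per-filesystem table by the operation's offset. */
        return ((*(vp->v_op[vop_fsync_desc.vdesc_offset]))(&a));
}
#endif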

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
int
vfs_allocate_syncvnode(struct mount *mp)
{
        struct vnode *vp;
        static long start, incr, next;
        int error;

        /* Allocate a new vnode */
        if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
                mp->mnt_syncer = NULL;
                return (error);
        }
        vp->v_writecount = 1;
        vp->v_type = VNON;
        /*
         * Place the vnode onto the syncer worklist. We attempt to
         * scatter them about on the list so that they will go off
         * at evenly distributed times even if all the filesystems
         * are mounted at once.
         */
        next += incr;
        if (next == 0 || next > syncer_maxdelay) {
                start /= 2;
                incr /= 2;
                if (start == 0) {
                        start = syncer_maxdelay / 2;
                        incr = syncer_maxdelay;
                }
                next = start;
        }
        vn_syncer_add_to_worklist(vp, next);
        mp->mnt_syncer = vp;
        return (0);
}
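
/*
 * Editor's sketch (not part of the original file): the start/incr/next
 * dance above is a binary subdivision of the wheel.  With
 * syncer_maxdelay == 32, successive mounts receive delays 16, 8, 24,
 * 4, 12, 20, 28, 2, 6, 10, ... so the per-mount syncer vnodes stay
 * spread out even when many filesystems are mounted back to back.  A
 * standalone replay of that arithmetic:
 */
#if 0
static long
example_next_scatter_slot(long maxdelay)
{
        static long start, incr, next;

        next += incr;
        if (next == 0 || next > maxdelay) {
                /* Halve the stride; restart at maxdelay/2 when exhausted. */
                start /= 2;
                incr /= 2;
                if (start == 0) {
                        start = maxdelay / 2;
                        incr = maxdelay;
                }
                next = start;
        }
        return (next);
}
#endif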

/*
 * Do a lazy sync of the filesystem.
 */
int
sync_fsync(void *v)
{
        struct vop_fsync_args *ap = v;
        struct vnode *syncvp = ap->a_vp;
        struct mount *mp = syncvp->v_mount;
        int asyncflag;

        /*
         * We only need to do something if this is a lazy evaluation.
         */
        if (ap->a_waitfor != MNT_LAZY)
                return (0);

        /*
         * Move ourselves to the back of the sync list.
         */
        vn_syncer_add_to_worklist(syncvp, syncdelay);

        /*
         * Walk the list of vnodes pushing all that are dirty and
         * not already on the sync list.
         */
        if (vfs_busy(mp, VB_READ|VB_NOWAIT) == 0) {
                asyncflag = mp->mnt_flag & MNT_ASYNC;
                mp->mnt_flag &= ~MNT_ASYNC;
                VFS_SYNC(mp, MNT_LAZY, ap->a_cred, ap->a_p);
                if (asyncflag)
                        mp->mnt_flag |= MNT_ASYNC;
                vfs_unbusy(mp);
        }

        return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 */
int
sync_inactive(void *v)
{
        struct vop_inactive_args *ap = v;

        struct vnode *vp = ap->a_vp;
        int s;

        if (vp->v_usecount == 0) {
                VOP_UNLOCK(vp, 0, ap->a_p);
                return (0);
        }

        vp->v_mount->mnt_syncer = NULL;

        s = splbio();

        LIST_REMOVE(vp, v_synclist);
        vp->v_bioflag &= ~VBIOONSYNCLIST;

        splx(s);

        vp->v_writecount = 0;
        vput(vp);

        return (0);
}

/*
 * Print out a syncer vnode.
 */
int
sync_print(void *v)
{
        printf("syncer vnode\n");

        return (0);
}
