1 /* $OpenBSD: ffs_softdep.c,v 1.92 2007/07/11 15:32:22 millert Exp $ */
2
3 /*
4 * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
5 *
6 * The soft updates code is derived from the appendix of a University
7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 * "Soft Updates: A Solution to the Metadata Update Problem in File
9 * Systems", CSE-TR-254-95, August 1995).
10 *
11 * Further information about soft updates can be obtained from:
12 *
13 * Marshall Kirk McKusick http://www.mckusick.com/softdep/
14 * 1614 Oxford Street mckusick@mckusick.com
15 * Berkeley, CA 94709-1608 +1-510-843-9542
16 * USA
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 *
22 * 1. Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 * notice, this list of conditions and the following disclaimer in the
26 * documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
29 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
30 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
31 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
32 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
41 * $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.86 2001/02/04 16:08:18 phk Exp $
42 */
43
44 #include <sys/param.h>
45 #include <sys/buf.h>
46 #include <sys/kernel.h>
47 #include <sys/malloc.h>
48 #include <sys/mount.h>
49 #include <sys/proc.h>
50 #include <sys/pool.h>
51 #include <sys/syslog.h>
52 #include <sys/systm.h>
53 #include <sys/vnode.h>
54 #include <miscfs/specfs/specdev.h>
55 #include <ufs/ufs/dir.h>
56 #include <ufs/ufs/quota.h>
57 #include <ufs/ufs/inode.h>
58 #include <ufs/ufs/ufsmount.h>
59 #include <ufs/ffs/fs.h>
60 #include <ufs/ffs/softdep.h>
61 #include <ufs/ffs/ffs_extern.h>
62 #include <ufs/ufs/ufs_extern.h>
63
64 #define STATIC
65
66 /*
67 * Mapping of dependency structure types to malloc types.
68 */
69 #define D_PAGEDEP 0
70 #define D_INODEDEP 1
71 #define D_NEWBLK 2
72 #define D_BMSAFEMAP 3
73 #define D_ALLOCDIRECT 4
74 #define D_INDIRDEP 5
75 #define D_ALLOCINDIR 6
76 #define D_FREEFRAG 7
77 #define D_FREEBLKS 8
78 #define D_FREEFILE 9
79 #define D_DIRADD 10
80 #define D_MKDIR 11
81 #define D_DIRREM 12
82 #define D_NEWDIRBLK 13
83 #define D_LAST 13
84 /*
85 * Names of softdep types.
86 */
87 const char *softdep_typenames[] = {
88 "pagedep",
89 "inodedep",
90 "newblk",
91 "bmsafemap",
92 "allocdirect",
93 "indirdep",
94 "allocindir",
95 "freefrag",
96 "freeblks",
97 "freefile",
98 "diradd",
99 "mkdir",
100 "dirrem",
101 "newdirblk",
102 };
103 #define TYPENAME(type) \
104 ((unsigned)(type) <= D_LAST ? softdep_typenames[type] : "???")
105 /*
106 * Finding the current process.
107 */
108 #define CURPROC curproc
109 /*
110 * End system adaptation definitions.
111 */
112
113 /*
114 * Internal function prototypes.
115 */
116 STATIC void softdep_error(char *, int);
117 STATIC void drain_output(struct vnode *, int);
118 STATIC int getdirtybuf(struct buf *, int);
119 STATIC void clear_remove(struct proc *);
120 STATIC void clear_inodedeps(struct proc *);
121 STATIC int flush_pagedep_deps(struct vnode *, struct mount *,
122 struct diraddhd *);
123 STATIC int flush_inodedep_deps(struct fs *, ino_t);
124 STATIC int handle_written_filepage(struct pagedep *, struct buf *);
125 STATIC void diradd_inode_written(struct diradd *, struct inodedep *);
126 STATIC int handle_written_inodeblock(struct inodedep *, struct buf *);
127 STATIC void handle_allocdirect_partdone(struct allocdirect *);
128 STATIC void handle_allocindir_partdone(struct allocindir *);
129 STATIC void initiate_write_filepage(struct pagedep *, struct buf *);
130 STATIC void handle_written_mkdir(struct mkdir *, int);
131 STATIC void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
132 #ifdef FFS2
133 STATIC void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
134 #endif
135 STATIC void handle_workitem_freefile(struct freefile *);
136 STATIC void handle_workitem_remove(struct dirrem *);
137 STATIC struct dirrem *newdirrem(struct buf *, struct inode *,
138 struct inode *, int, struct dirrem **);
139 STATIC void free_diradd(struct diradd *);
140 STATIC void free_allocindir(struct allocindir *, struct inodedep *);
141 STATIC void free_newdirblk(struct newdirblk *);
142 STATIC int indir_trunc(struct inode *, daddr_t, int, daddr64_t, long *);
143 STATIC void deallocate_dependencies(struct buf *, struct inodedep *);
144 STATIC void free_allocdirect(struct allocdirectlst *,
145 struct allocdirect *, int);
146 STATIC int check_inode_unwritten(struct inodedep *);
147 STATIC int free_inodedep(struct inodedep *);
148 STATIC void handle_workitem_freeblocks(struct freeblks *);
149 STATIC void merge_inode_lists(struct inodedep *);
150 STATIC void setup_allocindir_phase2(struct buf *, struct inode *,
151 struct allocindir *);
152 STATIC struct allocindir *newallocindir(struct inode *, int, daddr_t,
153 daddr_t);
154 STATIC void handle_workitem_freefrag(struct freefrag *);
155 STATIC struct freefrag *newfreefrag(struct inode *, daddr_t, long);
156 STATIC void allocdirect_merge(struct allocdirectlst *,
157 struct allocdirect *, struct allocdirect *);
158 STATIC struct bmsafemap *bmsafemap_lookup(struct buf *);
159 STATIC int newblk_lookup(struct fs *, daddr_t, int,
160 struct newblk **);
161 STATIC int inodedep_lookup(struct fs *, ino_t, int, struct inodedep **);
162 STATIC int pagedep_lookup(struct inode *, daddr64_t, int, struct pagedep **);
163 STATIC void pause_timer(void *);
164 STATIC int request_cleanup(int, int);
165 STATIC int process_worklist_item(struct mount *, int);
166 STATIC void add_to_worklist(struct worklist *);
167
168 /*
169 * Exported softdep operations.
170 */
171 void softdep_disk_io_initiation(struct buf *);
172 void softdep_disk_write_complete(struct buf *);
173 void softdep_deallocate_dependencies(struct buf *);
174 void softdep_move_dependencies(struct buf *, struct buf *);
175 int softdep_count_dependencies(struct buf *bp, int, int);
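
/*
 * The entry points above are installed in the bioops vector by
 * softdep_initialize(), which is how the buffer cache calls into the
 * soft updates code.
 */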
176
177 /*
178 * Locking primitives.
179 *
180 * For a uniprocessor, all we need to do is protect against disk
181 * interrupts. For a multiprocessor, this lock would have to be
182 * a mutex. A single mutex is used throughout this file, though
183 * finer grain locking could be used if contention warranted it.
184 *
185 * For a multiprocessor, the sleep call would accept a lock and
186 * release it after the sleep processing was complete. In a uniprocessor
187 * implementation there is no such interlock, so we simply mark
188 * the places where it needs to be done with the `interlocked' form
189 * of the lock calls. Since the uniprocessor sleep already interlocks
190 * the spl, there is nothing that really needs to be done.
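 *
 * The interlocked forms are used by sema_get() below: the caller saves
 * the value returned by FREE_LOCK_INTERLOCKED() across its sleep and
 * hands it back to ACQUIRE_LOCK_INTERLOCKED() when it wakes up.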
191 */
192 #ifndef /* NOT */ DEBUG
193 STATIC struct lockit {
194 int lkt_spl;
195 } lk = { 0 };
196 #define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio()
197 #define FREE_LOCK(lk) splx((lk)->lkt_spl)
198 #define ACQUIRE_LOCK_INTERLOCKED(lk,s) (lk)->lkt_spl = (s)
199 #define FREE_LOCK_INTERLOCKED(lk) ((lk)->lkt_spl)
200
201 #else /* DEBUG */
202 STATIC struct lockit {
203 int lkt_spl;
204 pid_t lkt_held;
205 int lkt_line;
206 } lk = { 0, -1 };
207 STATIC int lockcnt;
208
209 STATIC void acquire_lock(struct lockit *, int);
210 STATIC void free_lock(struct lockit *, int);
211 STATIC void acquire_lock_interlocked(struct lockit *, int, int);
212 STATIC int free_lock_interlocked(struct lockit *, int);
213
214 #define ACQUIRE_LOCK(lk) acquire_lock(lk, __LINE__)
215 #define FREE_LOCK(lk) free_lock(lk, __LINE__)
216 #define ACQUIRE_LOCK_INTERLOCKED(lk,s) acquire_lock_interlocked(lk, (s), __LINE__)
217 #define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk, __LINE__)
218
219 STATIC void
220 acquire_lock(lk, line)
221 struct lockit *lk;
222 int line;
223 {
224 pid_t holder;
225 int original_line;
226
227 if (lk->lkt_held != -1) {
228 holder = lk->lkt_held;
229 original_line = lk->lkt_line;
230 FREE_LOCK(lk);
231 if (holder == CURPROC->p_pid)
232 panic("softdep_lock: locking against myself, acquired at line %d, relocked at line %d", original_line, line);
233 else
234 panic("softdep_lock: lock held by %d, acquired at line %d, relocked at line %d", holder, original_line, line);
235 }
236 lk->lkt_spl = splbio();
237 lk->lkt_held = CURPROC->p_pid;
238 lk->lkt_line = line;
239 lockcnt++;
240 }
241
242 STATIC void
243 free_lock(lk, line)
244 struct lockit *lk;
245 int line;
246 {
247
248 if (lk->lkt_held == -1)
249 panic("softdep_unlock: lock not held at line %d", line);
250 lk->lkt_held = -1;
251 splx(lk->lkt_spl);
252 }
253
254 STATIC void
255 acquire_lock_interlocked(lk, s, line)
256 struct lockit *lk;
257 int s;
258 int line;
259 {
260 pid_t holder;
261 int original_line;
262
263 if (lk->lkt_held != -1) {
264 holder = lk->lkt_held;
265 original_line = lk->lkt_line;
266 FREE_LOCK_INTERLOCKED(lk);
267 if (holder == CURPROC->p_pid)
268 panic("softdep_lock: locking against myself, acquired at line %d, relocked at line %d", original_line, line);
269 else
270 panic("softdep_lock: lock held by %d, acquired at line %d, relocked at line %d", holder, original_line, line);
271 }
272 lk->lkt_held = CURPROC->p_pid;
273 lk->lkt_line = line;
274 lk->lkt_spl = s;
275 lockcnt++;
276 }
277
278 STATIC int
279 free_lock_interlocked(lk, line)
280 struct lockit *lk;
281 int line;
282 {
283
284 if (lk->lkt_held == -1)
285 panic("softdep_unlock_interlocked: lock not held at line %d", line);
286 lk->lkt_held = -1;
287
288 return (lk->lkt_spl);
289 }
290 #endif /* DEBUG */
291
292 /*
293 * Place holder for real semaphores.
294 */
295 struct sema {
296 int value;
297 pid_t holder;
298 char *name;
299 int prio;
300 int timo;
301 };
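
/*
 * sema_get() returns 1 when the caller has become the holder and may
 * proceed, and 0 after it has slept waiting for the current holder; in
 * the latter case the caller is expected to redo its lookup, since
 * another process may have created the entry while we slept.  The value
 * field is only a contention count, so sema_release() wakes all sleepers
 * and lets them race for the next acquisition.
 */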
302 STATIC void sema_init(struct sema *, char *, int, int);
303 STATIC int sema_get(struct sema *, struct lockit *);
304 STATIC void sema_release(struct sema *);
305
306 STATIC void
307 sema_init(semap, name, prio, timo)
308 struct sema *semap;
309 char *name;
310 int prio, timo;
311 {
312
313 semap->holder = -1;
314 semap->value = 0;
315 semap->name = name;
316 semap->prio = prio;
317 semap->timo = timo;
318 }
319
320 STATIC int
321 sema_get(semap, interlock)
322 struct sema *semap;
323 struct lockit *interlock;
324 {
325 int s;
326
327 if (semap->value++ > 0) {
328 if (interlock != NULL)
329 s = FREE_LOCK_INTERLOCKED(interlock);
330 tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
331 if (interlock != NULL) {
332 ACQUIRE_LOCK_INTERLOCKED(interlock, s);
333 FREE_LOCK(interlock);
334 }
335 return (0);
336 }
337 semap->holder = CURPROC->p_pid;
338 if (interlock != NULL)
339 FREE_LOCK(interlock);
340 return (1);
341 }
342
343 STATIC void
344 sema_release(semap)
345 struct sema *semap;
346 {
347
348 if (semap->value <= 0 || semap->holder != CURPROC->p_pid) {
349 #ifdef DEBUG
350 if (lk.lkt_held != -1)
351 FREE_LOCK(&lk);
352 #endif
353 panic("sema_release: not held");
354 }
355 if (--semap->value > 0) {
356 semap->value = 0;
357 wakeup(semap);
358 }
359 semap->holder = -1;
360 }
361
362 /*
363 * Memory management.
364 */
365 STATIC struct pool pagedep_pool;
366 STATIC struct pool inodedep_pool;
367 STATIC struct pool newblk_pool;
368 STATIC struct pool bmsafemap_pool;
369 STATIC struct pool allocdirect_pool;
370 STATIC struct pool indirdep_pool;
371 STATIC struct pool allocindir_pool;
372 STATIC struct pool freefrag_pool;
373 STATIC struct pool freeblks_pool;
374 STATIC struct pool freefile_pool;
375 STATIC struct pool diradd_pool;
376 STATIC struct pool mkdir_pool;
377 STATIC struct pool dirrem_pool;
378 STATIC struct pool newdirblk_pool;
379
380 static __inline void
381 softdep_free(struct worklist *item, int type)
382 {
383
384 switch (type) {
385 case D_PAGEDEP:
386 pool_put(&pagedep_pool, item);
387 break;
388
389 case D_INODEDEP:
390 pool_put(&inodedep_pool, item);
391 break;
392
393 case D_BMSAFEMAP:
394 pool_put(&bmsafemap_pool, item);
395 break;
396
397 case D_ALLOCDIRECT:
398 pool_put(&allocdirect_pool, item);
399 break;
400
401 case D_INDIRDEP:
402 pool_put(&indirdep_pool, item);
403 break;
404
405 case D_ALLOCINDIR:
406 pool_put(&allocindir_pool, item);
407 break;
408
409 case D_FREEFRAG:
410 pool_put(&freefrag_pool, item);
411 break;
412
413 case D_FREEBLKS:
414 pool_put(&freeblks_pool, item);
415 break;
416
417 case D_FREEFILE:
418 pool_put(&freefile_pool, item);
419 break;
420
421 case D_DIRADD:
422 pool_put(&diradd_pool, item);
423 break;
424
425 case D_MKDIR:
426 pool_put(&mkdir_pool, item);
427 break;
428
429 case D_DIRREM:
430 pool_put(&dirrem_pool, item);
431 break;
432
433 case D_NEWDIRBLK:
434 pool_put(&newdirblk_pool, item);
435 break;
436
437 default:
438 #ifdef DEBUG
439 if (lk.lkt_held != -1)
440 FREE_LOCK(&lk);
441 #endif
442 panic("softdep_free: unknown type %d", type);
443 }
444 }
445
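/*
 * Workitems released via WORKITEM_FREE while the softdep lock is held
 * (possibly from interrupt context) are collected on this queue; they
 * are handed back to the pool allocator later by
 * softdep_freequeue_process(), which drops the lock around each
 * pool_put().
 */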
446 struct workhead softdep_freequeue;
447
448 static __inline void
449 softdep_freequeue_add(struct worklist *item)
450 {
451 int s;
452
453 s = splbio();
454 LIST_INSERT_HEAD(&softdep_freequeue, item, wk_list);
455 splx(s);
456 }
457
458 static __inline void
459 softdep_freequeue_process(void)
460 {
461 struct worklist *wk;
462
463 splassert(IPL_BIO);
464
465 while ((wk = LIST_FIRST(&softdep_freequeue)) != NULL) {
466 LIST_REMOVE(wk, wk_list);
467 FREE_LOCK(&lk);
468 softdep_free(wk, wk->wk_type);
469 ACQUIRE_LOCK(&lk);
470 }
471 }
472
473 /*
474 * Worklist queue management.
475 * These routines require that the lock be held.
476 */
477 #ifndef /* NOT */ DEBUG
478 #define WORKLIST_INSERT(head, item) do { \
479 (item)->wk_state |= ONWORKLIST; \
480 LIST_INSERT_HEAD(head, item, wk_list); \
481 } while (0)
482 #define WORKLIST_REMOVE(item) do { \
483 (item)->wk_state &= ~ONWORKLIST; \
484 LIST_REMOVE(item, wk_list); \
485 } while (0)
486 #define WORKITEM_FREE(item, type) softdep_freequeue_add((struct worklist *)item)
487
488 #else /* DEBUG */
489 STATIC void worklist_insert(struct workhead *, struct worklist *);
490 STATIC void worklist_remove(struct worklist *);
491 STATIC void workitem_free(struct worklist *);
492
493 #define WORKLIST_INSERT(head, item) worklist_insert(head, item)
494 #define WORKLIST_REMOVE(item) worklist_remove(item)
495 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item)
496
497 STATIC void
498 worklist_insert(head, item)
499 struct workhead *head;
500 struct worklist *item;
501 {
502
503 if (lk.lkt_held == -1)
504 panic("worklist_insert: lock not held");
505 if (item->wk_state & ONWORKLIST) {
506 FREE_LOCK(&lk);
507 panic("worklist_insert: already on list");
508 }
509 item->wk_state |= ONWORKLIST;
510 LIST_INSERT_HEAD(head, item, wk_list);
511 }
512
513 STATIC void
514 worklist_remove(item)
515 struct worklist *item;
516 {
517
518 if (lk.lkt_held == -1)
519 panic("worklist_remove: lock not held");
520 if ((item->wk_state & ONWORKLIST) == 0) {
521 FREE_LOCK(&lk);
522 panic("worklist_remove: not on list");
523 }
524 item->wk_state &= ~ONWORKLIST;
525 LIST_REMOVE(item, wk_list);
526 }
527
528 STATIC void
529 workitem_free(item)
530 struct worklist *item;
531 {
532
533 if (item->wk_state & ONWORKLIST) {
534 if (lk.lkt_held != -1)
535 FREE_LOCK(&lk);
536 panic("workitem_free: still on list");
537 }
538 softdep_freequeue_add(item);
539 }
540 #endif /* DEBUG */
541
542 /*
543 * Workitem queue management
544 */
545 STATIC struct workhead softdep_workitem_pending;
546 STATIC struct worklist *worklist_tail;
547 STATIC int num_on_worklist; /* number of worklist items to be processed */
548 STATIC int softdep_worklist_busy; /* >0 => processing, -1 => unmount in progress */
549 STATIC int softdep_worklist_req; /* serialized waiters */
550 STATIC int max_softdeps; /* maximum number of structs before slowdown */
551 STATIC int tickdelay = 2; /* number of ticks to pause during slowdown */
552 STATIC int proc_waiting; /* tracks whether we have a timeout posted */
553 STATIC int *stat_countp; /* statistic to count in proc_waiting timeout */
554 STATIC struct timeout proc_waiting_timeout;
555 STATIC struct proc *filesys_syncer; /* proc of filesystem syncer process */
556 STATIC int req_clear_inodedeps; /* syncer process flush some inodedeps */
557 #define FLUSH_INODES 1
558 STATIC int req_clear_remove; /* syncer process flush some freeblks */
559 #define FLUSH_REMOVE 2
560 /*
561 * runtime statistics
562 */
563 STATIC int stat_worklist_push; /* number of worklist cleanups */
564 STATIC int stat_blk_limit_push; /* number of times block limit neared */
565 STATIC int stat_ino_limit_push; /* number of times inode limit neared */
566 STATIC int stat_blk_limit_hit; /* number of times block slowdown imposed */
567 STATIC int stat_ino_limit_hit; /* number of times inode slowdown imposed */
568 STATIC int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
569 STATIC int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
570 STATIC int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
571 STATIC int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
572 STATIC int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
573
574 /*
575 * Add an item to the end of the work queue.
576 * This routine requires that the lock be held.
577 * This is the only routine that adds items to the list.
578 * The process_worklist_item() routine below is the only one that removes
579 * items, and it does so in order from first to last.
580 */
581 STATIC void
582 add_to_worklist(wk)
583 struct worklist *wk;
584 {
585
586 if (wk->wk_state & ONWORKLIST) {
587 #ifdef DEBUG
588 if (lk.lkt_held != -1)
589 FREE_LOCK(&lk);
590 #endif
591 panic("add_to_worklist: already on list");
592 }
593 wk->wk_state |= ONWORKLIST;
594 if (LIST_FIRST(&softdep_workitem_pending) == NULL)
595 LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
596 else
597 LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
598 worklist_tail = wk;
599 num_on_worklist += 1;
600 }
601
602 /*
603 * Process that runs once per second to handle items in the background queue.
604 *
605 * Note that we ensure that items are processed in the order in which they
606 * appear in the queue. The code below depends on this property to ensure
607 * that blocks of a file are freed before the inode itself is freed. This
608 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
609 * until all the old ones have been purged from the dependency lists.
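 *
 * The routine returns the number of items processed that belong to the
 * given mount point (matchmnt), or -1 if processing was cut short, e.g.
 * because an unmount wants exclusive use of the worklist or because we
 * have already run for a full second.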
610 */
611 int
612 softdep_process_worklist(matchmnt)
613 struct mount *matchmnt;
614 {
615 struct proc *p = CURPROC;
616 int matchcnt, loopcount;
617 struct timeval starttime;
618
619 /*
620 * First process any items on the delayed-free queue.
621 */
622 ACQUIRE_LOCK(&lk);
623 softdep_freequeue_process();
624 FREE_LOCK(&lk);
625
626 /*
627 * Record the process identifier of our caller so that we can give
628 * this process preferential treatment in request_cleanup below.
629 * We can't do this in softdep_initialize, because the syncer process
630 * may not be running yet when that is called.
631 * NOTE! This function _could_ be called with a curproc != syncerproc.
632 */
633 filesys_syncer = syncerproc;
634 matchcnt = 0;
635
636 /*
637 * There is no danger of having multiple processes run this
638 * code, but we have to single-thread it when softdep_flushfiles()
639 * is in operation to get an accurate count of the number of items
640 * related to its mount point that are in the list.
641 */
642 if (matchmnt == NULL) {
643 if (softdep_worklist_busy < 0)
644 return(-1);
645 softdep_worklist_busy += 1;
646 }
647
648 /*
649 * If requested, try removing inode or removal dependencies.
650 */
651 if (req_clear_inodedeps) {
652 clear_inodedeps(p);
653 req_clear_inodedeps -= 1;
654 wakeup_one(&proc_waiting);
655 }
656 if (req_clear_remove) {
657 clear_remove(p);
658 req_clear_remove -= 1;
659 wakeup_one(&proc_waiting);
660 }
661 loopcount = 1;
662 getmicrouptime(&starttime);
663 while (num_on_worklist > 0) {
664 matchcnt += process_worklist_item(matchmnt, 0);
665
666 /*
667 * If a umount operation wants to run the worklist
668 * accurately, abort.
669 */
670 if (softdep_worklist_req && matchmnt == NULL) {
671 matchcnt = -1;
672 break;
673 }
674
675 /*
676 * If requested, try removing inode or removal dependencies.
677 */
678 if (req_clear_inodedeps) {
679 clear_inodedeps(p);
680 req_clear_inodedeps -= 1;
681 wakeup_one(&proc_waiting);
682 }
683 if (req_clear_remove) {
684 clear_remove(p);
685 req_clear_remove -= 1;
686 wakeup_one(&proc_waiting);
687 }
688 /*
689 * We do not generally want to stop for buffer space, but if
690 * we are really being a buffer hog, we will stop and wait.
691 */
692 #if 0
693 if (loopcount++ % 128 == 0)
694 bwillwrite();
695 #endif
696 /*
697 * Never allow processing to run for more than one
698 * second. Otherwise the other syncer tasks may get
699 * excessively backlogged.
700 */
701 {
702 struct timeval diff;
703 struct timeval tv;
704
705 getmicrouptime(&tv);
706 timersub(&tv, &starttime, &diff);
707 if (diff.tv_sec != 0 && matchmnt == NULL) {
708 matchcnt = -1;
709 break;
710 }
711 }
712
713 /*
714 * Process any new items on the delayed-free queue.
715 */
716 ACQUIRE_LOCK(&lk);
717 softdep_freequeue_process();
718 FREE_LOCK(&lk);
719 }
720 if (matchmnt == NULL) {
721 softdep_worklist_busy -= 1;
722 if (softdep_worklist_req && softdep_worklist_busy == 0)
723 wakeup(&softdep_worklist_req);
724 }
725 return (matchcnt);
726 }
727
728 /*
729 * Process one item on the worklist.
730 */
731 STATIC int
732 process_worklist_item(matchmnt, flags)
733 struct mount *matchmnt;
734 int flags;
735 {
736 struct worklist *wk, *wkend;
737 struct dirrem *dirrem;
738 struct mount *mp;
739 struct vnode *vp;
740 int matchcnt = 0;
741
742 ACQUIRE_LOCK(&lk);
743 /*
744 * Normally we just process each item on the worklist in order.
745 * However, if we are in a situation where we cannot lock any
746 * inodes, we have to skip over any dirrem requests whose
747 * vnodes are resident and locked.
748 */
749 LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
750 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
751 break;
752 dirrem = WK_DIRREM(wk);
753 vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
754 dirrem->dm_oldinum);
755 if (vp == NULL || !VOP_ISLOCKED(vp))
756 break;
757 }
758 if (wk == NULL) {
759 FREE_LOCK(&lk);
760 return (0);
761 }
762 /*
763 * Remove the item to be processed. If we are removing the last
764 * item on the list, we need to recalculate the tail pointer.
765 * As this happens rarely and usually when the list is short,
766 * we just run down the list to find it rather than tracking it
767 * in the above loop.
768 */
769 WORKLIST_REMOVE(wk);
770 if (wk == worklist_tail) {
771 LIST_FOREACH(wkend, &softdep_workitem_pending, wk_list)
772 if (LIST_NEXT(wkend, wk_list) == NULL)
773 break;
774 worklist_tail = wkend;
775 }
776 num_on_worklist -= 1;
777 FREE_LOCK(&lk);
778 switch (wk->wk_type) {
779
780 case D_DIRREM:
781 /* removal of a directory entry */
782 mp = WK_DIRREM(wk)->dm_mnt;
783 #if 0
784 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
785 panic("%s: dirrem on suspended filesystem",
786 "process_worklist_item");
787 #endif
788 if (mp == matchmnt)
789 matchcnt += 1;
790 handle_workitem_remove(WK_DIRREM(wk));
791 break;
792
793 case D_FREEBLKS:
794 /* releasing blocks and/or fragments from a file */
795 mp = WK_FREEBLKS(wk)->fb_mnt;
796 #if 0
797 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
798 panic("%s: freeblks on suspended filesystem",
799 "process_worklist_item");
800 #endif
801 if (mp == matchmnt)
802 matchcnt += 1;
803 handle_workitem_freeblocks(WK_FREEBLKS(wk));
804 break;
805
806 case D_FREEFRAG:
807 /* releasing a fragment when replaced as a file grows */
808 mp = WK_FREEFRAG(wk)->ff_mnt;
809 #if 0
810 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
811 panic("%s: freefrag on suspended filesystem",
812 "process_worklist_item");
813 #endif
814 if (mp == matchmnt)
815 matchcnt += 1;
816 handle_workitem_freefrag(WK_FREEFRAG(wk));
817 break;
818
819 case D_FREEFILE:
820 /* releasing an inode when its link count drops to 0 */
821 mp = WK_FREEFILE(wk)->fx_mnt;
822 #if 0
823 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
824 panic("%s: freefile on suspended filesystem",
825 "process_worklist_item");
826 #endif
827 if (mp == matchmnt)
828 matchcnt += 1;
829 handle_workitem_freefile(WK_FREEFILE(wk));
830 break;
831
832 default:
833 panic("%s_process_worklist: Unknown type %s",
834 "softdep", TYPENAME(wk->wk_type));
835 /* NOTREACHED */
836 }
837 return (matchcnt);
838 }
839
840 /*
841 * Move dependencies from one buffer to another.
842 */
843 void
844 softdep_move_dependencies(oldbp, newbp)
845 struct buf *oldbp;
846 struct buf *newbp;
847 {
848 struct worklist *wk, *wktail;
849
850 if (LIST_FIRST(&newbp->b_dep) != NULL)
851 panic("softdep_move_dependencies: need merge code");
852 wktail = 0;
853 ACQUIRE_LOCK(&lk);
854 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
855 LIST_REMOVE(wk, wk_list);
856 if (wktail == 0)
857 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
858 else
859 LIST_INSERT_AFTER(wktail, wk, wk_list);
860 wktail = wk;
861 }
862 FREE_LOCK(&lk);
863 }
864
865 /*
866 * Purge the work list of all items associated with a particular mount point.
867 */
868 int
869 softdep_flushworklist(oldmnt, countp, p)
870 struct mount *oldmnt;
871 int *countp;
872 struct proc *p;
873 {
874 struct vnode *devvp;
875 int count, error = 0;
876
877 /*
878 * Await our turn to clear out the queue, then serialize access.
879 */
880 while (softdep_worklist_busy) {
881 softdep_worklist_req += 1;
882 tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
883 softdep_worklist_req -= 1;
884 }
885 softdep_worklist_busy = -1;
886 /*
887 * Alternately flush the block device associated with the mount
888 * point and process any dependencies that the flushing
889 * creates. We continue until no more worklist dependencies
890 * are found.
891 */
892 *countp = 0;
893 devvp = VFSTOUFS(oldmnt)->um_devvp;
894 while ((count = softdep_process_worklist(oldmnt)) > 0) {
895 *countp += count;
896 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
897 error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
898 VOP_UNLOCK(devvp, 0, p);
899 if (error)
900 break;
901 }
902 softdep_worklist_busy = 0;
903 if (softdep_worklist_req)
904 wakeup(&softdep_worklist_req);
905 return (error);
906 }
907
908 /*
909 * Flush all vnodes and worklist items associated with a specified mount point.
910 */
911 int
912 softdep_flushfiles(oldmnt, flags, p)
913 struct mount *oldmnt;
914 int flags;
915 struct proc *p;
916 {
917 int error, count, loopcnt;
918
919 /*
920 * Alternately flush the vnodes associated with the mount
921 * point and process any dependencies that the flushing
922 * creates. In theory, this loop should iterate at most twice,
923 * but we give it a few extra passes just to be sure.
924 */
925 for (loopcnt = 10; loopcnt > 0; loopcnt--) {
926 /*
927 * Do another flush in case any vnodes were brought in
928 * as part of the cleanup operations.
929 */
930 if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
931 break;
932 if ((error = softdep_flushworklist(oldmnt, &count, p)) != 0 ||
933 count == 0)
934 break;
935 }
936 /*
937 * If we are unmounting then it is an error to fail. If we
938 * are simply trying to downgrade to read-only, then filesystem
939 * activity can keep us busy forever, so we just fail with EBUSY.
940 */
941 if (loopcnt == 0) {
942 error = EBUSY;
943 }
944 return (error);
945 }
946
947 /*
948 * Structure hashing.
949 *
950 * There are three types of structures that can be looked up:
951 * 1) pagedep structures identified by mount point, inode number,
952 * and logical block.
953 * 2) inodedep structures identified by mount point and inode number.
954 * 3) newblk structures identified by mount point and
955 * physical block number.
956 *
957 * The "pagedep" and "inodedep" dependency structures are hashed
958 * separately from the file blocks and inodes to which they correspond.
959 * This separation helps when the in-memory copy of an inode or
960 * file block must be replaced. It also obviates the need to access
961 * an inode or file page when simply updating (or de-allocating)
962 * dependency structures. Lookup of newblk structures is needed to
963 * find newly allocated blocks when trying to associate them with
964 * their allocdirect or allocindir structure.
965 *
966 * The lookup routines optionally create and hash a new instance when
967 * an existing entry is not found.
968 */
969 #define DEPALLOC 0x0001 /* allocate structure if lookup fails */
970 #define NODELAY 0x0002 /* cannot do background work */
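
/*
 * Sketch of the typical calling pattern for the allocating lookups
 * (softdep_setup_inomapdep() below is a real example):
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (inodedep_lookup(fs, inum, DEPALLOC, &inodedep) == 0) {
 *		... entry was just created; attach it to the
 *		... appropriate dependency lists
 *	}
 *	FREE_LOCK(&lk);
 */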
971
972 /*
973 * Structures and routines associated with pagedep caching.
974 */
975 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
976 u_long pagedep_hash; /* size of hash table - 1 */
977 #define PAGEDEP_HASH(mp, inum, lbn) \
978 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
979 pagedep_hash])
980 STATIC struct sema pagedep_in_progress;
981
982 /*
983 * Look up a pagedep. Return 1 if found; return 0 if not found, or if found
984 * while allocating (DEPALLOC) but not yet attached to any buffer.
985 * If not found, allocate if DEPALLOC flag is passed.
986 * Found or allocated entry is returned in pagedeppp.
987 * This routine must be called with splbio interrupts blocked.
988 */
989 STATIC int
990 pagedep_lookup(ip, lbn, flags, pagedeppp)
991 struct inode *ip;
992 daddr64_t lbn;
993 int flags;
994 struct pagedep **pagedeppp;
995 {
996 struct pagedep *pagedep;
997 struct pagedep_hashhead *pagedephd;
998 struct mount *mp;
999 int i;
1000
1001 splassert(IPL_BIO);
1002
1003 #ifdef DEBUG
1004 if (lk.lkt_held == -1)
1005 panic("pagedep_lookup: lock not held");
1006 #endif
1007 mp = ITOV(ip)->v_mount;
1008 pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
1009 top:
1010 LIST_FOREACH(pagedep, pagedephd, pd_hash)
1011 if (ip->i_number == pagedep->pd_ino &&
1012 lbn == pagedep->pd_lbn &&
1013 mp == pagedep->pd_mnt)
1014 break;
1015 if (pagedep) {
1016 *pagedeppp = pagedep;
1017 if ((flags & DEPALLOC) != 0 &&
1018 (pagedep->pd_state & ONWORKLIST) == 0)
1019 return (0);
1020 return (1);
1021 }
1022 if ((flags & DEPALLOC) == 0) {
1023 *pagedeppp = NULL;
1024 return (0);
1025 }
1026 if (sema_get(&pagedep_in_progress, &lk) == 0) {
1027 ACQUIRE_LOCK(&lk);
1028 goto top;
1029 }
1030 pagedep = pool_get(&pagedep_pool, PR_WAITOK);
1031 bzero(pagedep, sizeof(struct pagedep));
1032 pagedep->pd_list.wk_type = D_PAGEDEP;
1033 pagedep->pd_mnt = mp;
1034 pagedep->pd_ino = ip->i_number;
1035 pagedep->pd_lbn = lbn;
1036 LIST_INIT(&pagedep->pd_dirremhd);
1037 LIST_INIT(&pagedep->pd_pendinghd);
1038 for (i = 0; i < DAHASHSZ; i++)
1039 LIST_INIT(&pagedep->pd_diraddhd[i]);
1040 ACQUIRE_LOCK(&lk);
1041 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
1042 sema_release(&pagedep_in_progress);
1043 *pagedeppp = pagedep;
1044 return (0);
1045 }
1046
1047 /*
1048 * Structures and routines associated with inodedep caching.
1049 */
1050 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
1051 STATIC u_long inodedep_hash; /* size of hash table - 1 */
1052 STATIC long num_inodedep; /* number of inodedep allocated */
1053 #define INODEDEP_HASH(fs, inum) \
1054 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
1055 STATIC struct sema inodedep_in_progress;
1056
1057 /*
1058 * Look up an inodedep. Return 1 if found, 0 if not found.
1059 * If not found, allocate if DEPALLOC flag is passed.
1060 * Found or allocated entry is returned in inodedeppp.
1061 * This routine must be called with splbio interrupts blocked.
1062 */
1063 STATIC int
1064 inodedep_lookup(fs, inum, flags, inodedeppp)
1065 struct fs *fs;
1066 ino_t inum;
1067 int flags;
1068 struct inodedep **inodedeppp;
1069 {
1070 struct inodedep *inodedep;
1071 struct inodedep_hashhead *inodedephd;
1072 int firsttry;
1073
1074 splassert(IPL_BIO);
1075
1076 #ifdef DEBUG
1077 if (lk.lkt_held == -1)
1078 panic("inodedep_lookup: lock not held");
1079 #endif
1080 firsttry = 1;
1081 inodedephd = INODEDEP_HASH(fs, inum);
1082 top:
1083 LIST_FOREACH(inodedep, inodedephd, id_hash)
1084 if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1085 break;
1086 if (inodedep) {
1087 *inodedeppp = inodedep;
1088 return (1);
1089 }
1090 if ((flags & DEPALLOC) == 0) {
1091 *inodedeppp = NULL;
1092 return (0);
1093 }
1094 /*
1095 * If we are over our limit, try to improve the situation.
1096 */
1097 if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 &&
1098 request_cleanup(FLUSH_INODES, 1)) {
1099 firsttry = 0;
1100 goto top;
1101 }
1102 if (sema_get(&inodedep_in_progress, &lk) == 0) {
1103 ACQUIRE_LOCK(&lk);
1104 goto top;
1105 }
1106 num_inodedep += 1;
1107 inodedep = pool_get(&inodedep_pool, PR_WAITOK);
1108 inodedep->id_list.wk_type = D_INODEDEP;
1109 inodedep->id_fs = fs;
1110 inodedep->id_ino = inum;
1111 inodedep->id_state = ALLCOMPLETE;
1112 inodedep->id_nlinkdelta = 0;
1113 inodedep->id_savedino1 = NULL;
1114 inodedep->id_savedsize = -1;
1115 inodedep->id_buf = NULL;
1116 LIST_INIT(&inodedep->id_pendinghd);
1117 LIST_INIT(&inodedep->id_inowait);
1118 LIST_INIT(&inodedep->id_bufwait);
1119 TAILQ_INIT(&inodedep->id_inoupdt);
1120 TAILQ_INIT(&inodedep->id_newinoupdt);
1121 ACQUIRE_LOCK(&lk);
1122 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1123 sema_release(&inodedep_in_progress);
1124 *inodedeppp = inodedep;
1125 return (0);
1126 }
1127
1128 /*
1129 * Structures and routines associated with newblk caching.
1130 */
1131 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1132 u_long newblk_hash; /* size of hash table - 1 */
1133 #define NEWBLK_HASH(fs, inum) \
1134 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1135 STATIC struct sema newblk_in_progress;
1136
1137 /*
1138 * Look up a newblk. Return 1 if found, 0 if not found.
1139 * If not found, allocate if DEPALLOC flag is passed.
1140 * Found or allocated entry is returned in newblkpp.
1141 */
1142 STATIC int
1143 newblk_lookup(fs, newblkno, flags, newblkpp)
1144 struct fs *fs;
1145 daddr_t newblkno;
1146 int flags;
1147 struct newblk **newblkpp;
1148 {
1149 struct newblk *newblk;
1150 struct newblk_hashhead *newblkhd;
1151
1152 newblkhd = NEWBLK_HASH(fs, newblkno);
1153 top:
1154 LIST_FOREACH(newblk, newblkhd, nb_hash)
1155 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1156 break;
1157 if (newblk) {
1158 *newblkpp = newblk;
1159 return (1);
1160 }
1161 if ((flags & DEPALLOC) == 0) {
1162 *newblkpp = NULL;
1163 return (0);
1164 }
1165 if (sema_get(&newblk_in_progress, 0) == 0)
1166 goto top;
1167 newblk = pool_get(&newblk_pool, PR_WAITOK);
1168 newblk->nb_state = 0;
1169 newblk->nb_fs = fs;
1170 newblk->nb_newblkno = newblkno;
1171 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1172 sema_release(&newblk_in_progress);
1173 *newblkpp = newblk;
1174 return (0);
1175 }
1176
1177 /*
1178 * Executed during filesystem initialization before
1179 * mounting any file systems.
1180 */
1181 void
1182 softdep_initialize()
1183 {
1184
1185 bioops.io_start = softdep_disk_io_initiation;
1186 bioops.io_complete = softdep_disk_write_complete;
1187 bioops.io_deallocate = softdep_deallocate_dependencies;
1188 bioops.io_movedeps = softdep_move_dependencies;
1189 bioops.io_countdeps = softdep_count_dependencies;
1190
1191 LIST_INIT(&mkdirlisthd);
1192 LIST_INIT(&softdep_workitem_pending);
1193 #ifdef KMEMSTATS
1194 max_softdeps = min (desiredvnodes * 8,
1195 kmemstats[M_INODEDEP].ks_limit / (2 * sizeof(struct inodedep)));
1196 #else
1197 max_softdeps = desiredvnodes * 4;
1198 #endif
1199 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, M_WAITOK,
1200 &pagedep_hash);
1201 sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
1202 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, M_WAITOK,
1203 &inodedep_hash);
1204 sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
1205 newblk_hashtbl = hashinit(64, M_NEWBLK, M_WAITOK, &newblk_hash);
1206 sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
1207 timeout_set(&proc_waiting_timeout, pause_timer, 0);
1208 pool_init(&pagedep_pool, sizeof(struct pagedep), 0, 0, 0,
1209 "pagedeppl", &pool_allocator_nointr);
1210 pool_init(&inodedep_pool, sizeof(struct inodedep), 0, 0, 0,
1211 "inodedeppl", &pool_allocator_nointr);
1212 pool_init(&newblk_pool, sizeof(struct newblk), 0, 0, 0,
1213 "newblkpl", &pool_allocator_nointr);
1214 pool_init(&bmsafemap_pool, sizeof(struct bmsafemap), 0, 0, 0,
1215 "bmsafemappl", &pool_allocator_nointr);
1216 pool_init(&allocdirect_pool, sizeof(struct allocdirect), 0, 0, 0,
1217 "allocdirectpl", &pool_allocator_nointr);
1218 pool_init(&indirdep_pool, sizeof(struct indirdep), 0, 0, 0,
1219 "indirdeppl", &pool_allocator_nointr);
1220 pool_init(&allocindir_pool, sizeof(struct allocindir), 0, 0, 0,
1221 "allocindirpl", &pool_allocator_nointr);
1222 pool_init(&freefrag_pool, sizeof(struct freefrag), 0, 0, 0,
1223 "freefragpl", &pool_allocator_nointr);
1224 pool_init(&freeblks_pool, sizeof(struct freeblks), 0, 0, 0,
1225 "freeblkspl", &pool_allocator_nointr);
1226 pool_init(&freefile_pool, sizeof(struct freefile), 0, 0, 0,
1227 "freefilepl", &pool_allocator_nointr);
1228 pool_init(&diradd_pool, sizeof(struct diradd), 0, 0, 0,
1229 "diraddpl", &pool_allocator_nointr);
1230 pool_init(&mkdir_pool, sizeof(struct mkdir), 0, 0, 0,
1231 "mkdirpl", &pool_allocator_nointr);
1232 pool_init(&dirrem_pool, sizeof(struct dirrem), 0, 0, 0,
1233 "dirrempl", &pool_allocator_nointr);
1234 pool_init(&newdirblk_pool, sizeof(struct newdirblk), 0, 0, 0,
1235 "newdirblkpl", &pool_allocator_nointr);
1236 }
1237
1238 /*
1239 * Called at mount time to notify the dependency code that a
1240 * filesystem wishes to use it.
1241 */
1242 int
1243 softdep_mount(devvp, mp, fs, cred)
1244 struct vnode *devvp;
1245 struct mount *mp;
1246 struct fs *fs;
1247 struct ucred *cred;
1248 {
1249 struct csum_total cstotal;
1250 struct cg *cgp;
1251 struct buf *bp;
1252 int error, cyl;
1253
1254 /*
1255 * When doing soft updates, the counters in the
1256 * superblock may have gotten out of sync, so we have
1257 * to scan the cylinder groups and recalculate them.
1258 */
1259 if ((fs->fs_flags & FS_UNCLEAN) == 0)
1260 return (0);
1261 bzero(&cstotal, sizeof cstotal);
1262 for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1263 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1264 fs->fs_cgsize, cred, &bp)) != 0) {
1265 brelse(bp);
1266 return (error);
1267 }
1268 cgp = (struct cg *)bp->b_data;
1269 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1270 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1271 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1272 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1273 fs->fs_cs(fs, cyl) = cgp->cg_cs;
1274 brelse(bp);
1275 }
1276 #ifdef DEBUG
1277 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1278 printf("ffs_mountfs: superblock updated for soft updates\n");
1279 #endif
1280 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1281 return (0);
1282 }
1283
1284 /*
1285 * Protecting the freemaps (or bitmaps).
1286 *
1287 * To eliminate the need to execute fsck before mounting a file system
1288 * after a power failure, one must (conservatively) guarantee that the
1289 * on-disk copy of the bitmaps never indicates that a live inode or block is
1290 * free. So, when a block or inode is allocated, the bitmap should be
1291 * updated (on disk) before any new pointers. When a block or inode is
1292 * freed, the bitmap should not be updated until all pointers have been
1293 * reset. The latter dependency is handled by the delayed de-allocation
1294 * approach described below for block and inode de-allocation. The former
1295 * dependency is handled by calling the following procedure when a block or
1296 * inode is allocated. When an inode is allocated an "inodedep" is created
1297 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1298 * Each "inodedep" is also inserted into the hash indexing structure so
1299 * that any additional link additions can be made dependent on the inode
1300 * allocation.
1301 *
1302 * The ufs file system maintains a number of free block counts (e.g., per
1303 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1304 * in addition to the bitmaps. These counts are used to improve efficiency
1305 * during allocation and therefore must be consistent with the bitmaps.
1306 * There is no convenient way to guarantee post-crash consistency of these
1307 * counts with simple update ordering, for two main reasons: (1) The counts
1308 * and bitmaps for a single cylinder group block are not in the same disk
1309 * sector. If a disk write is interrupted (e.g., by power failure), one may
1310 * be written and the other not. (2) Some of the counts are located in the
1311 * superblock rather than the cylinder group block. So, we focus our soft
1312 * updates implementation on protecting the bitmaps. When mounting a
1313 * filesystem, we recompute the auxiliary counts from the bitmaps.
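 *
 * For example, once softdep_setup_inomapdep() below has run for a newly
 * allocated inode, a directory entry naming that inode will not be
 * committed to disk until the cylinder group buffer holding the inode
 * bitmap has been written.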
1314 */
1315
1316 /*
1317 * Called just after updating the cylinder group block to allocate an inode.
1318 */
1319 void
1320 softdep_setup_inomapdep(bp, ip, newinum)
1321 struct buf *bp; /* buffer for cylgroup block with inode map */
1322 struct inode *ip; /* inode related to allocation */
1323 ino_t newinum; /* new inode number being allocated */
1324 {
1325 struct inodedep *inodedep;
1326 struct bmsafemap *bmsafemap;
1327
1328 /*
1329 * Create a dependency for the newly allocated inode.
1330 * Panic if it already exists as something is seriously wrong.
1331 * Otherwise add it to the dependency list for the buffer holding
1332 * the cylinder group map from which it was allocated.
1333 */
1334 ACQUIRE_LOCK(&lk);
1335 if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC | NODELAY, &inodedep)
1336 != 0) {
1337 FREE_LOCK(&lk);
1338 panic("softdep_setup_inomapdep: found inode");
1339 }
1340 inodedep->id_buf = bp;
1341 inodedep->id_state &= ~DEPCOMPLETE;
1342 bmsafemap = bmsafemap_lookup(bp);
1343 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1344 FREE_LOCK(&lk);
1345 }
1346
1347 /*
1348 * Called just after updating the cylinder group block to
1349 * allocate block or fragment.
1350 */
1351 void
1352 softdep_setup_blkmapdep(bp, fs, newblkno)
1353 struct buf *bp; /* buffer for cylgroup block with block map */
1354 struct fs *fs; /* filesystem doing allocation */
1355 daddr_t newblkno; /* number of newly allocated block */
1356 {
1357 struct newblk *newblk;
1358 struct bmsafemap *bmsafemap;
1359
1360 /*
1361 * Create a dependency for the newly allocated block.
1362 * Add it to the dependency list for the buffer holding
1363 * the cylinder group map from which it was allocated.
1364 */
1365 if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1366 panic("softdep_setup_blkmapdep: found block");
1367 ACQUIRE_LOCK(&lk);
1368 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1369 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1370 FREE_LOCK(&lk);
1371 }
1372
1373 /*
1374 * Find the bmsafemap associated with a cylinder group buffer.
1375 * If none exists, create one. The buffer must be locked when
1376 * this routine is called and this routine must be called with
1377 * splbio interrupts blocked.
1378 */
1379 STATIC struct bmsafemap *
1380 bmsafemap_lookup(bp)
1381 struct buf *bp;
1382 {
1383 struct bmsafemap *bmsafemap;
1384 struct worklist *wk;
1385
1386 splassert(IPL_BIO);
1387
1388 #ifdef DEBUG
1389 if (lk.lkt_held == -1)
1390 panic("bmsafemap_lookup: lock not held");
1391 #endif
1392 LIST_FOREACH(wk, &bp->b_dep, wk_list)
1393 if (wk->wk_type == D_BMSAFEMAP)
1394 return (WK_BMSAFEMAP(wk));
1395 FREE_LOCK(&lk);
1396 bmsafemap = pool_get(&bmsafemap_pool, PR_WAITOK);
1397 bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1398 bmsafemap->sm_list.wk_state = 0;
1399 bmsafemap->sm_buf = bp;
1400 LIST_INIT(&bmsafemap->sm_allocdirecthd);
1401 LIST_INIT(&bmsafemap->sm_allocindirhd);
1402 LIST_INIT(&bmsafemap->sm_inodedephd);
1403 LIST_INIT(&bmsafemap->sm_newblkhd);
1404 ACQUIRE_LOCK(&lk);
1405 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1406 return (bmsafemap);
1407 }
1408
1409 /*
1410 * Direct block allocation dependencies.
1411 *
1412 * When a new block is allocated, the corresponding disk locations must be
1413 * initialized (with zeros or new data) before the on-disk inode points to
1414 * them. Also, the freemap from which the block was allocated must be
1415 * updated (on disk) before the inode's pointer. These two dependencies are
1416 * independent of each other and are needed for all file blocks and indirect
1417 * blocks that are pointed to directly by the inode. Just before the
1418 * "in-core" version of the inode is updated with a newly allocated block
1419 * number, a procedure (below) is called to setup allocation dependency
1420 * structures. These structures are removed when the corresponding
1421 * dependencies are satisfied or when the block allocation becomes obsolete
1422 * (i.e., the file is deleted, the block is de-allocated, or the block is a
1423 * fragment that gets upgraded). All of these cases are handled in
1424 * procedures described later.
1425 *
1426 * When a file extension causes a fragment to be upgraded, either to a larger
1427 * fragment or to a full block, the on-disk location may change (if the
1428 * previous fragment could not simply be extended). In this case, the old
1429 * fragment must be de-allocated, but not until after the inode's pointer has
1430 * been updated. In most cases, this is handled by later procedures, which
1431 * will construct a "freefrag" structure to be added to the workitem queue
1432 * when the inode update is complete (or obsolete). The main exception to
1433 * this is when an allocation occurs while a pending allocation dependency
1434 * (for the same block pointer) remains. This case is handled in the main
1435 * allocation dependency setup procedure by immediately freeing the
1436 * unreferenced fragments.
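 *
 * For example, when a write extends a file so that a fragment must be
 * reallocated as a full block at a new disk address, the old fragment
 * is described by a "freefrag" workitem that is not processed until the
 * inode pointing at the new block has been written.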
1437 */
1438 void
1439 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1440 struct inode *ip; /* inode to which block is being added */
1441 daddr64_t lbn; /* block pointer within inode */
1442 daddr_t newblkno; /* disk block number being added */
1443 daddr_t oldblkno; /* previous block number, 0 unless frag */
1444 long newsize; /* size of new block */
1445 long oldsize; /* size of former block */
1446 struct buf *bp; /* bp for allocated block */
1447 {
1448 struct allocdirect *adp, *oldadp;
1449 struct allocdirectlst *adphead;
1450 struct bmsafemap *bmsafemap;
1451 struct inodedep *inodedep;
1452 struct pagedep *pagedep;
1453 struct newblk *newblk;
1454
1455 adp = pool_get(&allocdirect_pool, PR_WAITOK);
1456 bzero(adp, sizeof(struct allocdirect));
1457 adp->ad_list.wk_type = D_ALLOCDIRECT;
1458 adp->ad_lbn = lbn;
1459 adp->ad_newblkno = newblkno;
1460 adp->ad_oldblkno = oldblkno;
1461 adp->ad_newsize = newsize;
1462 adp->ad_oldsize = oldsize;
1463 adp->ad_state = ATTACHED;
1464 LIST_INIT(&adp->ad_newdirblk);
1465 if (newblkno == oldblkno)
1466 adp->ad_freefrag = NULL;
1467 else
1468 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1469
1470 if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1471 panic("softdep_setup_allocdirect: lost block");
1472
1473 ACQUIRE_LOCK(&lk);
1474 inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1475 adp->ad_inodedep = inodedep;
1476
1477 if (newblk->nb_state == DEPCOMPLETE) {
1478 adp->ad_state |= DEPCOMPLETE;
1479 adp->ad_buf = NULL;
1480 } else {
1481 bmsafemap = newblk->nb_bmsafemap;
1482 adp->ad_buf = bmsafemap->sm_buf;
1483 LIST_REMOVE(newblk, nb_deps);
1484 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1485 }
1486 LIST_REMOVE(newblk, nb_hash);
1487 pool_put(&newblk_pool, newblk);
1488
1489 if (bp == NULL) {
1490 /*
1491 * XXXUBC - Yes, I know how to fix this, but not right now.
1492 */
1493 panic("softdep_setup_allocdirect: Bonk art in the head");
1494 }
1495 WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1496 if (lbn >= NDADDR) {
1497 /* allocating an indirect block */
1498 if (oldblkno != 0) {
1499 FREE_LOCK(&lk);
1500 panic("softdep_setup_allocdirect: non-zero indir");
1501 }
1502 } else {
1503 /*
1504 * Allocating a direct block.
1505 *
1506 * If we are allocating a directory block, then we must
1507 * allocate an associated pagedep to track additions and
1508 * deletions.
1509 */
1510 if ((DIP(ip, mode) & IFMT) == IFDIR &&
1511 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1512 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1513 }
1514 /*
1515 * The list of allocdirects must be kept in sorted and ascending
1516 * order so that the rollback routines can quickly determine the
1517 * first uncommitted block (the size of the file stored on disk
1518 * ends at the end of the lowest committed fragment, or if there
1519 * are no fragments, at the end of the highest committed block).
1520 * Since files generally grow, the typical case is that the new
1521 * block is to be added at the end of the list. We speed this
1522 * special case by checking against the last allocdirect in the
1523 * list before laboriously traversing the list looking for the
1524 * insertion point.
1525 */
1526 adphead = &inodedep->id_newinoupdt;
1527 oldadp = TAILQ_LAST(adphead, allocdirectlst);
1528 if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1529 /* insert at end of list */
1530 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1531 if (oldadp != NULL && oldadp->ad_lbn == lbn)
1532 allocdirect_merge(adphead, adp, oldadp);
1533 FREE_LOCK(&lk);
1534 return;
1535 }
1536 TAILQ_FOREACH(oldadp, adphead, ad_next) {
1537 if (oldadp->ad_lbn >= lbn)
1538 break;
1539 }
1540 if (oldadp == NULL) {
1541 FREE_LOCK(&lk);
1542 panic("softdep_setup_allocdirect: lost entry");
1543 }
1544 /* insert in middle of list */
1545 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1546 if (oldadp->ad_lbn == lbn)
1547 allocdirect_merge(adphead, adp, oldadp);
1548 FREE_LOCK(&lk);
1549 }
1550
1551 /*
1552 * Replace an old allocdirect dependency with a newer one.
1553 * This routine must be called with splbio interrupts blocked.
1554 */
1555 STATIC void
1556 allocdirect_merge(adphead, newadp, oldadp)
1557 struct allocdirectlst *adphead; /* head of list holding allocdirects */
1558 struct allocdirect *newadp; /* allocdirect being added */
1559 struct allocdirect *oldadp; /* existing allocdirect being checked */
1560 {
1561 struct worklist *wk;
1562 struct freefrag *freefrag;
1563 struct newdirblk *newdirblk;
1564
1565 splassert(IPL_BIO);
1566
1567 #ifdef DEBUG
1568 if (lk.lkt_held == -1)
1569 panic("allocdirect_merge: lock not held");
1570 #endif
1571 if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1572 newadp->ad_oldsize != oldadp->ad_newsize ||
1573 newadp->ad_lbn >= NDADDR) {
1574 FREE_LOCK(&lk);
1575 panic("allocdirect_merge: old %d != new %d || lbn %ld >= %d",
1576 newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
1577 NDADDR);
1578 }
1579 newadp->ad_oldblkno = oldadp->ad_oldblkno;
1580 newadp->ad_oldsize = oldadp->ad_oldsize;
1581 /*
1582 * If the old dependency had a fragment to free or had never
1583 * previously had a block allocated, then the new dependency
1584 * can immediately post its freefrag and adopt the old freefrag.
1585 * This action is done by swapping the freefrag dependencies.
1586 * The new dependency gains the old one's freefrag, and the
1587 * old one gets the new one and then immediately puts it on
1588 * the worklist when it is freed by free_allocdirect. It is
1589 * not possible to do this swap when the old dependency had a
1590 * non-zero size but no previous fragment to free. This condition
1591 * arises when the new block is an extension of the old block.
1592 * Here, the first part of the fragment allocated to the new
1593 * dependency is part of the block currently claimed on disk by
1594 * the old dependency, so cannot legitimately be freed until the
1595 * conditions for the new dependency are fulfilled.
1596 */
1597 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1598 freefrag = newadp->ad_freefrag;
1599 newadp->ad_freefrag = oldadp->ad_freefrag;
1600 oldadp->ad_freefrag = freefrag;
1601 }
1602 /*
1603 * If we are tracking a new directory-block allocation,
1604 * move it from the old allocdirect to the new allocdirect.
1605 */
1606 if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
1607 newdirblk = WK_NEWDIRBLK(wk);
1608 WORKLIST_REMOVE(&newdirblk->db_list);
1609 if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL)
1610 panic("allocdirect_merge: extra newdirblk");
1611 WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
1612 }
1613 free_allocdirect(adphead, oldadp, 0);
1614 }
1615
1616 /*
1617 * Allocate a new freefrag structure if needed.
1618 */
1619 STATIC struct freefrag *
1620 newfreefrag(ip, blkno, size)
1621 struct inode *ip;
1622 daddr_t blkno;
1623 long size;
1624 {
1625 struct freefrag *freefrag;
1626 struct fs *fs;
1627
1628 if (blkno == 0)
1629 return (NULL);
1630 fs = ip->i_fs;
1631 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1632 panic("newfreefrag: frag size");
1633 freefrag = pool_get(&freefrag_pool, PR_WAITOK);
1634 freefrag->ff_list.wk_type = D_FREEFRAG;
1635 freefrag->ff_state = DIP(ip, uid) & ~ONWORKLIST; /* used below */
1636 freefrag->ff_inum = ip->i_number;
1637 freefrag->ff_mnt = ITOV(ip)->v_mount;
1638 freefrag->ff_devvp = ip->i_devvp;
1639 freefrag->ff_blkno = blkno;
1640 freefrag->ff_fragsize = size;
1641 return (freefrag);
1642 }
1643
1644 /*
1645 * This workitem de-allocates fragments that were replaced during
1646 * file block allocation.
1647 */
1648 STATIC void
1649 handle_workitem_freefrag(freefrag)
1650 struct freefrag *freefrag;
1651 {
1652 struct inode tip;
1653 struct ufs1_dinode dtip1;
1654
1655 tip.i_vnode = NULL;
1656 tip.i_din1 = &dtip1;
1657 tip.i_fs = VFSTOUFS(freefrag->ff_mnt)->um_fs;
1658 tip.i_ump = VFSTOUFS(freefrag->ff_mnt);
1659 tip.i_dev = freefrag->ff_devvp->v_rdev;
1660 tip.i_number = freefrag->ff_inum;
1661 tip.i_ffs1_uid = freefrag->ff_state & ~ONWORKLIST; /* set above */
1662 ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
1663 pool_put(&freefrag_pool, freefrag);
1664 }
1665
1666 /*
1667 * Indirect block allocation dependencies.
1668 *
1669 * The same dependencies that exist for a direct block also exist when
1670 * a new block is allocated and pointed to by an entry in a block of
1671 * indirect pointers. The undo/redo states described above are also
1672 * used here. Because an indirect block contains many pointers that
1673 * may have dependencies, a second copy of the entire in-memory indirect
1674 * block is kept. The buffer cache copy is always completely up-to-date.
1675 * The second copy, which is used only as a source for disk writes,
1676 * contains only the safe pointers (i.e., those that have no remaining
1677 * update dependencies). The second copy is freed when all pointers
1678 * are safe. The cache is not allowed to replace indirect blocks with
1679 * pending update dependencies. If a buffer containing an indirect
1680 * block with dependencies is written, these routines will mark it
1681 * dirty again. It can only be successfully written once all the
1682 * dependencies are removed. The ffs_fsync routine in conjunction with
1683 * softdep_sync_metadata work together to get all the dependencies
1684 * removed so that a file can be successfully written to disk. Three
1685 * procedures are used when setting up indirect block pointer
1686 * dependencies. The division is necessary because of the organization
1687 * of the "balloc" routine and because of the distinction between file
1688 * pages and file metadata blocks.
1689 */
1690
1691 /*
1692 * Allocate a new allocindir structure.
1693 */
1694 STATIC struct allocindir *
1695 newallocindir(ip, ptrno, newblkno, oldblkno)
1696 struct inode *ip; /* inode for file being extended */
1697 int ptrno; /* offset of pointer in indirect block */
1698 daddr_t newblkno; /* disk block number being added */
1699 daddr_t oldblkno; /* previous block number, 0 if none */
1700 {
1701 struct allocindir *aip;
1702
1703 aip = pool_get(&allocindir_pool, PR_WAITOK);
1704 bzero(aip,sizeof(struct allocindir));
1705 aip->ai_list.wk_type = D_ALLOCINDIR;
1706 aip->ai_state = ATTACHED;
1707 aip->ai_offset = ptrno;
1708 aip->ai_newblkno = newblkno;
1709 aip->ai_oldblkno = oldblkno;
1710 aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1711 return (aip);
1712 }
1713
1714 /*
1715 * Called just before setting an indirect block pointer
1716 * to a newly allocated file page.
1717 */
1718 void
1719 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1720 struct inode *ip; /* inode for file being extended */
1721 daddr64_t lbn; /* allocated block number within file */
1722 struct buf *bp; /* buffer with indirect blk referencing page */
1723 int ptrno; /* offset of pointer in indirect block */
1724 daddr_t newblkno; /* disk block number being added */
1725 daddr_t oldblkno; /* previous block number, 0 if none */
1726 struct buf *nbp; /* buffer holding allocated page */
1727 {
1728 struct allocindir *aip;
1729 struct pagedep *pagedep;
1730
1731 aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1732 ACQUIRE_LOCK(&lk);
1733 /*
1734 * If we are allocating a directory page, then we must
1735 * allocate an associated pagedep to track additions and
1736 * deletions.
1737 */
1738 if ((DIP(ip, mode) & IFMT) == IFDIR &&
1739 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1740 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1741 if (nbp == NULL) {
1742 /*
1743 * XXXUBC - Yes, I know how to fix this, but not right now.
1744 */
1745 panic("softdep_setup_allocindir_page: Bonk art in the head");
1746 }
1747 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1748 FREE_LOCK(&lk);
1749 setup_allocindir_phase2(bp, ip, aip);
1750 }
1751
1752 /*
1753 * Called just before setting an indirect block pointer to a
1754 * newly allocated indirect block.
1755 */
1756 void
1757 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1758 struct buf *nbp; /* newly allocated indirect block */
1759 struct inode *ip; /* inode for file being extended */
1760 struct buf *bp; /* indirect block referencing allocated block */
1761 int ptrno; /* offset of pointer in indirect block */
1762 daddr_t newblkno; /* disk block number being added */
1763 {
1764 struct allocindir *aip;
1765
1766 aip = newallocindir(ip, ptrno, newblkno, 0);
1767 ACQUIRE_LOCK(&lk);
1768 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1769 FREE_LOCK(&lk);
1770 setup_allocindir_phase2(bp, ip, aip);
1771 }
1772
1773 /*
1774 * Called to finish the allocation of the "aip" allocated
1775 * by one of the two routines above.
1776 */
1777 STATIC void
1778 setup_allocindir_phase2(bp, ip, aip)
1779 struct buf *bp; /* in-memory copy of the indirect block */
1780 struct inode *ip; /* inode for file being extended */
1781 struct allocindir *aip; /* allocindir allocated by the above routines */
1782 {
1783 struct worklist *wk;
1784 struct indirdep *indirdep, *newindirdep;
1785 struct bmsafemap *bmsafemap;
1786 struct allocindir *oldaip;
1787 struct freefrag *freefrag;
1788 struct newblk *newblk;
1789
1790 if (bp->b_lblkno >= 0)
1791 panic("setup_allocindir_phase2: not indir blk");
1792 for (indirdep = NULL, newindirdep = NULL; ; ) {
1793 ACQUIRE_LOCK(&lk);
1794 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
1795 if (wk->wk_type != D_INDIRDEP)
1796 continue;
1797 indirdep = WK_INDIRDEP(wk);
1798 break;
1799 }
1800 if (indirdep == NULL && newindirdep) {
1801 indirdep = newindirdep;
1802 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1803 newindirdep = NULL;
1804 }
1805 FREE_LOCK(&lk);
1806 if (indirdep) {
1807 if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1808 &newblk) == 0)
1809 panic("setup_allocindir: lost block");
1810 ACQUIRE_LOCK(&lk);
1811 if (newblk->nb_state == DEPCOMPLETE) {
1812 aip->ai_state |= DEPCOMPLETE;
1813 aip->ai_buf = NULL;
1814 } else {
1815 bmsafemap = newblk->nb_bmsafemap;
1816 aip->ai_buf = bmsafemap->sm_buf;
1817 LIST_REMOVE(newblk, nb_deps);
1818 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1819 aip, ai_deps);
1820 }
1821 LIST_REMOVE(newblk, nb_hash);
1822 pool_put(&newblk_pool, newblk);
1823 aip->ai_indirdep = indirdep;
1824 /*
1825 * Check to see if there is an existing dependency
1826 * for this block. If there is, merge the old
1827 * dependency into the new one.
1828 */
1829 if (aip->ai_oldblkno == 0)
1830 oldaip = NULL;
1831 else
1833 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
1834 if (oldaip->ai_offset == aip->ai_offset)
1835 break;
1836 freefrag = NULL;
1837 if (oldaip != NULL) {
1838 if (oldaip->ai_newblkno != aip->ai_oldblkno) {
1839 FREE_LOCK(&lk);
1840 panic("setup_allocindir_phase2: blkno");
1841 }
1842 aip->ai_oldblkno = oldaip->ai_oldblkno;
1843 freefrag = aip->ai_freefrag;
1844 aip->ai_freefrag = oldaip->ai_freefrag;
1845 oldaip->ai_freefrag = NULL;
1846 free_allocindir(oldaip, NULL);
1847 }
1848 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1849 if (ip->i_ump->um_fstype == UM_UFS1)
1850 ((int32_t *)indirdep->ir_savebp->b_data)
1851 [aip->ai_offset] = aip->ai_oldblkno;
1852 else
1853 ((int64_t *)indirdep->ir_savebp->b_data)
1854 [aip->ai_offset] = aip->ai_oldblkno;
1855 FREE_LOCK(&lk);
1856 if (freefrag != NULL)
1857 handle_workitem_freefrag(freefrag);
1858 }
1859 if (newindirdep) {
1860 if (indirdep->ir_savebp != NULL)
1861 brelse(newindirdep->ir_savebp);
1862 WORKITEM_FREE(newindirdep, D_INDIRDEP);
1863 }
1864 if (indirdep)
1865 break;
1866 newindirdep = pool_get(&indirdep_pool, PR_WAITOK);
1867 newindirdep->ir_list.wk_type = D_INDIRDEP;
1868 newindirdep->ir_state = ATTACHED;
1869 if (ip->i_ump->um_fstype == UM_UFS1)
1870 newindirdep->ir_state |= UFS1FMT;
1871 LIST_INIT(&newindirdep->ir_deplisthd);
1872 LIST_INIT(&newindirdep->ir_donehd);
1873 if (bp->b_blkno == bp->b_lblkno) {
1874 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
1875 NULL);
1876 }
1877 newindirdep->ir_savebp =
1878 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
1879 #if 0
1880 BUF_KERNPROC(newindirdep->ir_savebp);
1881 #endif
1882 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1883 }
1884 }
1885
1886 /*
1887 * Block de-allocation dependencies.
1888 *
1889 * When blocks are de-allocated, the on-disk pointers must be nullified before
1890 * the blocks are made available for use by other files. (The true
1891 * requirement is that old pointers must be nullified before new on-disk
1892 * pointers are set. We chose this slightly more stringent requirement to
1893 * reduce complexity.) Our implementation handles this dependency by updating
1894 * the inode (or indirect block) appropriately but delaying the actual block
1895 * de-allocation (i.e., freemap and free space count manipulation) until
1896 * after the updated versions reach stable storage. After the disk is
1897 * updated, the blocks can be safely de-allocated whenever it is convenient.
1898 * This implementation handles only the common case of reducing a file's
1899 * length to zero. Other cases are handled by the conventional synchronous
1900 * write approach.
1901 *
1902 * The ffs implementation with which we worked double-checks
1903 * the state of the block pointers and file size as it reduces
1904 * a file's length. Some of this code is replicated here in our
1905 * soft updates implementation. The freeblks->fb_chkcnt field is
1906 * used to transfer a part of this information to the procedure
1907 * that eventually de-allocates the blocks.
1908 *
1909 * This routine should be called from the routine that shortens
1910 * a file's length, before the inode's size or block pointers
1911 * are modified. It will save the block pointer information for
1912 * later release and zero the inode so that the calling routine
1913 * can release it.
1914 */
1915 void
1916 softdep_setup_freeblocks(ip, length)
1917 struct inode *ip; /* The inode whose length is to be reduced */
1918 off_t length; /* The new length for the file */
1919 {
1920 struct freeblks *freeblks;
1921 struct inodedep *inodedep;
1922 struct allocdirect *adp;
1923 struct vnode *vp;
1924 struct buf *bp;
1925 struct fs *fs;
1926 int i, delay, error;
1927
1928 fs = ip->i_fs;
1929 if (length != 0)
1930 panic("softdep_setup_freeblocks: non-zero length");
1931 freeblks = pool_get(&freeblks_pool, PR_WAITOK);
1932 bzero(freeblks, sizeof(struct freeblks));
1933 freeblks->fb_list.wk_type = D_FREEBLKS;
1934 freeblks->fb_state = ATTACHED;
1935 freeblks->fb_uid = DIP(ip, uid);
1936 freeblks->fb_previousinum = ip->i_number;
1937 freeblks->fb_devvp = ip->i_devvp;
1938 freeblks->fb_mnt = ITOV(ip)->v_mount;
1939 freeblks->fb_oldsize = DIP(ip, size);
1940 freeblks->fb_newsize = length;
1941 freeblks->fb_chkcnt = DIP(ip, blocks);
1942
1943 for (i = 0; i < NDADDR; i++) {
1944 freeblks->fb_dblks[i] = DIP(ip, db[i]);
1945 DIP_ASSIGN(ip, db[i], 0);
1946 }
1947
1948 for (i = 0; i < NIADDR; i++) {
1949 freeblks->fb_iblks[i] = DIP(ip, ib[i]);
1950 DIP_ASSIGN(ip, ib[i], 0);
1951 }
1952
1953 DIP_ASSIGN(ip, blocks, 0);
1954 DIP_ASSIGN(ip, size, 0);
1955
1956 /*
1957 * Push the zero'ed inode to its disk buffer so that we are free
1958 * to delete its dependencies below. Once the dependencies are gone
1959 * the buffer can be safely released.
1960 */
1961 if ((error = bread(ip->i_devvp,
1962 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1963 (int)fs->fs_bsize, NOCRED, &bp)) != 0)
1964 softdep_error("softdep_setup_freeblocks", error);
1965
1966 if (ip->i_ump->um_fstype == UM_UFS1)
1967 *((struct ufs1_dinode *) bp->b_data +
1968 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
1969 else
1970 *((struct ufs2_dinode *) bp->b_data +
1971 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
1972
1973 /*
1974 * Find and eliminate any inode dependencies.
1975 */
1976 ACQUIRE_LOCK(&lk);
1977 (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1978 if ((inodedep->id_state & IOSTARTED) != 0) {
1979 FREE_LOCK(&lk);
1980 panic("softdep_setup_freeblocks: inode busy");
1981 }
1982 /*
1983 * Add the freeblks structure to the list of operations that
1984 * must await the zero'ed inode being written to disk. If we
1985 * still have a bitmap dependency (delay == 0), then the inode
1986 * has never been written to disk, so we can process the
1987 * freeblks below once we have deleted the dependencies.
1988 */
1989 delay = (inodedep->id_state & DEPCOMPLETE);
1990 if (delay)
1991 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1992 /*
1993 * Because the file length has been truncated to zero, any
1994 * pending block allocation dependency structures associated
1995 * with this inode are obsolete and can simply be de-allocated.
1996 * We must first merge the two dependency lists to get rid of
1997 * any duplicate freefrag structures, then purge the merged list.
1998 * If we still have a bitmap dependency, then the inode has never
1999 * been written to disk, so we can free any fragments without delay.
2000 */
2001 merge_inode_lists(inodedep);
2002 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
2003 free_allocdirect(&inodedep->id_inoupdt, adp, delay);
2004 FREE_LOCK(&lk);
2005 bdwrite(bp);
2006 /*
2007 * We must wait for any I/O in progress to finish so that
2008 * all potential buffers on the dirty list will be visible.
2009 * Once they are all there, walk the list and get rid of
2010 * any dependencies.
2011 */
2012 vp = ITOV(ip);
2013 ACQUIRE_LOCK(&lk);
2014 drain_output(vp, 1);
2015 while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
2016 if (!getdirtybuf(bp, MNT_WAIT))
2017 break;
2018 (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
2019 deallocate_dependencies(bp, inodedep);
2020 bp->b_flags |= B_INVAL | B_NOCACHE;
2021 FREE_LOCK(&lk);
2022 brelse(bp);
2023 ACQUIRE_LOCK(&lk);
2024 }
2025 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
2026 (void) free_inodedep(inodedep);
2027
2028 if (delay) {
2029 freeblks->fb_state |= DEPCOMPLETE;
2030 /*
2031 * If the inode with zeroed block pointers is now on disk we
2032 * can start freeing blocks. Add freeblks to the worklist
2033 * instead of calling handle_workitem_freeblocks() directly as
2034 * it is more likely that additional IO is needed to complete
2035 * the request than in the !delay case.
2036 */
2037 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
2038 add_to_worklist(&freeblks->fb_list);
2039 }
2040
2041 FREE_LOCK(&lk);
2042 /*
2043 * If the inode has never been written to disk (delay == 0),
2044 * then we can process the freeblks now that we have deleted
2045 * the dependencies.
2046 */
2047 if (!delay)
2048 handle_workitem_freeblocks(freeblks);
2049 }
2050
2051 /*
2052 * Reclaim any dependency structures from a buffer that is about to
2053 * be reallocated to a new vnode. The buffer must be locked, thus,
2054 * no I/O completion operations can occur while we are manipulating
2055 * its associated dependencies. The mutex is held so that other I/O's
2056 * associated with related dependencies do not occur.
2057 */
2058 STATIC void
2059 deallocate_dependencies(bp, inodedep)
2060 struct buf *bp;
2061 struct inodedep *inodedep;
2062 {
2063 struct worklist *wk;
2064 struct indirdep *indirdep;
2065 struct allocindir *aip;
2066 struct pagedep *pagedep;
2067 struct dirrem *dirrem;
2068 struct diradd *dap;
2069 int i;
2070
2071 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2072 switch (wk->wk_type) {
2073
2074 case D_INDIRDEP:
2075 indirdep = WK_INDIRDEP(wk);
2076 /*
2077 * None of the indirect pointers will ever be visible,
2078 * so they can simply be tossed. GOINGAWAY ensures
2079 * that allocated pointers will be saved in the buffer
2080 * cache until they are freed. Note that they will
2081 * only be able to be found by their physical address
2082 * since the inode mapping the logical address will
2083 * be gone. The save buffer used for the safe copy
2084 * was allocated in setup_allocindir_phase2 using
2085 * the physical address so it could be used for this
2086 * purpose. Hence we swap the safe copy with the real
2087 * copy, allowing the safe copy to be freed and holding
2088 * on to the real copy for later use in indir_trunc.
2089 */
2090 if (indirdep->ir_state & GOINGAWAY) {
2091 FREE_LOCK(&lk);
2092 panic("deallocate_dependencies: already gone");
2093 }
2094 indirdep->ir_state |= GOINGAWAY;
2095 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
2096 free_allocindir(aip, inodedep);
2097 if (bp->b_lblkno >= 0 ||
2098 bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
2099 FREE_LOCK(&lk);
2100 panic("deallocate_dependencies: not indir");
2101 }
2102 bcopy(bp->b_data, indirdep->ir_savebp->b_data,
2103 bp->b_bcount);
2104 WORKLIST_REMOVE(wk);
2105 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
2106 continue;
2107
2108 case D_PAGEDEP:
2109 pagedep = WK_PAGEDEP(wk);
2110 /*
2111 * None of the directory additions will ever be
2112 * visible, so they can simply be tossed.
2113 */
2114 for (i = 0; i < DAHASHSZ; i++)
2115 while ((dap =
2116 LIST_FIRST(&pagedep->pd_diraddhd[i])))
2117 free_diradd(dap);
2118 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
2119 free_diradd(dap);
2120 /*
2121 * Copy any directory remove dependencies to the list
2122 * to be processed after the zero'ed inode is written.
2123 * If the inode has already been written, then they
2124 * can be dumped directly onto the work list.
2125 */
2126 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd))) {
2127 LIST_REMOVE(dirrem, dm_next);
2128 dirrem->dm_dirinum = pagedep->pd_ino;
2129 if (inodedep == NULL ||
2130 (inodedep->id_state & ALLCOMPLETE) ==
2131 ALLCOMPLETE)
2132 add_to_worklist(&dirrem->dm_list);
2133 else
2134 WORKLIST_INSERT(&inodedep->id_bufwait,
2135 &dirrem->dm_list);
2136 }
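/*
 * If a newdirblk is tracking this directory block, find it on the
 * inodedep's bufwait list and release it now.
 */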
2137 if ((pagedep->pd_state & NEWBLOCK) != 0) {
2138 LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
2139 if (wk->wk_type == D_NEWDIRBLK &&
2140 WK_NEWDIRBLK(wk)->db_pagedep ==
2141 pagedep)
2142 break;
2143 if (wk != NULL) {
2144 WORKLIST_REMOVE(wk);
2145 free_newdirblk(WK_NEWDIRBLK(wk));
2146 } else {
2147 FREE_LOCK(&lk);
2148 panic("deallocate_dependencies: "
2149 "lost pagedep");
2150 }
2151 }
2152 WORKLIST_REMOVE(&pagedep->pd_list);
2153 LIST_REMOVE(pagedep, pd_hash);
2154 WORKITEM_FREE(pagedep, D_PAGEDEP);
2155 continue;
2156
2157 case D_ALLOCINDIR:
2158 free_allocindir(WK_ALLOCINDIR(wk), inodedep);
2159 continue;
2160
2161 case D_ALLOCDIRECT:
2162 case D_INODEDEP:
2163 FREE_LOCK(&lk);
2164 panic("deallocate_dependencies: Unexpected type %s",
2165 TYPENAME(wk->wk_type));
2166 /* NOTREACHED */
2167
2168 default:
2169 FREE_LOCK(&lk);
2170 panic("deallocate_dependencies: Unknown type %s",
2171 TYPENAME(wk->wk_type));
2172 /* NOTREACHED */
2173 }
2174 }
2175 }
2176
2177 /*
2178 * Free an allocdirect. Generate a new freefrag work request if appropriate.
2179 * This routine must be called with splbio interrupts blocked.
2180 */
2181 STATIC void
2182 free_allocdirect(adphead, adp, delay)
2183 struct allocdirectlst *adphead;
2184 struct allocdirect *adp;
2185 int delay;
2186 {
2187 struct newdirblk *newdirblk;
2188 struct worklist *wk;
2189
2190 splassert(IPL_BIO);
2191
2192 #ifdef DEBUG
2193 if (lk.lkt_held == -1)
2194 panic("free_allocdirect: lock not held");
2195 #endif
2196 if ((adp->ad_state & DEPCOMPLETE) == 0)
2197 LIST_REMOVE(adp, ad_deps);
2198 TAILQ_REMOVE(adphead, adp, ad_next);
2199 if ((adp->ad_state & COMPLETE) == 0)
2200 WORKLIST_REMOVE(&adp->ad_list);
2201 if (adp->ad_freefrag != NULL) {
2202 if (delay)
2203 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2204 &adp->ad_freefrag->ff_list);
2205 else
2206 add_to_worklist(&adp->ad_freefrag->ff_list);
2207 }
2208 if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
2209 newdirblk = WK_NEWDIRBLK(wk);
2210 WORKLIST_REMOVE(&newdirblk->db_list);
2211 if (LIST_FIRST(&adp->ad_newdirblk) != NULL)
2212 panic("free_allocdirect: extra newdirblk");
2213 if (delay)
2214 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2215 &newdirblk->db_list);
2216 else
2217 free_newdirblk(newdirblk);
2218 }
2219 WORKITEM_FREE(adp, D_ALLOCDIRECT);
2220 }
2221
2222 /*
2223 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
2224 * This routine must be called with splbio interrupts blocked.
2225 */
2226 void
2227 free_newdirblk(newdirblk)
2228 struct newdirblk *newdirblk;
2229 {
2230 struct pagedep *pagedep;
2231 struct diradd *dap;
2232 int i;
2233
2234 splassert(IPL_BIO);
2235
2236 #ifdef DEBUG
2237 if (lk.lkt_held == -1)
2238 panic("free_newdirblk: lock not held");
2239 #endif
2240 /*
2241 * If the pagedep is still linked onto the directory buffer
2242 * dependency chain, then some of the entries on the
2243 * pd_pendinghd list may not be committed to disk yet. In
2244 * this case, we will simply clear the NEWBLOCK flag and
2245 * let the pd_pendinghd list be processed when the pagedep
2246 * is next written. If the pagedep is no longer on the buffer
2247 * dependency chain, then all the entries on the pd_pendinghd
2248 * list are committed to disk and we can free them here.
2249 */
2250 pagedep = newdirblk->db_pagedep;
2251 pagedep->pd_state &= ~NEWBLOCK;
2252 if ((pagedep->pd_state & ONWORKLIST) == 0)
2253 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
2254 free_diradd(dap);
2255 /*
2256 * If no dependencies remain, the pagedep will be freed.
2257 */
2258 for (i = 0; i < DAHASHSZ; i++)
2259 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
2260 break;
2261 if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
2262 LIST_REMOVE(pagedep, pd_hash);
2263 WORKITEM_FREE(pagedep, D_PAGEDEP);
2264 }
2265 WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2266 }
2267
2268 /*
2269 * Prepare an inode to be freed. The actual free operation is not
2270 * done until the zero'ed inode has been written to disk.
2271 */
2272 void
2273 softdep_freefile(pvp, ino, mode)
2274 struct vnode *pvp;
2275 ino_t ino;
2276 mode_t mode;
2277 {
2278 struct inode *ip = VTOI(pvp);
2279 struct inodedep *inodedep;
2280 struct freefile *freefile;
2281
2282 /*
2283 * This sets up the inode de-allocation dependency.
2284 */
2285 freefile = pool_get(&freefile_pool, PR_WAITOK);
2286 freefile->fx_list.wk_type = D_FREEFILE;
2287 freefile->fx_list.wk_state = 0;
2288 freefile->fx_mode = mode;
2289 freefile->fx_oldinum = ino;
2290 freefile->fx_devvp = ip->i_devvp;
2291 freefile->fx_mnt = ITOV(ip)->v_mount;
2292
2293 /*
2294 * If the inodedep does not exist, then the zero'ed inode has
2295 * been written to disk. If the allocated inode has never been
2296 * written to disk, then the on-disk inode is zero'ed. In either
2297 * case we can free the file immediately.
2298 */
2299 ACQUIRE_LOCK(&lk);
2300 if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
2301 check_inode_unwritten(inodedep)) {
2302 FREE_LOCK(&lk);
2303 handle_workitem_freefile(freefile);
2304 return;
2305 }
2306 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2307 FREE_LOCK(&lk);
2308 }
2309
2310 /*
2311 * Check to see if an inode has never been written to disk. If
2312 * so, free the inodedep and return success; otherwise return failure.
2313 * This routine must be called with splbio interrupts blocked.
2314 *
2315 * If we still have a bitmap dependency, then the inode has never
2316 * been written to disk. Drop the dependency as it is no longer
2317 * necessary since the inode is being deallocated. We set the
2318 * ALLCOMPLETE flags since the bitmap now properly shows that the
2319 * inode is not allocated. Even if the inode is actively being
2320 * written, it has been rolled back to its zero'ed state, so we
2321 * are ensured that a zero inode is what is on the disk. For short
2322 * lived files, this change will usually result in removing all the
2323 * dependencies from the inode so that it can be freed immediately.
2324 */
2325 STATIC int
2326 check_inode_unwritten(inodedep)
2327 struct inodedep *inodedep;
2328 {
2329 splassert(IPL_BIO);
2330
2331 if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2332 LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2333 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2334 LIST_FIRST(&inodedep->id_inowait) != NULL ||
2335 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2336 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2337 inodedep->id_nlinkdelta != 0)
2338 return (0);
2339 inodedep->id_state |= ALLCOMPLETE;
2340 LIST_REMOVE(inodedep, id_deps);
2341 inodedep->id_buf = NULL;
2342 if (inodedep->id_state & ONWORKLIST)
2343 WORKLIST_REMOVE(&inodedep->id_list);
2344 if (inodedep->id_savedino1 != NULL) {
2345 FREE(inodedep->id_savedino1, M_INODEDEP);
2346 inodedep->id_savedino1 = NULL;
2347 }
2348 if (free_inodedep(inodedep) == 0) {
2349 FREE_LOCK(&lk);
2350 panic("check_inode_unwritten: busy inode");
2351 }
2352 return (1);
2353 }
2354
2355 /*
2356 * Try to free an inodedep structure. Return 1 if it could be freed.
2357 */
2358 STATIC int
2359 free_inodedep(inodedep)
2360 struct inodedep *inodedep;
2361 {
2362
2363 if ((inodedep->id_state & ONWORKLIST) != 0 ||
2364 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2365 LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2366 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2367 LIST_FIRST(&inodedep->id_inowait) != NULL ||
2368 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2369 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2370 inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
2371 return (0);
2372 LIST_REMOVE(inodedep, id_hash);
2373 WORKITEM_FREE(inodedep, D_INODEDEP);
2374 num_inodedep -= 1;
2375 return (1);
2376 }
2377
2378 /*
2379 * This workitem routine performs the block de-allocation.
2380 * The workitem is added to the pending list after the updated
2381 * inode block has been written to disk. As mentioned above,
2382 * checks regarding the number of blocks de-allocated (compared
2383 * to the number of blocks allocated for the file) are also
2384 * performed in this function.
2385 */
2386 STATIC void
2387 handle_workitem_freeblocks(freeblks)
2388 struct freeblks *freeblks;
2389 {
2390 struct inode tip;
2391 daddr_t bn;
2392 union {
2393 struct ufs1_dinode di1;
2394 struct ufs2_dinode di2;
2395 } di;
2396 struct fs *fs;
2397 int i, level, bsize;
2398 long nblocks, blocksreleased = 0;
2399 int error, allerror = 0;
2400 daddr64_t baselbns[NIADDR], tmpval;
2401
2402 if (VFSTOUFS(freeblks->fb_mnt)->um_fstype == UM_UFS1)
2403 tip.i_din1 = &di.di1;
2404 else
2405 tip.i_din2 = &di.di2;
2406
2407 tip.i_fs = fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
2408 tip.i_number = freeblks->fb_previousinum;
2409 tip.i_ump = VFSTOUFS(freeblks->fb_mnt);
2410 tip.i_dev = freeblks->fb_devvp->v_rdev;
2411 DIP_ASSIGN(&tip, size, freeblks->fb_oldsize);
2412 DIP_ASSIGN(&tip, uid, freeblks->fb_uid);
2413 tip.i_vnode = NULL;
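/*
 * baselbns[level] is the first logical block number mapped through
 * the indirect block at that level: NDADDR, then NDADDR + NINDIR(fs),
 * and so on.
 */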
2414 tmpval = 1;
2415 baselbns[0] = NDADDR;
2416 for (i = 1; i < NIADDR; i++) {
2417 tmpval *= NINDIR(fs);
2418 baselbns[i] = baselbns[i - 1] + tmpval;
2419 }
2420 nblocks = btodb(fs->fs_bsize);
2421 blocksreleased = 0;
2422 /*
2423 * Indirect blocks first.
2424 */
2425 for (level = (NIADDR - 1); level >= 0; level--) {
2426 if ((bn = freeblks->fb_iblks[level]) == 0)
2427 continue;
2428 if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
2429 baselbns[level], &blocksreleased)) != 0)
2430 allerror = error;
2431 ffs_blkfree(&tip, bn, fs->fs_bsize);
2432 blocksreleased += nblocks;
2433 }
2434 /*
2435 * All direct blocks or frags.
2436 */
2437 for (i = (NDADDR - 1); i >= 0; i--) {
2438 if ((bn = freeblks->fb_dblks[i]) == 0)
2439 continue;
2440 bsize = blksize(fs, &tip, i);
2441 ffs_blkfree(&tip, bn, bsize);
2442 blocksreleased += btodb(bsize);
2443 }
2444
2445 #ifdef DIAGNOSTIC
2446 if (freeblks->fb_chkcnt != blocksreleased)
2447 printf("handle_workitem_freeblocks: block count\n");
2448 if (allerror)
2449 softdep_error("handle_workitem_freeblks", allerror);
2450 #endif /* DIAGNOSTIC */
2451 WORKITEM_FREE(freeblks, D_FREEBLKS);
2452 }
2453
2454 /*
2455 * Release blocks associated with the inode ip and stored in the indirect
2456 * block dbn. If level is greater than SINGLE, the block is an indirect block
2457 * and recursive calls to indir_trunc must be used to cleanse other indirect
2458 * blocks.
2459 */
2460 STATIC int
2461 indir_trunc(ip, dbn, level, lbn, countp)
2462 struct inode *ip;
2463 daddr_t dbn;
2464 int level;
2465 daddr64_t lbn;
2466 long *countp;
2467 {
2468 struct buf *bp;
2469 int32_t *bap1 = NULL;
2470 int64_t nb, *bap2 = NULL;
2471 struct fs *fs;
2472 struct worklist *wk;
2473 struct indirdep *indirdep;
2474 int i, lbnadd, nblocks, ufs1fmt;
2475 int error, allerror = 0;
2476
2477 fs = ip->i_fs;
2478 lbnadd = 1;
2479 for (i = level; i > 0; i--)
2480 lbnadd *= NINDIR(fs);
2481 /*
2482 * Get buffer of block pointers to be freed. This routine is not
2483 * called until the zero'ed inode has been written, so it is safe
2484 * to free blocks as they are encountered. Because the inode has
2485 * been zero'ed, calls to bmap on these blocks will fail. So, we
2486 * have to use the on-disk address and the block device for the
2487 * filesystem to look them up. If the file was deleted before its
2488 * indirect blocks were all written to disk, the routine that set
2489 * us up (deallocate_dependencies) will have arranged to leave
2490 * a complete copy of the indirect block in memory for our use.
2491 * Otherwise we have to read the blocks in from the disk.
2492 */
2493 ACQUIRE_LOCK(&lk);
2494 if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2495 (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2496 if (wk->wk_type != D_INDIRDEP ||
2497 (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2498 (indirdep->ir_state & GOINGAWAY) == 0) {
2499 FREE_LOCK(&lk);
2500 panic("indir_trunc: lost indirdep");
2501 }
2502 WORKLIST_REMOVE(wk);
2503 WORKITEM_FREE(indirdep, D_INDIRDEP);
2504 if (LIST_FIRST(&bp->b_dep) != NULL) {
2505 FREE_LOCK(&lk);
2506 panic("indir_trunc: dangling dep");
2507 }
2508 FREE_LOCK(&lk);
2509 } else {
2510 FREE_LOCK(&lk);
2511 error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
2512 if (error)
2513 return (error);
2514 }
2515 /*
2516 * Recursively free indirect blocks.
2517 */
2518 if (ip->i_ump->um_fstype == UM_UFS1) {
2519 ufs1fmt = 1;
2520 bap1 = (int32_t *)bp->b_data;
2521 } else {
2522 ufs1fmt = 0;
2523 bap2 = (int64_t *)bp->b_data;
2524 }
2525 nblocks = btodb(fs->fs_bsize);
2526 for (i = NINDIR(fs) - 1; i >= 0; i--) {
2527 if (ufs1fmt)
2528 nb = bap1[i];
2529 else
2530 nb = bap2[i];
2531 if (nb == 0)
2532 continue;
2533 if (level != 0) {
2534 if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2535 level - 1, lbn + (i * lbnadd), countp)) != 0)
2536 allerror = error;
2537 }
2538 ffs_blkfree(ip, nb, fs->fs_bsize);
2539 *countp += nblocks;
2540 }
2541 bp->b_flags |= B_INVAL | B_NOCACHE;
2542 brelse(bp);
2543 return (allerror);
2544 }
2545
2546 /*
2547 * Free an allocindir.
2548 * This routine must be called with splbio interrupts blocked.
2549 */
2550 STATIC void
2551 free_allocindir(aip, inodedep)
2552 struct allocindir *aip;
2553 struct inodedep *inodedep;
2554 {
2555 struct freefrag *freefrag;
2556
2557 splassert(IPL_BIO);
2558
2559 #ifdef DEBUG
2560 if (lk.lkt_held == -1)
2561 panic("free_allocindir: lock not held");
2562 #endif
2563 if ((aip->ai_state & DEPCOMPLETE) == 0)
2564 LIST_REMOVE(aip, ai_deps);
2565 if (aip->ai_state & ONWORKLIST)
2566 WORKLIST_REMOVE(&aip->ai_list);
2567 LIST_REMOVE(aip, ai_next);
2568 if ((freefrag = aip->ai_freefrag) != NULL) {
2569 if (inodedep == NULL)
2570 add_to_worklist(&freefrag->ff_list);
2571 else
2572 WORKLIST_INSERT(&inodedep->id_bufwait,
2573 &freefrag->ff_list);
2574 }
2575 WORKITEM_FREE(aip, D_ALLOCINDIR);
2576 }
2577
2578 /*
2579 * Directory entry addition dependencies.
2580 *
2581 * When adding a new directory entry, the inode (with its incremented link
2582 * count) must be written to disk before the directory entry's pointer to it.
2583 * Also, if the inode is newly allocated, the corresponding freemap must be
2584 * updated (on disk) before the directory entry's pointer. These requirements
2585 * are met via undo/redo on the directory entry's pointer, which consists
2586 * simply of the inode number.
2587 *
2588 * As directory entries are added and deleted, the free space within a
2589 * directory block can become fragmented. The ufs file system will compact
2590 * a fragmented directory block to make space for a new entry. When this
2591 * occurs, the offsets of previously added entries change. Any "diradd"
2592 * dependency structures corresponding to these entries must be updated with
2593 * the new offsets.
2594 */
2595
2596 /*
2597 * This routine is called after the in-memory inode's link
2598 * count has been incremented, but before the directory entry's
2599 * pointer to the inode has been set.
2600 */
2601 int
2602 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
2603 struct buf *bp; /* buffer containing directory block */
2604 struct inode *dp; /* inode for directory */
2605 off_t diroffset; /* offset of new entry in directory */
2606 long newinum; /* inode referenced by new directory entry */
2607 struct buf *newdirbp; /* non-NULL => contents of new mkdir */
2608 int isnewblk; /* entry is in a newly allocated block */
2609 {
2610 int offset; /* offset of new entry within directory block */
2611 daddr64_t lbn; /* block in directory containing new entry */
2612 struct fs *fs;
2613 struct diradd *dap;
2614 struct allocdirect *adp;
2615 struct pagedep *pagedep;
2616 struct inodedep *inodedep;
2617 struct newdirblk *newdirblk = NULL;
2618 struct mkdir *mkdir1, *mkdir2;
2619
2621 fs = dp->i_fs;
2622 lbn = lblkno(fs, diroffset);
2623 offset = blkoff(fs, diroffset);
2624 dap = pool_get(&diradd_pool, PR_WAITOK);
2625 bzero(dap, sizeof(struct diradd));
2626 dap->da_list.wk_type = D_DIRADD;
2627 dap->da_offset = offset;
2628 dap->da_newinum = newinum;
2629 dap->da_state = ATTACHED;
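/*
 * A newdirblk is needed only when this entry starts a new directory
 * fragment in the direct block area; it lets the new block be tracked
 * by its allocdirect below.
 */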
2630 if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
2631 newdirblk = pool_get(&newdirblk_pool, PR_WAITOK);
2632 newdirblk->db_list.wk_type = D_NEWDIRBLK;
2633 newdirblk->db_state = 0;
2634 }
2635 if (newdirbp == NULL) {
2636 dap->da_state |= DEPCOMPLETE;
2637 ACQUIRE_LOCK(&lk);
2638 } else {
2639 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2640 mkdir1 = pool_get(&mkdir_pool, PR_WAITOK);
2641 mkdir1->md_list.wk_type = D_MKDIR;
2642 mkdir1->md_state = MKDIR_BODY;
2643 mkdir1->md_diradd = dap;
2644 mkdir2 = pool_get(&mkdir_pool, PR_WAITOK);
2645 mkdir2->md_list.wk_type = D_MKDIR;
2646 mkdir2->md_state = MKDIR_PARENT;
2647 mkdir2->md_diradd = dap;
2648 /*
2649 * Dependency on "." and ".." being written to disk.
2650 */
2651 mkdir1->md_buf = newdirbp;
2652 ACQUIRE_LOCK(&lk);
2653 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2654 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2655 FREE_LOCK(&lk);
2656 bdwrite(newdirbp);
2657 /*
2658 * Dependency on link count increase for parent directory
2659 */
2660 ACQUIRE_LOCK(&lk);
2661 if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0
2662 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2663 dap->da_state &= ~MKDIR_PARENT;
2664 WORKITEM_FREE(mkdir2, D_MKDIR);
2665 } else {
2666 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2667 WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2668 }
2669 }
2670 /*
2671 * Link into parent directory pagedep to await its being written.
2672 */
2673 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2674 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2675 dap->da_pagedep = pagedep;
2676 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2677 da_pdlist);
2678 /*
2679 * Link into its inodedep. Put it on the id_bufwait list if the inode
2680 * is not yet written. If it is written, do the post-inode write
2681 * processing to put it on the id_pendinghd list.
2682 */
2683 (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2684 if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2685 diradd_inode_written(dap, inodedep);
2686 else
2687 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2688 if (isnewblk) {
2689 /*
2690 * Directories growing into indirect blocks are rare
2691 * enough, and the frequency of new block allocation
2692 * in those cases rarer still, that we choose not
2693 * to bother tracking them. Rather we simply force the
2694 * new directory entry to disk.
2695 */
2696 if (lbn >= NDADDR) {
2697 FREE_LOCK(&lk);
2698 /*
2699 * We only have a new allocation when at the
2700 * beginning of a new block, not when we are
2701 * expanding into an existing block.
2702 */
2703 if (blkoff(fs, diroffset) == 0)
2704 return (1);
2705 return (0);
2706 }
2707 /*
2708 * We only have a new allocation when at the beginning
2709 * of a new fragment, not when we are expanding into an
2710 * existing fragment. Also, there is nothing to do if we
2711 * are already tracking this block.
2712 */
2713 if (fragoff(fs, diroffset) != 0) {
2714 FREE_LOCK(&lk);
2715 return (0);
2716 }
2717
2718 if ((pagedep->pd_state & NEWBLOCK) != 0) {
2719 WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2720 FREE_LOCK(&lk);
2721 return (0);
2722 }
2723 /*
2724 * Find our associated allocdirect and have it track us.
2725 */
2726 if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0)
2727 panic("softdep_setup_directory_add: lost inodedep");
2728 adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
2729 if (adp == NULL || adp->ad_lbn != lbn) {
2730 FREE_LOCK(&lk);
2731 panic("softdep_setup_directory_add: lost entry");
2732 }
2733 pagedep->pd_state |= NEWBLOCK;
2734 newdirblk->db_pagedep = pagedep;
2735 WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
2736 }
2737 FREE_LOCK(&lk);
2738 return (0);
2739 }
2740
2741 /*
2742 * This procedure is called to change the offset of a directory
2743 * entry when compacting a directory block which must be owned
2744 * exclusively by the caller. Note that the actual entry movement
2745 * must be done in this procedure to ensure that no I/O completions
2746 * occur while the move is in progress.
2747 */
2748 void
2749 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2750 struct inode *dp; /* inode for directory */
2751 caddr_t base; /* address of dp->i_offset */
2752 caddr_t oldloc; /* address of old directory location */
2753 caddr_t newloc; /* address of new directory location */
2754 int entrysize; /* size of directory entry */
2755 {
2756 int offset, oldoffset, newoffset;
2757 struct pagedep *pagedep;
2758 struct diradd *dap;
2759 daddr64_t lbn;
2760
2761 ACQUIRE_LOCK(&lk);
2762 lbn = lblkno(dp->i_fs, dp->i_offset);
2763 offset = blkoff(dp->i_fs, dp->i_offset);
2764 if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2765 goto done;
2766 oldoffset = offset + (oldloc - base);
2767 newoffset = offset + (newloc - base);
2768
2769 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
2770 if (dap->da_offset != oldoffset)
2771 continue;
2772 dap->da_offset = newoffset;
2773 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2774 break;
2775 LIST_REMOVE(dap, da_pdlist);
2776 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2777 dap, da_pdlist);
2778 break;
2779 }
2780 if (dap == NULL) {
2782 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
2783 if (dap->da_offset == oldoffset) {
2784 dap->da_offset = newoffset;
2785 break;
2786 }
2787 }
2788 }
2789 done:
2790 bcopy(oldloc, newloc, entrysize);
2791 FREE_LOCK(&lk);
2792 }
2793
2794 /*
2795 * Free a diradd dependency structure. This routine must be called
2796 * with splbio interrupts blocked.
2797 */
2798 STATIC void
2799 free_diradd(dap)
2800 struct diradd *dap;
2801 {
2802 struct dirrem *dirrem;
2803 struct pagedep *pagedep;
2804 struct inodedep *inodedep;
2805 struct mkdir *mkdir, *nextmd;
2806
2807 splassert(IPL_BIO);
2808
2809 #ifdef DEBUG
2810 if (lk.lkt_held == -1)
2811 panic("free_diradd: lock not held");
2812 #endif
2813 WORKLIST_REMOVE(&dap->da_list);
2814 LIST_REMOVE(dap, da_pdlist);
2815 if ((dap->da_state & DIRCHG) == 0) {
2816 pagedep = dap->da_pagedep;
2817 } else {
2818 dirrem = dap->da_previous;
2819 pagedep = dirrem->dm_pagedep;
2820 dirrem->dm_dirinum = pagedep->pd_ino;
2821 add_to_worklist(&dirrem->dm_list);
2822 }
2823 if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2824 0, &inodedep) != 0)
2825 (void) free_inodedep(inodedep);
2826 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2827 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2828 nextmd = LIST_NEXT(mkdir, md_mkdirs);
2829 if (mkdir->md_diradd != dap)
2830 continue;
2831 dap->da_state &= ~mkdir->md_state;
2832 WORKLIST_REMOVE(&mkdir->md_list);
2833 LIST_REMOVE(mkdir, md_mkdirs);
2834 WORKITEM_FREE(mkdir, D_MKDIR);
2835 }
2836 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2837 FREE_LOCK(&lk);
2838 panic("free_diradd: unfound ref");
2839 }
2840 }
2841 WORKITEM_FREE(dap, D_DIRADD);
2842 }
2843
2844 /*
2845 * Directory entry removal dependencies.
2846 *
2847 * When removing a directory entry, the entry's inode pointer must be
2848 * zero'ed on disk before the corresponding inode's link count is decremented
2849 * (possibly freeing the inode for re-use). This dependency is handled by
2850 * updating the directory entry but delaying the inode count reduction until
2851 * after the directory block has been written to disk. After this point, the
2852 * inode count can be decremented whenever it is convenient.
2853 */
2854
2855 /*
2856 * This routine should be called immediately after removing
2857 * a directory entry. The inode's link count should not be
2858 * decremented by the calling procedure -- the soft updates
2859 * code will do this task when it is safe.
2860 */
2861 void
2862 softdep_setup_remove(bp, dp, ip, isrmdir)
2863 struct buf *bp; /* buffer containing directory block */
2864 struct inode *dp; /* inode for the directory being modified */
2865 struct inode *ip; /* inode for directory entry being removed */
2866 int isrmdir; /* indicates if doing RMDIR */
2867 {
2868 struct dirrem *dirrem, *prevdirrem;
2869
2870 /*
2871 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2872 */
2873 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2874
2875 /*
2876 * If the COMPLETE flag is clear, then there were no active
2877 * entries and we want to roll back to a zeroed entry until
2878 * the new inode is committed to disk. If the COMPLETE flag is
2879 * set then we have deleted an entry that never made it to
2880 * disk. If the entry we deleted resulted from a name change,
2881 * then the old name still resides on disk. We cannot delete
2882 * its inode (returned to us in prevdirrem) until the zeroed
2883 * directory entry gets to disk. The new inode has never been
2884 * referenced on the disk, so it can be deleted immediately.
2885 */
2886 if ((dirrem->dm_state & COMPLETE) == 0) {
2887 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2888 dm_next);
2889 FREE_LOCK(&lk);
2890 } else {
2891 if (prevdirrem != NULL)
2892 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
2893 prevdirrem, dm_next);
2894 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2895 FREE_LOCK(&lk);
2896 handle_workitem_remove(dirrem);
2897 }
2898 }
2899
2900 /*
2901 * Allocate a new dirrem if appropriate and return it along with
2902 * its associated pagedep. Called without a lock, returns with lock.
2903 */
2904 STATIC long num_dirrem; /* number of dirrem allocated */
2905 STATIC struct dirrem *
2906 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
2907 struct buf *bp; /* buffer containing directory block */
2908 struct inode *dp; /* inode for the directory being modified */
2909 struct inode *ip; /* inode for directory entry being removed */
2910 int isrmdir; /* indicates if doing RMDIR */
2911 struct dirrem **prevdirremp; /* previously referenced inode, if any */
2912 {
2913 int offset;
2914 daddr64_t lbn;
2915 struct diradd *dap;
2916 struct dirrem *dirrem;
2917 struct pagedep *pagedep;
2918
2919 /*
2920 * Whiteouts have no deletion dependencies.
2921 */
2922 if (ip == NULL)
2923 panic("newdirrem: whiteout");
2924 /*
2925 * If we are over our limit, try to improve the situation.
2926 * Limiting the number of dirrem structures will also limit
2927 * the number of freefile and freeblks structures.
2928 */
2929 if (num_dirrem > max_softdeps / 2)
2930 (void) request_cleanup(FLUSH_REMOVE, 0);
2931 num_dirrem += 1;
2932 dirrem = pool_get(&dirrem_pool, PR_WAITOK);
2933 bzero(dirrem, sizeof(struct dirrem));
2934 dirrem->dm_list.wk_type = D_DIRREM;
2935 dirrem->dm_state = isrmdir ? RMDIR : 0;
2936 dirrem->dm_mnt = ITOV(ip)->v_mount;
2937 dirrem->dm_oldinum = ip->i_number;
2938 *prevdirremp = NULL;
2939
2940 ACQUIRE_LOCK(&lk);
2941 lbn = lblkno(dp->i_fs, dp->i_offset);
2942 offset = blkoff(dp->i_fs, dp->i_offset);
2943 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2944 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2945 dirrem->dm_pagedep = pagedep;
2946 /*
2947 * Check for a diradd dependency for the same directory entry.
2948 * If present, then both dependencies become obsolete and can
2949 * be de-allocated. Check for an entry on both the pd_diraddhd
2950 * list and the pd_pendinghd list.
2951 */
2952
2953 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
2954 if (dap->da_offset == offset)
2955 break;
2956 if (dap == NULL) {
2958 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
2959 if (dap->da_offset == offset)
2960 break;
2961 if (dap == NULL)
2962 return (dirrem);
2963 }
2964 /*
2965 * Must be ATTACHED at this point.
2966 */
2967 if ((dap->da_state & ATTACHED) == 0) {
2968 FREE_LOCK(&lk);
2969 panic("newdirrem: not ATTACHED");
2970 }
2971 if (dap->da_newinum != ip->i_number) {
2972 FREE_LOCK(&lk);
2973 panic("newdirrem: inum %d should be %d",
2974 ip->i_number, dap->da_newinum);
2975 }
2976 /*
2977 * If we are deleting a changed name that never made it to disk,
2978 * then return the dirrem describing the previous inode (which
2979 * represents the inode currently referenced from this entry on disk).
2980 */
2981 if ((dap->da_state & DIRCHG) != 0) {
2982 *prevdirremp = dap->da_previous;
2983 dap->da_state &= ~DIRCHG;
2984 dap->da_pagedep = pagedep;
2985 }
2986 /*
2987 * We are deleting an entry that never made it to disk.
2988 * Mark it COMPLETE so we can delete its inode immediately.
2989 */
2990 dirrem->dm_state |= COMPLETE;
2991 free_diradd(dap);
2992 return (dirrem);
2993 }
2994
2995 /*
2996 * Directory entry change dependencies.
2997 *
2998 * Changing an existing directory entry requires that an add operation
2999 * be completed first followed by a deletion. The semantics for the addition
3000 * are identical to the description of adding a new entry above except
3001 * that the rollback is to the old inode number rather than zero. Once
3002 * the addition dependency is completed, the removal is done as described
3003 * in the removal routine above.
3004 */
3005
3006 /*
3007 * This routine should be called immediately after changing
3008 * a directory entry. The inode's link count should not be
3009 * decremented by the calling procedure -- the soft updates
3010 * code will perform this task when it is safe.
3011 */
3012 void
3013 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
3014 struct buf *bp; /* buffer containing directory block */
3015 struct inode *dp; /* inode for the directory being modified */
3016 struct inode *ip; /* inode for directory entry being removed */
3017 long newinum; /* new inode number for changed entry */
3018 int isrmdir; /* indicates if doing RMDIR */
3019 {
3020 int offset;
3021 struct diradd *dap = NULL;
3022 struct dirrem *dirrem, *prevdirrem;
3023 struct pagedep *pagedep;
3024 struct inodedep *inodedep;
3025
3026 offset = blkoff(dp->i_fs, dp->i_offset);
3027 dap = pool_get(&diradd_pool, PR_WAITOK);
3028 bzero(dap, sizeof(struct diradd));
3029 dap->da_list.wk_type = D_DIRADD;
3030 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
3031 dap->da_offset = offset;
3032 dap->da_newinum = newinum;
3033
3034 /*
3035 * Allocate a new dirrem and ACQUIRE_LOCK.
3036 */
3037 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
3038 pagedep = dirrem->dm_pagedep;
3039 /*
3040 * The possible values for isrmdir:
3041 * 0 - non-directory file rename
3042 * 1 - directory rename within same directory
3043 * inum - directory rename to new directory of given inode number
3044 * When renaming to a new directory, we are both deleting and
3045 * creating a new directory entry, so the link count on the new
3046 * directory should not change. Thus we do not need the followup
3047 * dirrem which is usually done in handle_workitem_remove. We set
3048 * the DIRCHG flag to tell handle_workitem_remove to skip the
3049 * followup dirrem.
3050 */
3051 if (isrmdir > 1)
3052 dirrem->dm_state |= DIRCHG;
3053
3054 /*
3055 * If the COMPLETE flag is clear, then there were no active
3056 * entries and we want to roll back to the previous inode until
3057 * the new inode is committed to disk. If the COMPLETE flag is
3058 * set, then we have deleted an entry that never made it to disk.
3059 * If the entry we deleted resulted from a name change, then the old
3060 * inode reference still resides on disk. Any rollback that we do
3061 * needs to be to that old inode (returned to us in prevdirrem). If
3062 * the entry we deleted resulted from a create, then there is
3063 * no entry on the disk, so we want to roll back to zero rather
3064 * than the uncommitted inode. In either of the COMPLETE cases we
3065 * want to immediately free the unwritten and unreferenced inode.
3066 */
3067 if ((dirrem->dm_state & COMPLETE) == 0) {
3068 dap->da_previous = dirrem;
3069 } else {
3070 if (prevdirrem != NULL) {
3071 dap->da_previous = prevdirrem;
3072 } else {
3073 dap->da_state &= ~DIRCHG;
3074 dap->da_pagedep = pagedep;
3075 }
3076 dirrem->dm_dirinum = pagedep->pd_ino;
3077 add_to_worklist(&dirrem->dm_list);
3078 }
3079 /*
3080 * Link into its inodedep. Put it on the id_bufwait list if the inode
3081 * is not yet written. If it is written, do the post-inode write
3082 * processing to put it on the id_pendinghd list.
3083 */
3084 if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
3085 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
3086 dap->da_state |= COMPLETE;
3087 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3088 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3089 } else {
3090 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
3091 dap, da_pdlist);
3092 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
3093 }
3094 FREE_LOCK(&lk);
3095 }
3096
3097 /*
3098 * Called whenever the link count on an inode is changed.
3099 * It creates an inode dependency so that the new reference(s)
3100 * to the inode cannot be committed to disk until the updated
3101 * inode has been written.
3102 */
3103 void
3104 softdep_change_linkcnt(ip, nodelay)
3105 struct inode *ip; /* the inode with the increased link count */
3106 int nodelay; /* do background work or not */
3107 {
3108 struct inodedep *inodedep;
3109 int flags;
3110
3111 /*
3112 * If requested, do not allow background work to happen.
3113 */
3114 flags = DEPALLOC;
3115 if (nodelay)
3116 flags |= NODELAY;
3117
3118 ACQUIRE_LOCK(&lk);
3119
3120 (void) inodedep_lookup(ip->i_fs, ip->i_number, flags, &inodedep);
3121 if (DIP(ip, nlink) < ip->i_effnlink) {
3122 FREE_LOCK(&lk);
3123 panic("softdep_change_linkcnt: bad delta");
3124 }
3125
3126 inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
3127
3128 FREE_LOCK(&lk);
3129 }
3130
3131 /*
3132 * This workitem decrements the inode's link count.
3133 * If the link count reaches zero, the file is removed.
3134 */
3135 STATIC void
3136 handle_workitem_remove(dirrem)
3137 struct dirrem *dirrem;
3138 {
3139 struct proc *p = CURPROC; /* XXX */
3140 struct inodedep *inodedep;
3141 struct vnode *vp;
3142 struct inode *ip;
3143 ino_t oldinum;
3144 int error;
3145
3146 if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
3147 softdep_error("handle_workitem_remove: vget", error);
3148 return;
3149 }
3150 ip = VTOI(vp);
3151 ACQUIRE_LOCK(&lk);
3152 if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep))
3153 == 0) {
3154 FREE_LOCK(&lk);
3155 panic("handle_workitem_remove: lost inodedep");
3156 }
3157 /*
3158 * Normal file deletion.
3159 */
3160 if ((dirrem->dm_state & RMDIR) == 0) {
3161 DIP_ADD(ip, nlink, -1);
3162 ip->i_flag |= IN_CHANGE;
3163 if (DIP(ip, nlink) < ip->i_effnlink) {
3164 FREE_LOCK(&lk);
3165 panic("handle_workitem_remove: bad file delta");
3166 }
3167 inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
3168 FREE_LOCK(&lk);
3169 vput(vp);
3170 num_dirrem -= 1;
3171 WORKITEM_FREE(dirrem, D_DIRREM);
3172 return;
3173 }
3174 /*
3175 * Directory deletion. Decrement reference count for both the
3176 * just deleted parent directory entry and the reference for ".".
3177 * Next truncate the directory to length zero. When the
3178 * truncation completes, arrange to have the reference count on
3179 * the parent decremented to account for the loss of "..".
3180 */
3181 DIP_ADD(ip, nlink, -2);
3182 ip->i_flag |= IN_CHANGE;
3183 if (DIP(ip, nlink) < ip->i_effnlink)
3184 panic("handle_workitem_remove: bad dir delta");
3185 inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
3186 FREE_LOCK(&lk);
3187 if ((error = UFS_TRUNCATE(ip, (off_t)0, 0, p->p_ucred)) != 0)
3188 softdep_error("handle_workitem_remove: truncate", error);
3189 /*
3190 * Rename a directory to a new parent. Since we are both deleting
3191 * and creating a new directory entry, the link count on the new
3192 * directory should not change. Thus we skip the followup dirrem.
3193 */
3194 if (dirrem->dm_state & DIRCHG) {
3195 vput(vp);
3196 num_dirrem -= 1;
3197 WORKITEM_FREE(dirrem, D_DIRREM);
3198 return;
3199 }
3200 /*
3201 * If the inodedep does not exist, then the zero'ed inode has
3202 * been written to disk. If the allocated inode has never been
3203 * written to disk, then the on-disk inode is zero'ed. In either
3204 * case we can remove the file immediately.
3205 */
3206 ACQUIRE_LOCK(&lk);
3207 dirrem->dm_state = 0;
3208 oldinum = dirrem->dm_oldinum;
3209 dirrem->dm_oldinum = dirrem->dm_dirinum;
3210 if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
3211 check_inode_unwritten(inodedep)) {
3212 FREE_LOCK(&lk);
3213 vput(vp);
3214 handle_workitem_remove(dirrem);
3215 return;
3216 }
3217 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
3218 FREE_LOCK(&lk);
3219 ip->i_flag |= IN_CHANGE;
3220 UFS_UPDATE(VTOI(vp), 0);
3221 vput(vp);
3222 }
3223
3224 /*
3225 * Inode de-allocation dependencies.
3226 *
3227 * When an inode's link count is reduced to zero, it can be de-allocated. We
3228 * found it convenient to postpone de-allocation until after the inode is
3229 * written to disk with its new link count (zero). At this point, all of the
3230 * on-disk inode's block pointers are nullified and, with careful dependency
3231 * list ordering, all dependencies related to the inode will be satisfied and
3232 * the corresponding dependency structures de-allocated. So, if/when the
3233 * inode is reused, there will be no mixing of old dependencies with new
3234 * ones. This artificial dependency is set up by the block de-allocation
3235 * procedure above (softdep_setup_freeblocks) and completed by the
3236 * following procedure.
3237 */
3238 STATIC void
3239 handle_workitem_freefile(freefile)
3240 struct freefile *freefile;
3241 {
3242 struct fs *fs;
3243 struct vnode vp;
3244 struct inode tip;
3245 #ifdef DEBUG
3246 struct inodedep *idp;
3247 #endif
3248 int error;
3249
3250 fs = VFSTOUFS(freefile->fx_mnt)->um_fs;
3251 #ifdef DEBUG
3252 ACQUIRE_LOCK(&lk);
3253 error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
3254 FREE_LOCK(&lk);
3255 if (error)
3256 panic("handle_workitem_freefile: inodedep survived");
3257 #endif
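/*
 * Build a minimal inode and vnode pair so that ffs_freefile() has the
 * device and filesystem context it needs.
 */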
3258 tip.i_ump = VFSTOUFS(freefile->fx_mnt);
3259 tip.i_dev = freefile->fx_devvp->v_rdev;
3260 tip.i_fs = fs;
3261 tip.i_vnode = &vp;
3262 vp.v_data = &tip;
3263
3264 if ((error = ffs_freefile(&tip, freefile->fx_oldinum,
3265 freefile->fx_mode)) != 0) {
3266 softdep_error("handle_workitem_freefile", error);
3267 }
3268 WORKITEM_FREE(freefile, D_FREEFILE);
3269 }
3270
3271 /*
3272 * Disk writes.
3273 *
3274 * The dependency structures constructed above are most actively used when file
3275 * system blocks are written to disk. No constraints are placed on when a
3276 * block can be written, but unsatisfied update dependencies are made safe by
3277 * modifying (or replacing) the source memory for the duration of the disk
3278 * write. When the disk write completes, the memory block is again brought
3279 * up-to-date.
3280 *
3281 * In-core inode structure reclamation.
3282 *
3283 * Because there are a finite number of "in-core" inode structures, they are
3284 * reused regularly. By transferring all inode-related dependencies to the
3285 * in-memory inode block and indexing them separately (via "inodedep"s), we
3286 * can allow "in-core" inode structures to be reused at any time and avoid
3287 * any increase in contention.
3288 *
3289 * Called just before entering the device driver to initiate a new disk I/O.
3290 * The buffer must be locked, thus, no I/O completion operations can occur
3291 * while we are manipulating its associated dependencies.
3292 */
3293 void
3294 softdep_disk_io_initiation(bp)
3295 struct buf *bp; /* structure describing disk write to occur */
3296 {
3297 struct worklist *wk, *nextwk;
3298 struct indirdep *indirdep;
3299 struct inodedep *inodedep;
3300 struct buf *sbp;
3301
3302 /*
3303 * We only care about write operations. There should never
3304 * be dependencies for reads.
3305 */
3306 if (bp->b_flags & B_READ)
3307 panic("softdep_disk_io_initiation: read");
3308
3309 ACQUIRE_LOCK(&lk);
3310
3311 /*
3312 * Do any necessary pre-I/O processing.
3313 */
3314 for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
3315 nextwk = LIST_NEXT(wk, wk_list);
3316 switch (wk->wk_type) {
3317
3318 case D_PAGEDEP:
3319 initiate_write_filepage(WK_PAGEDEP(wk), bp);
3320 continue;
3321
3322 case D_INODEDEP:
3323 inodedep = WK_INODEDEP(wk);
3324 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
3325 initiate_write_inodeblock_ufs1(inodedep, bp);
3326 #ifdef FFS2
3327 else
3328 initiate_write_inodeblock_ufs2(inodedep, bp);
3329 #endif
3330 continue;
3331
3332 case D_INDIRDEP:
3333 indirdep = WK_INDIRDEP(wk);
3334 if (indirdep->ir_state & GOINGAWAY)
3335 panic("disk_io_initiation: indirdep gone");
3336 /*
3337 * If there are no remaining dependencies, this
3338 * will be writing the real pointers, so the
3339 * dependency can be freed.
3340 */
3341 if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
3342 sbp = indirdep->ir_savebp;
3343 sbp->b_flags |= B_INVAL | B_NOCACHE;
3344 /* inline expand WORKLIST_REMOVE(wk); */
3345 wk->wk_state &= ~ONWORKLIST;
3346 LIST_REMOVE(wk, wk_list);
3347 WORKITEM_FREE(indirdep, D_INDIRDEP);
3348 FREE_LOCK(&lk);
3349 brelse(sbp);
3350 ACQUIRE_LOCK(&lk);
3351 continue;
3352 }
3353 /*
3354 * Replace up-to-date version with safe version.
3355 */
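/*
 * The current contents are stashed in ir_saveddata and the indirdep
 * is marked UNDONE; the write completion handling puts the up-to-date
 * copy back after this write finishes.
 */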
3356 FREE_LOCK(&lk);
3357 indirdep->ir_saveddata = malloc(bp->b_bcount,
3358 M_INDIRDEP, M_WAITOK);
3359 ACQUIRE_LOCK(&lk);
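/*
 * Remember the up-to-date pointers in ir_saveddata and write
 * the saved safe copy instead; softdep_disk_write_complete()
 * copies the real pointers back once the write has finished.
 */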
3360 indirdep->ir_state &= ~ATTACHED;
3361 indirdep->ir_state |= UNDONE;
3362 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3363 bcopy(indirdep->ir_savebp->b_data, bp->b_data,
3364 bp->b_bcount);
3365 continue;
3366
3367 case D_MKDIR:
3368 case D_BMSAFEMAP:
3369 case D_ALLOCDIRECT:
3370 case D_ALLOCINDIR:
3371 continue;
3372
3373 default:
3374 FREE_LOCK(&lk);
3375 panic("handle_disk_io_initiation: Unexpected type %s",
3376 TYPENAME(wk->wk_type));
3377 /* NOTREACHED */
3378 }
3379 }
3380
3381 FREE_LOCK(&lk);
3382 }
3383
3384 /*
3385 * Called from within the procedure above to deal with unsatisfied
3386  * allocation dependencies in a directory. The buffer must be locked;
3387 * thus, no I/O completion operations can occur while we are
3388 * manipulating its associated dependencies.
3389 */
3390 STATIC void
3391 initiate_write_filepage(pagedep, bp)
3392 struct pagedep *pagedep;
3393 struct buf *bp;
3394 {
3395 struct diradd *dap;
3396 struct direct *ep;
3397 int i;
3398
3399 if (pagedep->pd_state & IOSTARTED) {
3400 /*
3401 * This can only happen if there is a driver that does not
3402 * understand chaining. Here biodone will reissue the call
3403 * to strategy for the incomplete buffers.
3404 */
3405 printf("initiate_write_filepage: already started\n");
3406 return;
3407 }
3408 pagedep->pd_state |= IOSTARTED;
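/*
 * Roll back every uncommitted entry: a changed entry reverts
 * to its previous inode number, a newly added entry is cleared.
 * handle_written_filepage() reinstalls the new inode numbers
 * after the write completes.
 */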
3409 for (i = 0; i < DAHASHSZ; i++) {
3410 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3411 ep = (struct direct *)
3412 ((char *)bp->b_data + dap->da_offset);
3413 if (ep->d_ino != dap->da_newinum) {
3414 FREE_LOCK(&lk);
3415 panic("%s: dir inum %d != new %d",
3416 "initiate_write_filepage",
3417 ep->d_ino, dap->da_newinum);
3418 }
3419 if (dap->da_state & DIRCHG)
3420 ep->d_ino = dap->da_previous->dm_oldinum;
3421 else
3422 ep->d_ino = 0;
3423 dap->da_state &= ~ATTACHED;
3424 dap->da_state |= UNDONE;
3425 }
3426 }
3427 }
3428
3429 /*
3430 * Called from within the procedure above to deal with unsatisfied
3431 * allocation dependencies in an inodeblock. The buffer must be
3432  * locked; thus, no I/O completion operations can occur while we
3433 * are manipulating its associated dependencies.
3434 */
3435 STATIC void
3436 initiate_write_inodeblock_ufs1(inodedep, bp)
3437 struct inodedep *inodedep;
3438 struct buf *bp; /* The inode block */
3439 {
3440 struct allocdirect *adp, *lastadp;
3441 struct ufs1_dinode *dp;
3442 struct fs *fs;
3443 #ifdef DIAGNOSTIC
3444 daddr64_t prevlbn = 0;
3445 int32_t d1, d2;
3446 #endif
3447 int i, deplist;
3448
3449 if (inodedep->id_state & IOSTARTED) {
3450 FREE_LOCK(&lk);
3451 panic("initiate_write_inodeblock: already started");
3452 }
3453 inodedep->id_state |= IOSTARTED;
3454 fs = inodedep->id_fs;
3455 dp = (struct ufs1_dinode *)bp->b_data +
3456 ino_to_fsbo(fs, inodedep->id_ino);
3457 /*
3458 * If the bitmap is not yet written, then the allocated
3459 * inode cannot be written to disk.
3460 */
3461 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3462 if (inodedep->id_savedino1 != NULL) {
3463 FREE_LOCK(&lk);
3464 panic("initiate_write_inodeblock: already doing I/O");
3465 }
3466 FREE_LOCK(&lk);
3467 MALLOC(inodedep->id_savedino1, struct ufs1_dinode *,
3468 sizeof(struct ufs1_dinode), M_INODEDEP, M_WAITOK);
3469 ACQUIRE_LOCK(&lk);
3470 *inodedep->id_savedino1 = *dp;
3471 bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
3472 return;
3473 }
3474 /*
3475 * If no dependencies, then there is nothing to roll back.
3476 */
3477 inodedep->id_savedsize = dp->di_size;
3478 if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3479 return;
3480 /*
3481 * Set the dependencies to busy.
3482 */
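/*
 * deplist is consulted only under DIAGNOSTIC: it is a bitmap,
 * indexed by logical block number, of the pointers that have
 * pending dependencies, letting the rollback code below assert
 * that every pointer it clears is covered by a dependency.
 */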
3483 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3484 adp = TAILQ_NEXT(adp, ad_next)) {
3485 #ifdef DIAGNOSTIC
3486 if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3487 FREE_LOCK(&lk);
3488 panic("softdep_write_inodeblock: lbn order");
3489 }
3490 prevlbn = adp->ad_lbn;
3491 if (adp->ad_lbn < NDADDR &&
3492 (d1 = dp->di_db[adp->ad_lbn]) != (d2 = adp->ad_newblkno)) {
3493 FREE_LOCK(&lk);
3494 panic("%s: direct pointer #%ld mismatch %d != %d",
3495 "softdep_write_inodeblock", adp->ad_lbn, d1, d2);
3496 }
3497 if (adp->ad_lbn >= NDADDR &&
3498 (d1 = dp->di_ib[adp->ad_lbn - NDADDR]) !=
3499 (d2 = adp->ad_newblkno)) {
3500 FREE_LOCK(&lk);
3501 panic("%s: indirect pointer #%ld mismatch %d != %d",
3502 "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
3503 d1, d2);
3504 }
3505 deplist |= 1 << adp->ad_lbn;
3506 if ((adp->ad_state & ATTACHED) == 0) {
3507 FREE_LOCK(&lk);
3508 panic("softdep_write_inodeblock: Unknown state 0x%x",
3509 adp->ad_state);
3510 }
3511 #endif /* DIAGNOSTIC */
3512 adp->ad_state &= ~ATTACHED;
3513 adp->ad_state |= UNDONE;
3514 }
3515 /*
3516 * The on-disk inode cannot claim to be any larger than the last
3517 * fragment that has been written. Otherwise, the on-disk inode
3518  * might have fragments that were not the last block in the file,
3519 * which would corrupt the filesystem.
3520 */
3521 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3522 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3523 if (adp->ad_lbn >= NDADDR)
3524 break;
3525 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3526 /* keep going until hitting a rollback to a frag */
3527 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3528 continue;
3529 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3530 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3531 #ifdef DIAGNOSTIC
3532 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3533 FREE_LOCK(&lk);
3534 panic("softdep_write_inodeblock: lost dep1");
3535 }
3536 #endif /* DIAGNOSTIC */
3537 dp->di_db[i] = 0;
3538 }
3539 for (i = 0; i < NIADDR; i++) {
3540 #ifdef DIAGNOSTIC
3541 if (dp->di_ib[i] != 0 &&
3542 (deplist & ((1 << NDADDR) << i)) == 0) {
3543 FREE_LOCK(&lk);
3544 panic("softdep_write_inodeblock: lost dep2");
3545 }
3546 #endif /* DIAGNOSTIC */
3547 dp->di_ib[i] = 0;
3548 }
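/*
 * The size has been rolled back to the fragment and all later
 * pointers have been cleared, so no further rollback is needed.
 */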
3549 return;
3550 }
3551 /*
3552 * If we have zero'ed out the last allocated block of the file,
3553 * roll back the size to the last currently allocated block.
3554  * We know that this last allocated block is full-sized, as
3555 * we already checked for fragments in the loop above.
3556 */
3557 if (lastadp != NULL &&
3558 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3559 for (i = lastadp->ad_lbn; i >= 0; i--)
3560 if (dp->di_db[i] != 0)
3561 break;
3562 dp->di_size = (i + 1) * fs->fs_bsize;
3563 }
3564 /*
3565 * The only dependencies are for indirect blocks.
3566 *
3567 * The file size for indirect block additions is not guaranteed.
3568 * Such a guarantee would be non-trivial to achieve. The conventional
3569 * synchronous write implementation also does not make this guarantee.
3570 * Fsck should catch and fix discrepancies. Arguably, the file size
3571 * can be over-estimated without destroying integrity when the file
3572 * moves into the indirect blocks (i.e., is large). If we want to
3573 * postpone fsck, we are stuck with this argument.
3574 */
3575 for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3576 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3577 }
3578
3579 #ifdef FFS2
3580 /*
3581 * Version of initiate_write_inodeblock that handles FFS2 dinodes.
3582 */
3583 STATIC void
3584 initiate_write_inodeblock_ufs2(inodedep, bp)
3585 struct inodedep *inodedep;
3586 struct buf *bp; /* The inode block */
3587 {
3588 struct allocdirect *adp, *lastadp;
3589 struct ufs2_dinode *dp;
3590 struct fs *fs = inodedep->id_fs;
3591 #ifdef DIAGNOSTIC
3592 daddr64_t prevlbn = -1, d1, d2;
3593 #endif
3594 int deplist, i;
3595
3596 if (inodedep->id_state & IOSTARTED)
3597 panic("initiate_write_inodeblock_ufs2: already started");
3598 inodedep->id_state |= IOSTARTED;
3599 fs = inodedep->id_fs;
3600 dp = (struct ufs2_dinode *)bp->b_data +
3601 ino_to_fsbo(fs, inodedep->id_ino);
3602 /*
3603 * If the bitmap is not yet written, then the allocated
3604 * inode cannot be written to disk.
3605 */
3606 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3607 if (inodedep->id_savedino2 != NULL)
3608 panic("initiate_write_inodeblock_ufs2: I/O underway");
3609 MALLOC(inodedep->id_savedino2, struct ufs2_dinode *,
3610 sizeof(struct ufs2_dinode), M_INODEDEP, M_WAITOK);
3611 *inodedep->id_savedino2 = *dp;
3612 bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
3613 return;
3614 }
3615 /*
3616 * If no dependencies, then there is nothing to roll back.
3617 */
3618 inodedep->id_savedsize = dp->di_size;
3619 if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3620 return;
3621
3622 #ifdef notyet
3623 inodedep->id_savedextsize = dp->di_extsize;
3624 if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL &&
3625 TAILQ_FIRST(&inodedep->id_extupdt) == NULL)
3626 return;
3627 /*
3628 * Set the ext data dependencies to busy.
3629 */
3630 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
3631 adp = TAILQ_NEXT(adp, ad_next)) {
3632 #ifdef DIAGNOSTIC
3633 if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3634 FREE_LOCK(&lk);
3635 panic("softdep_write_inodeblock: lbn order");
3636 }
3637 prevlbn = adp->ad_lbn;
3638 if ((d1 = dp->di_extb[adp->ad_lbn]) !=
3639 (d2 = adp->ad_newblkno)) {
3640 FREE_LOCK(&lk);
3641 panic("%s: direct pointer #%ld mismatch %ld != %ld",
3642 "softdep_write_inodeblock", adp->ad_lbn, d1, d2);
3643 }
3644 deplist |= 1 << adp->ad_lbn;
3645 if ((adp->ad_state & ATTACHED) == 0) {
3646 FREE_LOCK(&lk);
3647 panic("softdep_write_inodeblock: Unknown state 0x%x",
3648 adp->ad_state);
3649 }
3650 #endif /* DIAGNOSTIC */
3651 adp->ad_state &= ~ATTACHED;
3652 adp->ad_state |= UNDONE;
3653 }
3654 /*
3655 * The on-disk inode cannot claim to be any larger than the last
3656 * fragment that has been written. Otherwise, the on-disk inode
3657 * might have fragments that were not the last block in the ext
3658  * data, which would corrupt the filesystem.
3659 */
3660 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
3661 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3662 dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
3663 /* keep going until hitting a rollback to a frag */
3664 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3665 continue;
3666 dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3667 for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
3668 #ifdef DIAGNOSTIC
3669 if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) {
3670 FREE_LOCK(&lk);
3671 panic("softdep_write_inodeblock: lost dep1");
3672 }
3673 #endif /* DIAGNOSTIC */
3674 dp->di_extb[i] = 0;
3675 }
3676 lastadp = NULL;
3677 break;
3678 }
3679 /*
3680 * If we have zero'ed out the last allocated block of the ext
3681 * data, roll back the size to the last currently allocated block.
3682  * We know that this last allocated block is full-sized, as
3683 * we already checked for fragments in the loop above.
3684 */
3685 if (lastadp != NULL &&
3686 dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3687 for (i = lastadp->ad_lbn; i >= 0; i--)
3688 if (dp->di_extb[i] != 0)
3689 break;
3690 dp->di_extsize = (i + 1) * fs->fs_bsize;
3691 }
3692 #endif /* notyet */
3693
3694 /*
3695 * Set the file data dependencies to busy.
3696 */
3697 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3698 adp = TAILQ_NEXT(adp, ad_next)) {
3699 #ifdef DIAGNOSTIC
3700 if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3701 FREE_LOCK(&lk);
3702 panic("softdep_write_inodeblock: lbn order");
3703 }
3704 prevlbn = adp->ad_lbn;
3705 if (adp->ad_lbn < NDADDR &&
3706 (d1 = dp->di_db[adp->ad_lbn]) != (d2 = adp->ad_newblkno)) {
3707 FREE_LOCK(&lk);
3708 panic("%s: direct pointer #%ld mismatch %ld != %ld",
3709 "softdep_write_inodeblock", adp->ad_lbn, d1, d2);
3710 }
3711 if (adp->ad_lbn >= NDADDR &&
3712 (d1 = dp->di_ib[adp->ad_lbn - NDADDR]) !=
3713 (d2 = adp->ad_newblkno)) {
3714 FREE_LOCK(&lk);
3715 panic("%s: indirect pointer #%ld mismatch %ld != %ld",
3716 "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
3717 d1, d2);
3718 }
3719 deplist |= 1 << adp->ad_lbn;
3720 if ((adp->ad_state & ATTACHED) == 0) {
3721 FREE_LOCK(&lk);
3722 panic("softdep_write_inodeblock: Unknown state 0x%x",
3723 adp->ad_state);
3724 }
3725 #endif /* DIAGNOSTIC */
3726 adp->ad_state &= ~ATTACHED;
3727 adp->ad_state |= UNDONE;
3728 }
3729 /*
3730 * The on-disk inode cannot claim to be any larger than the last
3731 * fragment that has been written. Otherwise, the on-disk inode
3732  * might have fragments that were not the last block in the file,
3733 * which would corrupt the filesystem.
3734 */
3735 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3736 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3737 if (adp->ad_lbn >= NDADDR)
3738 break;
3739 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3740 /* keep going until hitting a rollback to a frag */
3741 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3742 continue;
3743 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3744 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3745 #ifdef DIAGNOSTIC
3746 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3747 FREE_LOCK(&lk);
3748 panic("softdep_write_inodeblock: lost dep2");
3749 }
3750 #endif /* DIAGNOSTIC */
3751 dp->di_db[i] = 0;
3752 }
3753 for (i = 0; i < NIADDR; i++) {
3754 #ifdef DIAGNOSTIC
3755 if (dp->di_ib[i] != 0 &&
3756 (deplist & ((1 << NDADDR) << i)) == 0) {
3757 FREE_LOCK(&lk);
3758 panic("softdep_write_inodeblock: lost dep3");
3759 }
3760 #endif /* DIAGNOSTIC */
3761 dp->di_ib[i] = 0;
3762 }
3763 return;
3764 }
3765 /*
3766 * If we have zero'ed out the last allocated block of the file,
3767 * roll back the size to the last currently allocated block.
3768  * We know that this last allocated block is full-sized, as
3769 * we already checked for fragments in the loop above.
3770 */
3771 if (lastadp != NULL &&
3772 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3773 for (i = lastadp->ad_lbn; i >= 0; i--)
3774 if (dp->di_db[i] != 0)
3775 break;
3776 dp->di_size = (i + 1) * fs->fs_bsize;
3777 }
3778 /*
3779 * The only dependencies are for indirect blocks.
3780 *
3781 * The file size for indirect block additions is not guaranteed.
3782 * Such a guarantee would be non-trivial to achieve. The conventional
3783 * synchronous write implementation also does not make this guarantee.
3784 * Fsck should catch and fix discrepancies. Arguably, the file size
3785 * can be over-estimated without destroying integrity when the file
3786 * moves into the indirect blocks (i.e., is large). If we want to
3787 * postpone fsck, we are stuck with this argument.
3788 */
3789 for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3790 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3791 }
3792 #endif /* FFS2 */
3793
3794 /*
3795 * This routine is called during the completion interrupt
3796 * service routine for a disk write (from the procedure called
3797 * by the device driver to inform the file system caches of
3798 * a request completion). It should be called early in this
3799 * procedure, before the block is made available to other
3800 * processes or other routines are called.
3801 */
3802 void
3803 softdep_disk_write_complete(bp)
3804 struct buf *bp; /* describes the completed disk write */
3805 {
3806 struct worklist *wk;
3807 struct workhead reattach;
3808 struct newblk *newblk;
3809 struct allocindir *aip;
3810 struct allocdirect *adp;
3811 struct indirdep *indirdep;
3812 struct inodedep *inodedep;
3813 struct bmsafemap *bmsafemap;
3814
3815 /*
3816 * If an error occurred while doing the write, then the data
3817 * has not hit the disk and the dependencies cannot be unrolled.
3818 */
3819 if ((bp->b_flags & B_ERROR) && !(bp->b_flags & B_INVAL))
3820 return;
3821
3822 #ifdef DEBUG
3823 if (lk.lkt_held != -1)
3824 panic("softdep_disk_write_complete: lock is held");
3825 lk.lkt_held = -2;
3826 #endif
3827 LIST_INIT(&reattach);
3828 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3829 WORKLIST_REMOVE(wk);
3830 switch (wk->wk_type) {
3831
3832 case D_PAGEDEP:
3833 if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3834 WORKLIST_INSERT(&reattach, wk);
3835 continue;
3836
3837 case D_INODEDEP:
3838 if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3839 WORKLIST_INSERT(&reattach, wk);
3840 continue;
3841
3842 case D_BMSAFEMAP:
3843 bmsafemap = WK_BMSAFEMAP(wk);
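/*
 * The cylinder group bitmap has reached the disk, so every
 * newblk, allocdirect, allocindir and inodedep hanging off this
 * bmsafemap now has its bitmap (DEPCOMPLETE) dependency satisfied.
 */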
3844 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3845 newblk->nb_state |= DEPCOMPLETE;
3846 newblk->nb_bmsafemap = NULL;
3847 LIST_REMOVE(newblk, nb_deps);
3848 }
3849 while ((adp =
3850 LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3851 adp->ad_state |= DEPCOMPLETE;
3852 adp->ad_buf = NULL;
3853 LIST_REMOVE(adp, ad_deps);
3854 handle_allocdirect_partdone(adp);
3855 }
3856 while ((aip =
3857 LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3858 aip->ai_state |= DEPCOMPLETE;
3859 aip->ai_buf = NULL;
3860 LIST_REMOVE(aip, ai_deps);
3861 handle_allocindir_partdone(aip);
3862 }
3863 while ((inodedep =
3864 LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3865 inodedep->id_state |= DEPCOMPLETE;
3866 LIST_REMOVE(inodedep, id_deps);
3867 inodedep->id_buf = NULL;
3868 }
3869 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3870 continue;
3871
3872 case D_MKDIR:
3873 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3874 continue;
3875
3876 case D_ALLOCDIRECT:
3877 adp = WK_ALLOCDIRECT(wk);
3878 adp->ad_state |= COMPLETE;
3879 handle_allocdirect_partdone(adp);
3880 continue;
3881
3882 case D_ALLOCINDIR:
3883 aip = WK_ALLOCINDIR(wk);
3884 aip->ai_state |= COMPLETE;
3885 handle_allocindir_partdone(aip);
3886 continue;
3887
3888 case D_INDIRDEP:
3889 indirdep = WK_INDIRDEP(wk);
3890 if (indirdep->ir_state & GOINGAWAY)
3891 panic("disk_write_complete: indirdep gone");
3892 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3893 free(indirdep->ir_saveddata, M_INDIRDEP);
3894 indirdep->ir_saveddata = 0;
3895 indirdep->ir_state &= ~UNDONE;
3896 indirdep->ir_state |= ATTACHED;
3897 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3898 handle_allocindir_partdone(aip);
3899 if (aip == LIST_FIRST(&indirdep->ir_donehd))
3900 panic("disk_write_complete: not gone");
3901 }
3902 WORKLIST_INSERT(&reattach, wk);
3903 if ((bp->b_flags & B_DELWRI) == 0)
3904 stat_indir_blk_ptrs++;
3905 buf_dirty(bp);
3906 continue;
3907
3908 default:
3909 panic("handle_disk_write_complete: Unknown type %s",
3910 TYPENAME(wk->wk_type));
3911 /* NOTREACHED */
3912 }
3913 }
3914 /*
3915 * Reattach any requests that must be redone.
3916 */
3917 while ((wk = LIST_FIRST(&reattach)) != NULL) {
3918 WORKLIST_REMOVE(wk);
3919 WORKLIST_INSERT(&bp->b_dep, wk);
3920 }
3921 #ifdef DEBUG
3922 if (lk.lkt_held != -2)
3923 panic("softdep_disk_write_complete: lock lost");
3924 lk.lkt_held = -1;
3925 #endif
3926 }
3927
3928 /*
3929 * Called from within softdep_disk_write_complete above. Note that
3930 * this routine is always called from interrupt level with further
3931 * splbio interrupts blocked.
3932 */
3933 STATIC void
3934 handle_allocdirect_partdone(adp)
3935 struct allocdirect *adp; /* the completed allocdirect */
3936 {
3937 struct allocdirect *listadp;
3938 struct inodedep *inodedep;
3939 long bsize, delay;
3940
3941 splassert(IPL_BIO);
3942
3943 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3944 return;
3945 if (adp->ad_buf != NULL)
3946 panic("handle_allocdirect_partdone: dangling dep");
3947
3948 /*
3949 * The on-disk inode cannot claim to be any larger than the last
3950 * fragment that has been written. Otherwise, the on-disk inode
3951  * might have fragments that were not the last block in the file,
3952 * which would corrupt the filesystem. Thus, we cannot free any
3953  * allocdirects after one whose ad_oldblkno claims a fragment, as
3954 * these blocks must be rolled back to zero before writing the inode.
3955 * We check the currently active set of allocdirects in id_inoupdt.
3956 */
3957 inodedep = adp->ad_inodedep;
3958 bsize = inodedep->id_fs->fs_bsize;
3959 TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
3960 /* found our block */
3961 if (listadp == adp)
3962 break;
3963 /* continue if ad_oldlbn is not a fragment */
3964 if (listadp->ad_oldsize == 0 ||
3965 listadp->ad_oldsize == bsize)
3966 continue;
3967 /* hit a fragment */
3968 return;
3969 }
3970 /*
3971 * If we have reached the end of the current list without
3972 * finding the just finished dependency, then it must be
3973 * on the future dependency list. Future dependencies cannot
3974 * be freed until they are moved to the current list.
3975 */
3976 if (listadp == NULL) {
3977 #ifdef DEBUG
3978 TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
3979 /* found our block */
3980 if (listadp == adp)
3981 break;
3982 if (listadp == NULL)
3983 panic("handle_allocdirect_partdone: lost dep");
3984 #endif /* DEBUG */
3985 return;
3986 }
3987 /*
3988 * If we have found the just finished dependency, then free
3989 * it along with anything that follows it that is complete.
3990 * If the inode still has a bitmap dependency, then it has
3991 * never been written to disk, hence the on-disk inode cannot
3992 * reference the old fragment so we can free it without delay.
3993 */
3994 delay = (inodedep->id_state & DEPCOMPLETE);
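/*
 * With DEPCOMPLETE set the bitmap has been written, so the
 * on-disk inode may still reference the old blocks; in that case
 * free_allocdirect() is told to delay freeing them. Otherwise
 * they can be released immediately.
 */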
3995 for (; adp; adp = listadp) {
3996 listadp = TAILQ_NEXT(adp, ad_next);
3997 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3998 return;
3999 free_allocdirect(&inodedep->id_inoupdt, adp, delay);
4000 }
4001 }
4002
4003 /*
4004 * Called from within softdep_disk_write_complete above. Note that
4005 * this routine is always called from interrupt level with further
4006 * splbio interrupts blocked.
4007 */
4008 STATIC void
4009 handle_allocindir_partdone(aip)
4010 struct allocindir *aip; /* the completed allocindir */
4011 {
4012 struct indirdep *indirdep;
4013
4014 splassert(IPL_BIO);
4015
4016 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
4017 return;
4018 if (aip->ai_buf != NULL)
4019 panic("handle_allocindir_partdone: dangling dependency");
4020 indirdep = aip->ai_indirdep;
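/*
 * If the indirect block is currently rolled back (UNDONE), park
 * this allocindir on the done list; it is processed again from
 * softdep_disk_write_complete() once the block is re-attached.
 */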
4021 if (indirdep->ir_state & UNDONE) {
4022 LIST_REMOVE(aip, ai_next);
4023 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
4024 return;
4025 }
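/*
 * Commit the new block number into the saved indirect block,
 * using 32-bit pointers for UFS1 and 64-bit pointers for UFS2.
 */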
4026 if (indirdep->ir_state & UFS1FMT)
4027 ((int32_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4028 aip->ai_newblkno;
4029 else
4030 ((int64_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4031 aip->ai_newblkno;
4032 LIST_REMOVE(aip, ai_next);
4033 if (aip->ai_freefrag != NULL)
4034 add_to_worklist(&aip->ai_freefrag->ff_list);
4035 WORKITEM_FREE(aip, D_ALLOCINDIR);
4036 }
4037
4038 /*
4039 * Called from within softdep_disk_write_complete above to restore
4040 * in-memory inode block contents to their most up-to-date state. Note
4041 * that this routine is always called from interrupt level with further
4042 * splbio interrupts blocked.
4043 */
4044 STATIC int
4045 handle_written_inodeblock(inodedep, bp)
4046 struct inodedep *inodedep;
4047 struct buf *bp; /* buffer containing the inode block */
4048 {
4049 struct worklist *wk, *filefree;
4050 struct allocdirect *adp, *nextadp;
4051 struct ufs1_dinode *dp1 = NULL;
4052 struct ufs2_dinode *dp2 = NULL;
4053 int hadchanges, fstype;
4054
4055 splassert(IPL_BIO);
4056
4057 if ((inodedep->id_state & IOSTARTED) == 0)
4058 panic("handle_written_inodeblock: not started");
4059 inodedep->id_state &= ~IOSTARTED;
4060
4061 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
4062 fstype = UM_UFS1;
4063 dp1 = (struct ufs1_dinode *) bp->b_data +
4064 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4065 } else {
4066 fstype = UM_UFS2;
4067 dp2 = (struct ufs2_dinode *) bp->b_data +
4068 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4069 }
4070
4071 /*
4072  * If we had to roll back the inode allocation because of
4073 * bitmaps being incomplete, then simply restore it.
4074 * Keep the block dirty so that it will not be reclaimed until
4075 * all associated dependencies have been cleared and the
4076 * corresponding updates written to disk.
4077 */
4078 if (inodedep->id_savedino1 != NULL) {
4079 if (fstype == UM_UFS1)
4080 *dp1 = *inodedep->id_savedino1;
4081 else
4082 *dp2 = *inodedep->id_savedino2;
4083 FREE(inodedep->id_savedino1, M_INODEDEP);
4084 inodedep->id_savedino1 = NULL;
4085 if ((bp->b_flags & B_DELWRI) == 0)
4086 stat_inode_bitmap++;
4087 buf_dirty(bp);
4088 return (1);
4089 }
4090 inodedep->id_state |= COMPLETE;
4091 /*
4092 * Roll forward anything that had to be rolled back before
4093 * the inode could be updated.
4094 */
4095 hadchanges = 0;
4096 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
4097 nextadp = TAILQ_NEXT(adp, ad_next);
4098 if (adp->ad_state & ATTACHED)
4099 panic("handle_written_inodeblock: new entry");
4100 if (fstype == UM_UFS1) {
4101 if (adp->ad_lbn < NDADDR) {
4102 if (dp1->di_db[adp->ad_lbn] != adp->ad_oldblkno)
4103 panic("%s: %s #%ld mismatch %d != %d",
4104 "handle_written_inodeblock",
4105 "direct pointer", adp->ad_lbn,
4106 dp1->di_db[adp->ad_lbn],
4107 adp->ad_oldblkno);
4108 dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
4109 } else {
4110 if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
4111 panic("%s: %s #%ld allocated as %d",
4112 "handle_written_inodeblock",
4113 "indirect pointer",
4114 adp->ad_lbn - NDADDR,
4115 dp1->di_ib[adp->ad_lbn - NDADDR]);
4116 dp1->di_ib[adp->ad_lbn - NDADDR] =
4117 adp->ad_newblkno;
4118 }
4119 } else {
4120 if (adp->ad_lbn < NDADDR) {
4121 if (dp2->di_db[adp->ad_lbn] != adp->ad_oldblkno)
4122 panic("%s: %s #%ld mismatch %d != %d",
4123 "handle_written_inodeblock",
4124 "direct pointer", adp->ad_lbn,
4125 dp2->di_db[adp->ad_lbn],
4126 adp->ad_oldblkno);
4127 dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
4128 } else {
4129 if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
4130 panic("%s: %s #%ld allocated as %d",
4131 "handle_written_inodeblock",
4132 "indirect pointer",
4133 adp->ad_lbn - NDADDR,
4134 dp2->di_ib[adp->ad_lbn - NDADDR]);
4135 dp2->di_ib[adp->ad_lbn - NDADDR] =
4136 adp->ad_newblkno;
4137 }
4138 }
4139 adp->ad_state &= ~UNDONE;
4140 adp->ad_state |= ATTACHED;
4141 hadchanges = 1;
4142 }
4143 if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
4144 stat_direct_blk_ptrs++;
4145 /*
4146 * Reset the file size to its most up-to-date value.
4147 */
4148 if (inodedep->id_savedsize == -1)
4149 panic("handle_written_inodeblock: bad size");
4150
4151 if (fstype == UM_UFS1) {
4152 if (dp1->di_size != inodedep->id_savedsize) {
4153 dp1->di_size = inodedep->id_savedsize;
4154 hadchanges = 1;
4155 }
4156 } else {
4157 if (dp2->di_size != inodedep->id_savedsize) {
4158 dp2->di_size = inodedep->id_savedsize;
4159 hadchanges = 1;
4160 }
4161 }
4162 inodedep->id_savedsize = -1;
4163 /*
4164 * If there were any rollbacks in the inode block, then it must be
4165  * marked dirty so that it will eventually get written back in
4166 * its correct form.
4167 */
4168 if (hadchanges)
4169 buf_dirty(bp);
4170 /*
4171 * Process any allocdirects that completed during the update.
4172 */
4173 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
4174 handle_allocdirect_partdone(adp);
4175 /*
4176 * Process deallocations that were held pending until the
4177 * inode had been written to disk. Freeing of the inode
4178 * is delayed until after all blocks have been freed to
4179 * avoid creation of new <vfsid, inum, lbn> triples
4180 * before the old ones have been deleted.
4181 */
4182 filefree = NULL;
4183 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
4184 WORKLIST_REMOVE(wk);
4185 switch (wk->wk_type) {
4186
4187 case D_FREEFILE:
4188 /*
4189 * We defer adding filefree to the worklist until
4190 * all other additions have been made to ensure
4191 * that it will be done after all the old blocks
4192 * have been freed.
4193 */
4194 if (filefree != NULL)
4195 panic("handle_written_inodeblock: filefree");
4196 filefree = wk;
4197 continue;
4198
4199 case D_MKDIR:
4200 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
4201 continue;
4202
4203 case D_DIRADD:
4204 diradd_inode_written(WK_DIRADD(wk), inodedep);
4205 continue;
4206
4207 case D_FREEBLKS:
4208 wk->wk_state |= COMPLETE;
4209 if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
4210 continue;
4211 /* FALLTHROUGH */
4212 case D_FREEFRAG:
4213 case D_DIRREM:
4214 add_to_worklist(wk);
4215 continue;
4216
4217 case D_NEWDIRBLK:
4218 free_newdirblk(WK_NEWDIRBLK(wk));
4219 continue;
4220
4221 default:
4222 panic("handle_written_inodeblock: Unknown type %s",
4223 TYPENAME(wk->wk_type));
4224 /* NOTREACHED */
4225 }
4226 }
4227 if (filefree != NULL) {
4228 if (free_inodedep(inodedep) == 0)
4229 panic("handle_written_inodeblock: live inodedep");
4230 add_to_worklist(filefree);
4231 return (0);
4232 }
4233
4234 /*
4235 * If no outstanding dependencies, free it.
4236 */
4237 if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
4238 return (0);
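/*
 * A non-zero return asks the caller to leave this inodedep
 * attached to the buffer so that it is reprocessed after the
 * next write of the block.
 */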
4239 return (hadchanges);
4240 }
4241
4242 /*
4243 * Process a diradd entry after its dependent inode has been written.
4244 * This routine must be called with splbio interrupts blocked.
4245 */
4246 STATIC void
4247 diradd_inode_written(dap, inodedep)
4248 struct diradd *dap;
4249 struct inodedep *inodedep;
4250 {
4251 struct pagedep *pagedep;
4252
4253 splassert(IPL_BIO);
4254
4255 dap->da_state |= COMPLETE;
4256 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4257 if (dap->da_state & DIRCHG)
4258 pagedep = dap->da_previous->dm_pagedep;
4259 else
4260 pagedep = dap->da_pagedep;
4261 LIST_REMOVE(dap, da_pdlist);
4262 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4263 }
4264 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
4265 }
4266
4267 /*
4268 * Handle the completion of a mkdir dependency.
4269 */
4270 STATIC void
4271 handle_written_mkdir(mkdir, type)
4272 struct mkdir *mkdir;
4273 int type;
4274 {
4275 struct diradd *dap;
4276 struct pagedep *pagedep;
4277
4278 splassert(IPL_BIO);
4279
4280 if (mkdir->md_state != type)
4281 panic("handle_written_mkdir: bad type");
4282 dap = mkdir->md_diradd;
4283 dap->da_state &= ~type;
4284 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
4285 dap->da_state |= DEPCOMPLETE;
4286 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4287 if (dap->da_state & DIRCHG)
4288 pagedep = dap->da_previous->dm_pagedep;
4289 else
4290 pagedep = dap->da_pagedep;
4291 LIST_REMOVE(dap, da_pdlist);
4292 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4293 }
4294 LIST_REMOVE(mkdir, md_mkdirs);
4295 WORKITEM_FREE(mkdir, D_MKDIR);
4296 }
4297
4298 /*
4299 * Called from within softdep_disk_write_complete above.
4300 * A write operation was just completed. Removed inodes can
4301 * now be freed and associated block pointers may be committed.
4302 * Note that this routine is always called from interrupt level
4303 * with further splbio interrupts blocked.
4304 */
4305 STATIC int
4306 handle_written_filepage(pagedep, bp)
4307 struct pagedep *pagedep;
4308 struct buf *bp; /* buffer containing the written page */
4309 {
4310 struct dirrem *dirrem;
4311 struct diradd *dap, *nextdap;
4312 struct direct *ep;
4313 int i, chgs;
4314
4315 splassert(IPL_BIO);
4316
4317 if ((pagedep->pd_state & IOSTARTED) == 0)
4318 panic("handle_written_filepage: not started");
4319 pagedep->pd_state &= ~IOSTARTED;
4320 /*
4321 * Process any directory removals that have been committed.
4322 */
4323 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
4324 LIST_REMOVE(dirrem, dm_next);
4325 dirrem->dm_dirinum = pagedep->pd_ino;
4326 add_to_worklist(&dirrem->dm_list);
4327 }
4328 /*
4329 * Free any directory additions that have been committed.
4330 * If it is a newly allocated block, we have to wait until
4331 * the on-disk directory inode claims the new block.
4332 */
4333 if ((pagedep->pd_state & NEWBLOCK) == 0)
4334 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
4335 free_diradd(dap);
4336 /*
4337 * Uncommitted directory entries must be restored.
4338 */
4339 for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
4340 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
4341 dap = nextdap) {
4342 nextdap = LIST_NEXT(dap, da_pdlist);
4343 if (dap->da_state & ATTACHED)
4344 panic("handle_written_filepage: attached");
4345 ep = (struct direct *)
4346 ((char *)bp->b_data + dap->da_offset);
4347 ep->d_ino = dap->da_newinum;
4348 dap->da_state &= ~UNDONE;
4349 dap->da_state |= ATTACHED;
4350 chgs = 1;
4351 /*
4352 * If the inode referenced by the directory has
4353 * been written out, then the dependency can be
4354 * moved to the pending list.
4355 */
4356 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4357 LIST_REMOVE(dap, da_pdlist);
4358 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
4359 da_pdlist);
4360 }
4361 }
4362 }
4363 /*
4364 * If there were any rollbacks in the directory, then it must be
4365  * marked dirty so that it will eventually get written back in
4366 * its correct form.
4367 */
4368 if (chgs) {
4369 if ((bp->b_flags & B_DELWRI) == 0)
4370 stat_dir_entry++;
4371 buf_dirty(bp);
4372 return (1);
4373 }
4374 /*
4375 * If we are not waiting for a new directory block to be
4376 * claimed by its inode, then the pagedep will be freed.
4377 * Otherwise it will remain to track any new entries on
4378 * the page in case they are fsync'ed.
4379 */
4380 if ((pagedep->pd_state & NEWBLOCK) == 0) {
4381 LIST_REMOVE(pagedep, pd_hash);
4382 WORKITEM_FREE(pagedep, D_PAGEDEP);
4383 }
4384 return (0);
4385 }
4386
4387 /*
4388 * Writing back in-core inode structures.
4389 *
4390 * The file system only accesses an inode's contents when it occupies an
4391 * "in-core" inode structure. These "in-core" structures are separate from
4392 * the page frames used to cache inode blocks. Only the latter are
4393 * transferred to/from the disk. So, when the updated contents of the
4394 * "in-core" inode structure are copied to the corresponding in-memory inode
4395 * block, the dependencies are also transferred. The following procedure is
4396 * called when copying a dirty "in-core" inode to a cached inode block.
4397 */
4398
4399 /*
4400 * Called when an inode is loaded from disk. If the effective link count
4401 * differed from the actual link count when it was last flushed, then we
4402 * need to ensure that the correct effective link count is put back.
4403 */
4404 void
4405 softdep_load_inodeblock(ip)
4406 struct inode *ip; /* the "in_core" copy of the inode */
4407 {
4408 struct inodedep *inodedep;
4409
4410 /*
4411 * Check for alternate nlink count.
4412 */
4413 ip->i_effnlink = DIP(ip, nlink);
4414 ACQUIRE_LOCK(&lk);
4415 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
4416 FREE_LOCK(&lk);
4417 return;
4418 }
4419 ip->i_effnlink -= inodedep->id_nlinkdelta;
4420 FREE_LOCK(&lk);
4421 }
4422
4423 /*
4424 * This routine is called just before the "in-core" inode
4425 * information is to be copied to the in-memory inode block.
4426 * Recall that an inode block contains several inodes. If
4427 * the force flag is set, then the dependencies will be
4428 * cleared so that the update can always be made. Note that
4429 * the buffer is locked when this routine is called, so we
4430 * will never be in the middle of writing the inode block
4431 * to disk.
4432 */
4433 void
4434 softdep_update_inodeblock(ip, bp, waitfor)
4435 struct inode *ip; /* the "in_core" copy of the inode */
4436 struct buf *bp; /* the buffer containing the inode block */
4437 int waitfor; /* nonzero => update must be allowed */
4438 {
4439 struct inodedep *inodedep;
4440 struct worklist *wk;
4441 int error, gotit;
4442
4443 /*
4444 * If the effective link count is not equal to the actual link
4445 * count, then we must track the difference in an inodedep while
4446 * the inode is (potentially) tossed out of the cache. Otherwise,
4447 * if there is no existing inodedep, then there are no dependencies
4448 * to track.
4449 */
4450 ACQUIRE_LOCK(&lk);
4451 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
4452 FREE_LOCK(&lk);
4453 if (ip->i_effnlink != DIP(ip, nlink))
4454 panic("softdep_update_inodeblock: bad link count");
4455 return;
4456 }
4457 if (inodedep->id_nlinkdelta != DIP(ip, nlink) - ip->i_effnlink) {
4458 FREE_LOCK(&lk);
4459 panic("softdep_update_inodeblock: bad delta");
4460 }
4461 /*
4462 * Changes have been initiated. Anything depending on these
4463 * changes cannot occur until this inode has been written.
4464 */
4465 inodedep->id_state &= ~COMPLETE;
4466 if ((inodedep->id_state & ONWORKLIST) == 0)
4467 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
4468 /*
4469 * Any new dependencies associated with the incore inode must
4470 * now be moved to the list associated with the buffer holding
4471  * the in-memory copy of the inode. Once merged, process any
4472 * allocdirects that are completed by the merger.
4473 */
4474 merge_inode_lists(inodedep);
4475 if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
4476 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
4477 /*
4478 * Now that the inode has been pushed into the buffer, the
4479 * operations dependent on the inode being written to disk
4480 * can be moved to the id_bufwait so that they will be
4481 * processed when the buffer I/O completes.
4482 */
4483 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
4484 WORKLIST_REMOVE(wk);
4485 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
4486 }
4487 /*
4488 * Newly allocated inodes cannot be written until the bitmap
4489  * that allocates them has been written (indicated by
4490 * DEPCOMPLETE being set in id_state). If we are doing a
4491 * forced sync (e.g., an fsync on a file), we force the bitmap
4492 * to be written so that the update can be done.
4493 */
4494 if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
4495 FREE_LOCK(&lk);
4496 return;
4497 }
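/*
 * Force the cylinder group buffer holding the inode's bitmap to
 * disk so that the DEPCOMPLETE dependency clears and the inode
 * itself can then be written.
 */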
4498 bp = inodedep->id_buf;
4499 gotit = getdirtybuf(bp, MNT_WAIT);
4500 FREE_LOCK(&lk);
4501 if (gotit && (error = bwrite(bp)) != 0)
4502 softdep_error("softdep_update_inodeblock: bwrite", error);
4503 if ((inodedep->id_state & DEPCOMPLETE) == 0)
4504 panic("softdep_update_inodeblock: update failed");
4505 }
4506
4507 /*
4508 * Merge the new inode dependency list (id_newinoupdt) into the old
4509 * inode dependency list (id_inoupdt). This routine must be called
4510 * with splbio interrupts blocked.
4511 */
4512 STATIC void
4513 merge_inode_lists(inodedep)
4514 struct inodedep *inodedep;
4515 {
4516 struct allocdirect *listadp, *newadp;
4517
4518 splassert(IPL_BIO);
4519
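/*
 * Both lists are ordered by logical block number; merge the new
 * list into the old one in a single pass, combining entries for
 * the same block with allocdirect_merge().
 */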
4520 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
4521 for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
4522 if (listadp->ad_lbn < newadp->ad_lbn) {
4523 listadp = TAILQ_NEXT(listadp, ad_next);
4524 continue;
4525 }
4526 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
4527 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
4528 if (listadp->ad_lbn == newadp->ad_lbn) {
4529 allocdirect_merge(&inodedep->id_inoupdt, newadp,
4530 listadp);
4531 listadp = newadp;
4532 }
4533 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
4534 }
4535 while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
4536 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
4537 TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
4538 }
4539 }
4540
4541 /*
4542 * If we are doing an fsync, then we must ensure that any directory
4543 * entries for the inode have been written after the inode gets to disk.
4544 */
4545 int
4546 softdep_fsync(vp)
4547 struct vnode *vp; /* the "in_core" copy of the inode */
4548 {
4549 struct inodedep *inodedep;
4550 struct pagedep *pagedep;
4551 struct worklist *wk;
4552 struct diradd *dap;
4553 struct mount *mnt;
4554 struct vnode *pvp;
4555 struct inode *ip;
4556 struct inode *pip;
4557 struct buf *bp;
4558 struct fs *fs;
4559 struct proc *p = CURPROC; /* XXX */
4560 int error, flushparent;
4561 ino_t parentino;
4562 daddr64_t lbn;
4563
4564 ip = VTOI(vp);
4565 fs = ip->i_fs;
4566 ACQUIRE_LOCK(&lk);
4567 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
4568 FREE_LOCK(&lk);
4569 return (0);
4570 }
4571 if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
4572 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
4573 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
4574 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
4575 FREE_LOCK(&lk);
4576 panic("softdep_fsync: pending ops");
4577 }
4578 for (error = 0, flushparent = 0; ; ) {
4579 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
4580 break;
4581 if (wk->wk_type != D_DIRADD) {
4582 FREE_LOCK(&lk);
4583 panic("softdep_fsync: Unexpected type %s",
4584 TYPENAME(wk->wk_type));
4585 }
4586 dap = WK_DIRADD(wk);
4587 /*
4588 * Flush our parent if this directory entry has a MKDIR_PARENT
4589 * dependency or is contained in a newly allocated block.
4590 */
4591 if (dap->da_state & DIRCHG)
4592 pagedep = dap->da_previous->dm_pagedep;
4593 else
4594 pagedep = dap->da_pagedep;
4595 mnt = pagedep->pd_mnt;
4596 parentino = pagedep->pd_ino;
4597 lbn = pagedep->pd_lbn;
4598 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
4599 FREE_LOCK(&lk);
4600 panic("softdep_fsync: dirty");
4601 }
4602 if ((dap->da_state & MKDIR_PARENT) ||
4603 (pagedep->pd_state & NEWBLOCK))
4604 flushparent = 1;
4605 else
4606 flushparent = 0;
4607 /*
4608 * If we are being fsync'ed as part of vgone'ing this vnode,
4609 * then we will not be able to release and recover the
4610 * vnode below, so we just have to give up on writing its
4611 * directory entry out. It will eventually be written, just
4612 * not now, but then the user was not asking to have it
4613 * written, so we are not breaking any promises.
4614 */
4615 if (vp->v_flag & VXLOCK)
4616 break;
4617 /*
4618 * We prevent deadlock by always fetching inodes from the
4619 * root, moving down the directory tree. Thus, when fetching
4620 * our parent directory, we must unlock ourselves before
4621 * requesting the lock on our parent. See the comment in
4622 * ufs_lookup for details on possible races.
4623 */
4624 FREE_LOCK(&lk);
4625 VOP_UNLOCK(vp, 0, p);
4626 error = VFS_VGET(mnt, parentino, &pvp);
4627 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
4628 if (error != 0)
4629 return (error);
4630 /*
4631 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
4632 * that are contained in direct blocks will be resolved by
4633 * doing a UFS_UPDATE. Pagedeps contained in indirect blocks
4634 * may require a complete sync'ing of the directory. So, we
4635 * try the cheap and fast UFS_UPDATE first, and if that fails,
4636 * then we do the slower VOP_FSYNC of the directory.
4637 */
4638 pip = VTOI(pvp);
4639 if (flushparent) {
4640 error = UFS_UPDATE(pip, MNT_WAIT);
4641 if (error) {
4642 vput(pvp);
4643 return (error);
4644 }
4645 if (pagedep->pd_state & NEWBLOCK) {
4646 error = VOP_FSYNC(pvp, p->p_ucred, MNT_WAIT, p);
4647 if (error) {
4648 vput(pvp);
4649 return (error);
4650 }
4651 }
4652 }
4653 /*
4654 * Flush directory page containing the inode's name.
4655 */
4656 error = bread(pvp, lbn, fs->fs_bsize, p->p_ucred, &bp);
4657 if (error == 0) {
4658 bp->b_bcount = blksize(fs, pip, lbn);
4659 error = bwrite(bp);
4660 } else
4661 brelse(bp);
4662 vput(pvp);
4663 if (error != 0)
4664 return (error);
4665 ACQUIRE_LOCK(&lk);
4666 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
4667 break;
4668 }
4669 FREE_LOCK(&lk);
4670 return (0);
4671 }
4672
4673 /*
4674 * Flush all the dirty bitmaps associated with the block device
4675 * before flushing the rest of the dirty blocks so as to reduce
4676 * the number of dependencies that will have to be rolled back.
4677 */
4678 void
4679 softdep_fsync_mountdev(vp, waitfor)
4680 struct vnode *vp;
4681 int waitfor;
4682 {
4683 struct buf *bp, *nbp;
4684 struct worklist *wk;
4685
4686 if (!vn_isdisk(vp, NULL))
4687 panic("softdep_fsync_mountdev: vnode not a disk");
4688 ACQUIRE_LOCK(&lk);
4689 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
4690 nbp = LIST_NEXT(bp, b_vnbufs);
4691 /*
4692 * If it is already scheduled, skip to the next buffer.
4693 */
4694 if (bp->b_flags & B_BUSY)
4695 continue;
4696 bp->b_flags |= B_BUSY;
4697
4698 if ((bp->b_flags & B_DELWRI) == 0) {
4699 FREE_LOCK(&lk);
4700 panic("softdep_fsync_mountdev: not dirty");
4701 }
4702 /*
4703 * We are only interested in bitmaps with outstanding
4704 * dependencies.
4705 */
4706 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
4707 wk->wk_type != D_BMSAFEMAP) {
4708 bp->b_flags &= ~B_BUSY;
4709 continue;
4710 }
4711 bremfree(bp);
4712 FREE_LOCK(&lk);
4713 (void) bawrite(bp);
4714 ACQUIRE_LOCK(&lk);
4715 /*
4716 * Since we may have slept during the I/O, we need
4717 * to start from a known point.
4718 */
4719 nbp = LIST_FIRST(&vp->v_dirtyblkhd);
4720 }
4721 if (waitfor == MNT_WAIT)
4722 drain_output(vp, 1);
4723 FREE_LOCK(&lk);
4724 }
4725
4726 /*
4727 * This routine is called when we are trying to synchronously flush a
4728 * file. This routine must eliminate any filesystem metadata dependencies
4729 * so that the syncing routine can succeed by pushing the dirty blocks
4730 * associated with the file. If any I/O errors occur, they are returned.
4731 */
4732 int
4733 softdep_sync_metadata(ap)
4734 struct vop_fsync_args /* {
4735 struct vnode *a_vp;
4736 struct ucred *a_cred;
4737 int a_waitfor;
4738 struct proc *a_p;
4739 } */ *ap;
4740 {
4741 struct vnode *vp = ap->a_vp;
4742 struct pagedep *pagedep;
4743 struct allocdirect *adp;
4744 struct allocindir *aip;
4745 struct buf *bp, *nbp;
4746 struct worklist *wk;
4747 int i, error, waitfor;
4748
4749 /*
4750 * Check whether this vnode is involved in a filesystem
4751 * that is doing soft dependency processing.
4752 */
4753 if (!vn_isdisk(vp, NULL)) {
4754 if (!DOINGSOFTDEP(vp))
4755 return (0);
4756 } else
4757 if (vp->v_specmountpoint == NULL ||
4758 (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
4759 return (0);
4760 /*
4761 * Ensure that any direct block dependencies have been cleared.
4762 */
4763 ACQUIRE_LOCK(&lk);
4764 if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
4765 FREE_LOCK(&lk);
4766 return (error);
4767 }
4768 /*
4769 * For most files, the only metadata dependencies are the
4770 * cylinder group maps that allocate their inode or blocks.
4771 * The block allocation dependencies can be found by traversing
4772 * the dependency lists for any buffers that remain on their
4773 * dirty buffer list. The inode allocation dependency will
4774 * be resolved when the inode is updated with MNT_WAIT.
4775 * This work is done in two passes. The first pass grabs most
4776 * of the buffers and begins asynchronously writing them. The
4777 * only way to wait for these asynchronous writes is to sleep
4778 * on the filesystem vnode which may stay busy for a long time
4779 * if the filesystem is active. So, instead, we make a second
4780 * pass over the dependencies blocking on each write. In the
4781 * usual case we will be blocking against a write that we
4782 * initiated, so when it is done the dependency will have been
4783 * resolved. Thus the second pass is expected to end quickly.
4784 */
4785 waitfor = MNT_NOWAIT;
4786 top:
4787 /*
4788 * We must wait for any I/O in progress to finish so that
4789 * all potential buffers on the dirty list will be visible.
4790 */
4791 drain_output(vp, 1);
4792 bp = LIST_FIRST(&vp->v_dirtyblkhd);
4793 if (getdirtybuf(bp, MNT_WAIT) == 0) {
4794 FREE_LOCK(&lk);
4795 return (0);
4796 }
4797 loop:
4798 /*
4799 * As we hold the buffer locked, none of its dependencies
4800 * will disappear.
4801 */
4802 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4803 switch (wk->wk_type) {
4804
4805 case D_ALLOCDIRECT:
4806 adp = WK_ALLOCDIRECT(wk);
4807 if (adp->ad_state & DEPCOMPLETE)
4808 break;
4809 nbp = adp->ad_buf;
4810 if (getdirtybuf(nbp, waitfor) == 0)
4811 break;
4812 FREE_LOCK(&lk);
4813 if (waitfor == MNT_NOWAIT) {
4814 bawrite(nbp);
4815 } else if ((error = VOP_BWRITE(nbp)) != 0) {
4816 bawrite(bp);
4817 return (error);
4818 }
4819 ACQUIRE_LOCK(&lk);
4820 break;
4821
4822 case D_ALLOCINDIR:
4823 aip = WK_ALLOCINDIR(wk);
4824 if (aip->ai_state & DEPCOMPLETE)
4825 break;
4826 nbp = aip->ai_buf;
4827 if (getdirtybuf(nbp, waitfor) == 0)
4828 break;
4829 FREE_LOCK(&lk);
4830 if (waitfor == MNT_NOWAIT) {
4831 bawrite(nbp);
4832 } else if ((error = VOP_BWRITE(nbp)) != 0) {
4833 bawrite(bp);
4834 return (error);
4835 }
4836 ACQUIRE_LOCK(&lk);
4837 break;
4838
4839 case D_INDIRDEP:
4840 restart:
4841
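/*
 * Locking or writing a child buffer may sleep, during which the
 * dependency list can change, so rescan it from the head after
 * every iteration that drops the lock.
 */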
4842 LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
4843 if (aip->ai_state & DEPCOMPLETE)
4844 continue;
4845 nbp = aip->ai_buf;
4846 if (getdirtybuf(nbp, MNT_WAIT) == 0)
4847 goto restart;
4848 FREE_LOCK(&lk);
4849 if ((error = VOP_BWRITE(nbp)) != 0) {
4850 bawrite(bp);
4851 return (error);
4852 }
4853 ACQUIRE_LOCK(&lk);
4854 goto restart;
4855 }
4856 break;
4857
4858 case D_INODEDEP:
4859 if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
4860 WK_INODEDEP(wk)->id_ino)) != 0) {
4861 FREE_LOCK(&lk);
4862 bawrite(bp);
4863 return (error);
4864 }
4865 break;
4866
4867 case D_PAGEDEP:
4868 /*
4869 * We are trying to sync a directory that may
4870  * have dependencies on its own metadata
4871  * and/or on the inodes of any
4872 * recently allocated files. We walk its diradd
4873 * lists pushing out the associated inode.
4874 */
4875 pagedep = WK_PAGEDEP(wk);
4876 for (i = 0; i < DAHASHSZ; i++) {
4877 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
4878 continue;
4879 if ((error =
4880 flush_pagedep_deps(vp, pagedep->pd_mnt,
4881 &pagedep->pd_diraddhd[i]))) {
4882 FREE_LOCK(&lk);
4883 bawrite(bp);
4884 return (error);
4885 }
4886 }
4887 break;
4888
4889 case D_MKDIR:
4890 /*
4891 * This case should never happen if the vnode has
4892 * been properly sync'ed. However, if this function
4893 * is used at a place where the vnode has not yet
4894 * been sync'ed, this dependency can show up. So,
4895 * rather than panic, just flush it.
4896 */
4897 nbp = WK_MKDIR(wk)->md_buf;
4898 if (getdirtybuf(nbp, waitfor) == 0)
4899 break;
4900 FREE_LOCK(&lk);
4901 if (waitfor == MNT_NOWAIT) {
4902 bawrite(nbp);
4903 } else if ((error = VOP_BWRITE(nbp)) != 0) {
4904 bawrite(bp);
4905 return (error);
4906 }
4907 ACQUIRE_LOCK(&lk);
4908 break;
4909
4910 case D_BMSAFEMAP:
4911 /*
4912 * This case should never happen if the vnode has
4913 * been properly sync'ed. However, if this function
4914 * is used at a place where the vnode has not yet
4915 * been sync'ed, this dependency can show up. So,
4916 * rather than panic, just flush it.
4917 */
4918 nbp = WK_BMSAFEMAP(wk)->sm_buf;
4919 if (getdirtybuf(nbp, waitfor) == 0)
4920 break;
4921 FREE_LOCK(&lk);
4922 if (waitfor == MNT_NOWAIT) {
4923 bawrite(nbp);
4924 } else if ((error = VOP_BWRITE(nbp)) != 0) {
4925 bawrite(bp);
4926 return (error);
4927 }
4928 ACQUIRE_LOCK(&lk);
4929 break;
4930
4931 default:
4932 FREE_LOCK(&lk);
4933 panic("softdep_sync_metadata: Unknown type %s",
4934 TYPENAME(wk->wk_type));
4935 /* NOTREACHED */
4936 }
4937 }
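/*
 * Lock the next buffer on the dirty list before writing this
 * one so that our place in the list survives dropping the
 * softdep lock around the write.
 */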
4938 nbp = LIST_NEXT(bp, b_vnbufs);
4939 getdirtybuf(nbp, MNT_WAIT);
4940 FREE_LOCK(&lk);
4941 bawrite(bp);
4942 ACQUIRE_LOCK(&lk);
4943 if (nbp != NULL) {
4944 bp = nbp;
4945 goto loop;
4946 }
4947 /*
4948  * The brief unlock is to allow any pent-up dependency
4949 * processing to be done. Then proceed with the second pass.
4950 */
4951 if (waitfor == MNT_NOWAIT) {
4952 waitfor = MNT_WAIT;
4953 FREE_LOCK(&lk);
4954 ACQUIRE_LOCK(&lk);
4955 goto top;
4956 }
4957
4958 /*
4959 * If we have managed to get rid of all the dirty buffers,
4960 * then we are done. For certain directories and block
4961 * devices, we may need to do further work.
4962 *
4963 * We must wait for any I/O in progress to finish so that
4964 * all potential buffers on the dirty list will be visible.
4965 */
4966 drain_output(vp, 1);
4967 if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
4968 FREE_LOCK(&lk);
4969 return (0);
4970 }
4971
4972 FREE_LOCK(&lk);
4973 /*
4974 * If we are trying to sync a block device, some of its buffers may
4975 * contain metadata that cannot be written until the contents of some
4976 * partially written files have been written to disk. The only easy
4977 * way to accomplish this is to sync the entire filesystem (luckily
4978 * this happens rarely).
4979 */
4980 if (vn_isdisk(vp, NULL) &&
4981 vp->v_specmountpoint && !VOP_ISLOCKED(vp) &&
4982 (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
4983 ap->a_p)) != 0)
4984 return (error);
4985 return (0);
4986 }
4987
4988 /*
4989 * Flush the dependencies associated with an inodedep.
4990 * Called with splbio blocked.
4991 */
4992 STATIC int
4993 flush_inodedep_deps(fs, ino)
4994 struct fs *fs;
4995 ino_t ino;
4996 {
4997 struct inodedep *inodedep;
4998 struct allocdirect *adp;
4999 int error, waitfor;
5000 struct buf *bp;
5001
5002 splassert(IPL_BIO);
5003
5004 /*
5005 * This work is done in two passes. The first pass grabs most
5006 * of the buffers and begins asynchronously writing them. The
5007 * only way to wait for these asynchronous writes is to sleep
5008 * on the filesystem vnode which may stay busy for a long time
5009 * if the filesystem is active. So, instead, we make a second
5010 * pass over the dependencies blocking on each write. In the
5011 * usual case we will be blocking against a write that we
5012 * initiated, so when it is done the dependency will have been
5013 * resolved. Thus the second pass is expected to end quickly.
5014 * We give a brief window at the top of the loop to allow
5015 * any pending I/O to complete.
5016 */
5017 for (waitfor = MNT_NOWAIT; ; ) {
5018 FREE_LOCK(&lk);
5019 ACQUIRE_LOCK(&lk);
5020 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
5021 return (0);
5022 TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
5023 if (adp->ad_state & DEPCOMPLETE)
5024 continue;
5025 bp = adp->ad_buf;
5026 if (getdirtybuf(bp, waitfor) == 0) {
5027 if (waitfor == MNT_NOWAIT)
5028 continue;
5029 break;
5030 }
5031 FREE_LOCK(&lk);
5032 if (waitfor == MNT_NOWAIT) {
5033 bawrite(bp);
5034 } else if ((error = VOP_BWRITE(bp)) != 0) {
5035 ACQUIRE_LOCK(&lk);
5036 return (error);
5037 }
5038 ACQUIRE_LOCK(&lk);
5039 break;
5040 }
5041 if (adp != NULL)
5042 continue;
5043 TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
5044 if (adp->ad_state & DEPCOMPLETE)
5045 continue;
5046 bp = adp->ad_buf;
5047 if (getdirtybuf(bp, waitfor) == 0) {
5048 if (waitfor == MNT_NOWAIT)
5049 continue;
5050 break;
5051 }
5052 FREE_LOCK(&lk);
5053 if (waitfor == MNT_NOWAIT) {
5054 bawrite(bp);
5055 } else if ((error = VOP_BWRITE(bp)) != 0) {
5056 ACQUIRE_LOCK(&lk);
5057 return (error);
5058 }
5059 ACQUIRE_LOCK(&lk);
5060 break;
5061 }
5062 if (adp != NULL)
5063 continue;
5064 /*
5065  * If we have just completed the second pass, we are done; otherwise, start pass 2.
5066 */
5067 if (waitfor == MNT_WAIT)
5068 break;
5069 waitfor = MNT_WAIT;
5070 }
5071 /*
5072 * Try freeing inodedep in case all dependencies have been removed.
5073 */
5074 if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
5075 (void) free_inodedep(inodedep);
5076 return (0);
5077 }
5078
5079 /*
5080 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
5081 * Called with splbio blocked.
5082 */
5083 STATIC int
5084 flush_pagedep_deps(pvp, mp, diraddhdp)
5085 struct vnode *pvp;
5086 struct mount *mp;
5087 struct diraddhd *diraddhdp;
5088 {
5089 struct proc *p = CURPROC; /* XXX */
5090 struct worklist *wk;
5091 struct inodedep *inodedep;
5092 struct ufsmount *ump;
5093 struct diradd *dap;
5094 struct vnode *vp;
5095 int gotit, error = 0;
5096 struct buf *bp;
5097 ino_t inum;
5098
5099 splassert(IPL_BIO);
5100
5101 ump = VFSTOUFS(mp);
5102 while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
5103 /*
5104 * Flush ourselves if this directory entry
5105 * has a MKDIR_PARENT dependency.
5106 */
5107 if (dap->da_state & MKDIR_PARENT) {
5108 FREE_LOCK(&lk);
5109 if ((error = UFS_UPDATE(VTOI(pvp), MNT_WAIT)))
5110 break;
5111 ACQUIRE_LOCK(&lk);
5112 /*
5113 * If that cleared dependencies, go on to next.
5114 */
5115 if (dap != LIST_FIRST(diraddhdp))
5116 continue;
5117 if (dap->da_state & MKDIR_PARENT) {
5118 FREE_LOCK(&lk);
5119 panic("flush_pagedep_deps: MKDIR_PARENT");
5120 }
5121 }
5122 /*
5123 * A newly allocated directory must have its "." and
5124 * ".." entries written out before its name can be
5125 * committed in its parent. We do not want or need
5126 * the full semantics of a synchronous VOP_FSYNC as
5127 * that may end up here again, once for each directory
5128 * level in the filesystem. Instead, we push the blocks
5129 * and wait for them to clear. We have to fsync twice
5130 * because the first call may choose to defer blocks
5131 * that still have dependencies, but deferral will
5132 * happen at most once.
5133 */
5134 inum = dap->da_newinum;
5135 if (dap->da_state & MKDIR_BODY) {
5136 FREE_LOCK(&lk);
5137 if ((error = VFS_VGET(mp, inum, &vp)) != 0)
5138 break;
5139 if ((error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) ||
5140 (error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) {
5141 vput(vp);
5142 break;
5143 }
5144 drain_output(vp, 0);
5145 /*
5146 * If first block is still dirty with a D_MKDIR
5147 * dependency then it needs to be written now.
5148 */
5149 for (;;) {
5150 error = 0;
5151 ACQUIRE_LOCK(&lk);
5152 bp = incore(vp, 0);
5153 if (bp == NULL) {
5154 FREE_LOCK(&lk);
5155 break;
5156 }
5157 LIST_FOREACH(wk, &bp->b_dep, wk_list)
5158 if (wk->wk_type == D_MKDIR)
5159 break;
5160 if (wk) {
5161 gotit = getdirtybuf(bp, MNT_WAIT);
5162 FREE_LOCK(&lk);
5163 if (gotit && (error = bwrite(bp)) != 0)
5164 break;
5165 } else
5166 FREE_LOCK(&lk);
5167 break;
5168 }
5169 vput(vp);
5170 /* Flushing of first block failed */
5171 if (error)
5172 break;
5173 ACQUIRE_LOCK(&lk);
5174 /*
5175 * If that cleared dependencies, go on to next.
5176 */
5177 if (dap != LIST_FIRST(diraddhdp))
5178 continue;
5179 if (dap->da_state & MKDIR_BODY) {
5180 FREE_LOCK(&lk);
5181 panic("flush_pagedep_deps: MKDIR_BODY");
5182 }
5183 }
5184 /*
5185 * Flush the inode on which the directory entry depends.
5186 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
5187 * the only remaining dependency is that the updated inode
5188 * count must get pushed to disk. The inode has already
5189 * been pushed into its inode buffer (via VOP_UPDATE) at
5190 * the time of the reference count change. So we need only
5191 * locate that buffer, ensure that there will be no rollback
5192 * caused by a bitmap dependency, then write the inode buffer.
5193 */
5194 if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
5195 FREE_LOCK(&lk);
5196 panic("flush_pagedep_deps: lost inode");
5197 }
5198 /*
5199 * If the inode still has bitmap dependencies,
5200 * push them to disk.
5201 */
5202 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5203 bp = inodedep->id_buf;
5204 gotit = getdirtybuf(bp, MNT_WAIT);
5205 FREE_LOCK(&lk);
5206 if (gotit && (error = bwrite(bp)) != 0)
5207 break;
5208 ACQUIRE_LOCK(&lk);
5209 if (dap != LIST_FIRST(diraddhdp))
5210 continue;
5211 }
5212 /*
5213 * If the inode is still sitting in a buffer waiting
5214 * to be written, push it to disk.
5215 */
5216 FREE_LOCK(&lk);
5217 if ((error = bread(ump->um_devvp,
5218 fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
5219 (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
5220 brelse(bp);
5221 break;
5222 }
5223 if ((error = bwrite(bp)) != 0)
5224 break;
5225 ACQUIRE_LOCK(&lk);
5226 /*
5227 * If we have failed to get rid of all the dependencies
5228 * then something is seriously wrong.
5229 */
5230 if (dap == LIST_FIRST(diraddhdp)) {
5231 FREE_LOCK(&lk);
5232 panic("flush_pagedep_deps: flush failed");
5233 }
5234 }
5235 if (error)
5236 ACQUIRE_LOCK(&lk);
5237 return (error);
5238 }
5239
5240 /*
5241 * A large burst of file addition or deletion activity can drive the
5242 * memory load excessively high. First attempt to slow things down
5243 * using the techniques below. If that fails, this routine requests
5244 * the offending operations to fall back to running synchronously
5245 * until the memory load returns to a reasonable level.
5246 */
5247 int
5248 softdep_slowdown(vp)
5249 struct vnode *vp;
5250 {
5251 int max_softdeps_hard;
5252
5253 max_softdeps_hard = max_softdeps * 11 / 10;
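	/*
	 * For example, if max_softdeps were 8192, max_softdeps_hard would
	 * be 9011, so the slowdown below kicks in once num_dirrem reaches
	 * 4505 or num_inodedep reaches 9011.
	 */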
5254 if (num_dirrem < max_softdeps_hard / 2 &&
5255 num_inodedep < max_softdeps_hard)
5256 return (0);
5257 stat_sync_limit_hit += 1;
5258 return (1);
5259 }
5260
5261 /*
5262 * If memory utilization has gotten too high, deliberately slow things
5263 * down and speed up the I/O processing.
5264 */
5265 STATIC int
5266 request_cleanup(resource, islocked)
5267 int resource;
5268 int islocked;
5269 {
5270 struct proc *p = CURPROC;
5271 int s;
5272
5273 /*
5274 * We never hold up the filesystem syncer process, nor a process that
5274 * is already servicing the worklist (P_SOFTDEP set).
5275 */
5276 if (p == filesys_syncer || (p->p_flag & P_SOFTDEP))
5277 return (0);
5278 /*
5279 * First check to see if the work list has gotten backlogged.
5280 * If it has, co-opt this process to help clean up two entries.
5281 * Because this process may hold inodes locked, we cannot
5282 * handle any remove requests that might block on a locked
5283 * inode as that could lead to deadlock. We set P_SOFTDEP
5284 * to avoid recursively processing the worklist.
5285 */
5286 if (num_on_worklist > max_softdeps / 10) {
5287 atomic_setbits_int(&p->p_flag, P_SOFTDEP);
5288 if (islocked)
5289 FREE_LOCK(&lk);
5290 process_worklist_item(NULL, LK_NOWAIT);
5291 process_worklist_item(NULL, LK_NOWAIT);
5292 atomic_clearbits_int(&p->p_flag, P_SOFTDEP);
5293 stat_worklist_push += 2;
5294 if (islocked)
5295 ACQUIRE_LOCK(&lk);
5296 return (1);
5297 }
5298 /*
5299 * Next, we attempt to speed up the syncer process. If that
5300 * is successful, then we allow the process to continue.
5301 */
5302 if (speedup_syncer())
5303 return (0);
5304 /*
5305 * If we are resource constrained on inode dependencies, try
5306 * flushing some dirty inodes. Otherwise, we are constrained
5307 * by file deletions, so try accelerating flushes of directories
5308 * with removal dependencies. We would like to do the cleanup
5309 * here, but we probably hold an inode locked at this point and
5310 * that might deadlock against one that we try to clean. So,
5311 * the best that we can do is request the syncer daemon to do
5312 * the cleanup for us.
5313 */
5314 switch (resource) {
5315
5316 case FLUSH_INODES:
5317 stat_ino_limit_push += 1;
5318 req_clear_inodedeps += 1;
5319 stat_countp = &stat_ino_limit_hit;
5320 break;
5321
5322 case FLUSH_REMOVE:
5323 stat_blk_limit_push += 1;
5324 req_clear_remove += 1;
5325 stat_countp = &stat_blk_limit_hit;
5326 break;
5327
5328 default:
5329 if (islocked)
5330 FREE_LOCK(&lk);
5331 panic("request_cleanup: unknown type");
5332 }
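	/*
	 * The req_clear_inodedeps and req_clear_remove requests set above
	 * are presumably serviced by the worklist-processing code, which
	 * calls clear_inodedeps() or clear_remove() on the syncer's behalf.
	 */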
5333 /*
5334 * Hopefully the syncer daemon will catch up and awaken us.
5335 * We wait at most tickdelay clock ticks before proceeding in any case.
5336 */
5337 if (islocked == 0)
5338 ACQUIRE_LOCK(&lk);
5339 proc_waiting += 1;
5340 if (!timeout_pending(&proc_waiting_timeout))
5341 timeout_add(&proc_waiting_timeout, tickdelay > 2 ? tickdelay : 2);
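	/*
	 * tickdelay is expressed in clock ticks; clamping it to a minimum
	 * of two ticks guarantees that pause_timer() fires in the future
	 * and eventually releases the waiters below.
	 */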
5342
5343 s = FREE_LOCK_INTERLOCKED(&lk);
5344 (void) tsleep((caddr_t)&proc_waiting, PPAUSE, "softupdate", 0);
5345 ACQUIRE_LOCK_INTERLOCKED(&lk, s);
5346 proc_waiting -= 1;
5347 if (islocked == 0)
5348 FREE_LOCK(&lk);
5349 return (1);
5350 }
5351
5352 /*
5353 * Awaken one of the processes pausing in request_cleanup and
5354 * reschedule the timer if other processes are still waiting.
5355 */
5356 void
5357 pause_timer(arg)
5358 void *arg;
5359 {
5360
5361 *stat_countp += 1;
5362 wakeup_one(&proc_waiting);
5363 if (proc_waiting > 0)
5364 timeout_add(&proc_waiting_timeout, tickdelay > 2 ? tickdelay : 2);
5365 }
5366
5367 /*
5368 * Flush out a directory with at least one removal dependency in an effort to
5369 * reduce the number of dirrem, freefile, and freeblks dependency structures.
5370 */
5371 STATIC void
5372 clear_remove(p)
5373 struct proc *p;
5374 {
5375 struct pagedep_hashhead *pagedephd;
5376 struct pagedep *pagedep;
5377 static int next = 0;
5378 struct mount *mp;
5379 struct vnode *vp;
5380 int error, cnt;
5381 ino_t ino;
5382
5383 ACQUIRE_LOCK(&lk);
5384 for (cnt = 0; cnt < pagedep_hash; cnt++) {
5385 pagedephd = &pagedep_hashtbl[next++];
5386 if (next >= pagedep_hash)
5387 next = 0;
5388 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
5389 if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
5390 continue;
5391 mp = pagedep->pd_mnt;
5392 ino = pagedep->pd_ino;
5393 #if 0
5394 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5395 continue;
5396 #endif
5397 FREE_LOCK(&lk);
5398 if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
5399 softdep_error("clear_remove: vget", error);
5400 #if 0
5401 vn_finished_write(mp);
5402 #endif
5403 return;
5404 }
5405 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
5406 softdep_error("clear_remove: fsync", error);
5407 drain_output(vp, 0);
5408 vput(vp);
5409 #if 0
5410 vn_finished_write(mp);
5411 #endif
5412 return;
5413 }
5414 }
5415 FREE_LOCK(&lk);
5416 }
5417
5418 /*
5419 * Clear out a block of dirty inodes in an effort to reduce
5420 * the number of inodedep dependency structures.
5421 */
5422 STATIC void
5423 clear_inodedeps(p)
5424 struct proc *p;
5425 {
5426 struct inodedep_hashhead *inodedephd;
5427 struct inodedep *inodedep;
5428 static int next = 0;
5429 struct mount *mp;
5430 struct vnode *vp;
5431 struct fs *fs;
5432 int error, cnt;
5433 ino_t firstino, lastino, ino;
5434
5435 ACQUIRE_LOCK(&lk);
5436 /*
5437 * Pick an inode dependency to be cleared, cycling round-robin
5437 * through the hash chains.
5438 * We will then gather up all the inodes in its block
5439 * that have dependencies and flush them out.
5440 */
5441 for (cnt = 0; cnt < inodedep_hash; cnt++) {
5442 inodedephd = &inodedep_hashtbl[next++];
5443 if (next >= inodedep_hash)
5444 next = 0;
5445 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
5446 break;
5447 }
5448 if (inodedep == NULL) {
5449 FREE_LOCK(&lk);
5450 return;
5451 }
5452 /*
5453 * Ugly code to find mount point given pointer to superblock.
5454 */
5455 fs = inodedep->id_fs;
5456 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list)
5457 if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
5458 break;
5459 /*
5460 * Find the last inode in the block with dependencies.
5461 */
5462 firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
5463 for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
5464 if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
5465 break;
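	/*
	 * For example, with INOPB(fs) == 64 (always a power of two) and
	 * id_ino == 200, the mask above gives firstino == 192, and lastino
	 * scans down from 255 for the last inode in the block that still
	 * has an inodedep.
	 */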
5466 /*
5467 * Asynchronously push all but the last inode with dependencies.
5468 * Synchronously push the last inode with dependencies to ensure
5469 * that the inode block gets written to free up the inodedeps.
5470 */
5471 for (ino = firstino; ino <= lastino; ino++) {
5472 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
5473 continue;
5474 FREE_LOCK(&lk);
5475 #if 0
5476 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5477 continue;
5478 #endif
5479 if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
5480 softdep_error("clear_inodedeps: vget", error);
5481 #if 0
5482 vn_finished_write(mp);
5483 #endif
5484 return;
5485 }
5486 if (ino == lastino) {
5487 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p)))
5488 softdep_error("clear_inodedeps: fsync1", error);
5489 } else {
5490 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
5491 softdep_error("clear_inodedeps: fsync2", error);
5492 drain_output(vp, 0);
5493 }
5494 vput(vp);
5495 #if 0
5496 vn_finished_write(mp);
5497 #endif
5498 ACQUIRE_LOCK(&lk);
5499 }
5500 FREE_LOCK(&lk);
5501 }
5502
5503 /*
5504 * Function to determine if the buffer has outstanding dependencies
5505 * that will cause a roll-back if the buffer is written. If wantcount
5506 * is set, return number of dependencies, otherwise just yes or no.
5507 */
5508 int
5509 softdep_count_dependencies(bp, wantcount, islocked)
5510 struct buf *bp;
5511 int wantcount;
5512 int islocked;
5513 {
5514 struct worklist *wk;
5515 struct inodedep *inodedep;
5516 struct indirdep *indirdep;
5517 struct allocindir *aip;
5518 struct pagedep *pagedep;
5519 struct diradd *dap;
5520 int i, retval;
5521
5522 retval = 0;
5523 if (!islocked)
5524 ACQUIRE_LOCK(&lk);
5525 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5526 switch (wk->wk_type) {
5527
5528 case D_INODEDEP:
5529 inodedep = WK_INODEDEP(wk);
5530 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5531 /* bitmap allocation dependency */
5532 retval += 1;
5533 if (!wantcount)
5534 goto out;
5535 }
5536 if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
5537 /* direct block pointer dependency */
5538 retval += 1;
5539 if (!wantcount)
5540 goto out;
5541 }
5542 continue;
5543
5544 case D_INDIRDEP:
5545 indirdep = WK_INDIRDEP(wk);
5546
5547 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
5548 /* indirect block pointer dependency */
5549 retval += 1;
5550 if (!wantcount)
5551 goto out;
5552 }
5553 continue;
5554
5555 case D_PAGEDEP:
5556 pagedep = WK_PAGEDEP(wk);
5557 for (i = 0; i < DAHASHSZ; i++) {
5558
5559 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
5560 /* directory entry dependency */
5561 retval += 1;
5562 if (!wantcount)
5563 goto out;
5564 }
5565 }
5566 continue;
5567
5568 case D_BMSAFEMAP:
5569 case D_ALLOCDIRECT:
5570 case D_ALLOCINDIR:
5571 case D_MKDIR:
5572 /* never a dependency on these blocks */
5573 continue;
5574
5575 default:
5576 if (!islocked)
5577 FREE_LOCK(&lk);
5578 panic("softdep_check_for_rollback: Unexpected type %s",
5579 TYPENAME(wk->wk_type));
5580 /* NOTREACHED */
5581 }
5582 }
5583 out:
5584 if (!islocked)
5585 FREE_LOCK(&lk);
5586 return retval;
5587 }
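/*
 * A caller deciding whether an immediate write of a buffer would be rolled
 * back could, hypothetically, use the yes/no form:
 *
 *	if (softdep_count_dependencies(bp, 0, 0) > 0)
 *		(defer the write and pick another buffer)
 */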
5588
5589 /*
5590 * Acquire exclusive access to a buffer.
5591 * Must be called with splbio blocked.
5592 * Return 1 if buffer was acquired.
5593 */
5594 STATIC int
5595 getdirtybuf(bp, waitfor)
5596 struct buf *bp;
5597 int waitfor;
5598 {
5599 int s;
5600
5601 if (bp == NULL)
5602 return (0);
5603
5604 splassert(IPL_BIO);
5605
5606 for (;;) {
5607 if ((bp->b_flags & B_BUSY) == 0)
5608 break;
5609 if (waitfor != MNT_WAIT)
5610 return (0);
5611 bp->b_flags |= B_WANTED;
5612 s = FREE_LOCK_INTERLOCKED(&lk);
5613 tsleep((caddr_t)bp, PRIBIO + 1, "sdsdty", 0);
5614 ACQUIRE_LOCK_INTERLOCKED(&lk, s);
5615 }
5616 if ((bp->b_flags & B_DELWRI) == 0)
5617 return (0);
5618 bremfree(bp);
5619 bp->b_flags |= B_BUSY;
5620 return (1);
5621 }
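/*
 * Typical use, as in the flush routines above: acquire the buffer while
 * holding the softdep lock, drop the lock, then write the buffer out:
 *
 *	gotit = getdirtybuf(bp, MNT_WAIT);
 *	FREE_LOCK(&lk);
 *	if (gotit && (error = bwrite(bp)) != 0)
 *		break;
 */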
5622
5623 /*
5624 * Wait for pending output on a vnode to complete.
5625 * Must be called with vnode locked.
5626 */
5627 STATIC void
5628 drain_output(vp, islocked)
5629 struct vnode *vp;
5630 int islocked;
5631 {
5632 int s;
5633
5634 if (!islocked)
5635 ACQUIRE_LOCK(&lk);
5636
5637 splassert(IPL_BIO);
5638
5639 while (vp->v_numoutput) {
5640 vp->v_bioflag |= VBIOWAIT;
5641 s = FREE_LOCK_INTERLOCKED(&lk);
5642 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drain_output", 0);
5643 ACQUIRE_LOCK_INTERLOCKED(&lk, s);
5644 }
5645 if (!islocked)
5646 FREE_LOCK(&lk);
5647 }
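/*
 * The wakeup waited for above presumably comes from the biodone()/vwakeup()
 * path, which decrements v_numoutput and, once it reaches zero with
 * VBIOWAIT set, issues a wakeup on &vp->v_numoutput.
 */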
5648
5649 /*
5650 * Called whenever a buffer that is being invalidated or reallocated
5651 * contains dependencies. This should only happen if an I/O error has
5652 * occurred. The routine is called with the buffer locked.
5653 */
5654 void
5655 softdep_deallocate_dependencies(bp)
5656 struct buf *bp;
5657 {
5658
5659 if ((bp->b_flags & B_ERROR) == 0)
5660 panic("softdep_deallocate_dependencies: dangling deps");
5661 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
5662 panic("softdep_deallocate_dependencies: unrecovered I/O error");
5663 }
5664
5665 /*
5666 * Function to handle asynchronous write errors in the filesystem.
5667 */
5668 void
5669 softdep_error(func, error)
5670 char *func;
5671 int error;
5672 {
5673
5674 /* XXX should do something better! */
5675 printf("%s: got error %d while accessing filesystem\n", func, error);
5676 }
5677
5678 #ifdef DDB
5679 #include <machine/db_machdep.h>
5680 #include <ddb/db_interface.h>
5681 #include <ddb/db_output.h>
5682
5683 void
5684 softdep_print(struct buf *bp, int full, int (*pr)(const char *, ...))
5685 {
5686 struct worklist *wk;
5687
5688 (*pr)(" deps:\n");
5689 LIST_FOREACH(wk, &bp->b_dep, wk_list)
5690 worklist_print(wk, full, pr);
5691 }
5692
5693 void
5694 worklist_print(struct worklist *wk, int full, int (*pr)(const char *, ...))
5695 {
5696 struct pagedep *pagedep;
5697 struct inodedep *inodedep;
5698 struct newblk *newblk;
5699 struct bmsafemap *bmsafemap;
5700 struct allocdirect *adp;
5701 struct indirdep *indirdep;
5702 struct allocindir *aip;
5703 struct freefrag *freefrag;
5704 struct freeblks *freeblks;
5705 struct freefile *freefile;
5706 struct diradd *dap;
5707 struct mkdir *mkdir;
5708 struct dirrem *dirrem;
5709 struct newdirblk *newdirblk;
5710 char prefix[33];
5711 int i;
5712
5713 for (prefix[i = 2 * MIN(16, full)] = '\0'; i--; prefix[i] = ' ')
5714 ;
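	/*
	 * The loop above builds "prefix" as a string of 2 * MIN(16, full)
	 * spaces, used to indent this entry and the continuation lines
	 * printed below.
	 */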
5715
5716 (*pr)("%s%s(%p) state %b\n%s", prefix, TYPENAME(wk->wk_type), wk,
5717 wk->wk_state, DEP_BITS, prefix);
5718 switch (wk->wk_type) {
5719 case D_PAGEDEP:
5720 pagedep = WK_PAGEDEP(wk);
5721 (*pr)("mount %p ino %u lbn %lld\n", pagedep->pd_mnt,
5722 pagedep->pd_ino, pagedep->pd_lbn);
5723 break;
5724 case D_INODEDEP:
5725 inodedep = WK_INODEDEP(wk);
5726 (*pr)("fs %p ino %u nlinkdelta %u dino %p\n"
5727 "%s bp %p savsz %lld\n", inodedep->id_fs,
5728 inodedep->id_ino, inodedep->id_nlinkdelta,
5729 inodedep->id_un.idu_savedino1,
5730 prefix, inodedep->id_buf, inodedep->id_savedsize);
5731 break;
5732 case D_NEWBLK:
5733 newblk = WK_NEWBLK(wk);
5734 (*pr)("fs %p newblk %d state %d bmsafemap %p\n",
5735 newblk->nb_fs, newblk->nb_newblkno, newblk->nb_state,
5736 newblk->nb_bmsafemap);
5737 break;
5738 case D_BMSAFEMAP:
5739 bmsafemap = WK_BMSAFEMAP(wk);
5740 (*pr)("buf %p\n", bmsafemap->sm_buf);
5741 break;
5742 case D_ALLOCDIRECT:
5743 adp = WK_ALLOCDIRECT(wk);
5744 (*pr)("lbn %lld newlbk %d oldblk %d newsize %lu olsize %lu\n"
5745 "%s bp %p inodedep %p freefrag %p\n", adp->ad_lbn,
5746 adp->ad_newblkno, adp->ad_oldblkno, adp->ad_newsize,
5747 adp->ad_oldsize,
5748 prefix, adp->ad_buf, adp->ad_inodedep, adp->ad_freefrag);
5749 break;
5750 case D_INDIRDEP:
5751 indirdep = WK_INDIRDEP(wk);
5752 (*pr)("savedata %p savebp %p\n", indirdep->ir_saveddata,
5753 indirdep->ir_savebp);
5754 break;
5755 case D_ALLOCINDIR:
5756 aip = WK_ALLOCINDIR(wk);
5757 (*pr)("off %d newblk %d oldblk %d freefrag %p\n"
5758 "%s indirdep %p buf %p\n", aip->ai_offset,
5759 aip->ai_newblkno, aip->ai_oldblkno, aip->ai_freefrag,
5760 prefix, aip->ai_indirdep, aip->ai_buf);
5761 break;
5762 case D_FREEFRAG:
5763 freefrag = WK_FREEFRAG(wk);
5764 (*pr)("vnode %p mp %p blkno %d fsize %ld ino %u\n",
5765 freefrag->ff_devvp, freefrag->ff_mnt, freefrag->ff_blkno,
5766 freefrag->ff_fragsize, freefrag->ff_inum);
5767 break;
5768 case D_FREEBLKS:
5769 freeblks = WK_FREEBLKS(wk);
5770 (*pr)("previno %u devvp %p mp %p oldsz %lld newsz %lld\n"
5771 "%s chkcnt %d uid %d\n", freeblks->fb_previousinum,
5772 freeblks->fb_devvp, freeblks->fb_mnt, freeblks->fb_oldsize,
5773 freeblks->fb_newsize,
5774 prefix, freeblks->fb_chkcnt, freeblks->fb_uid);
5775 break;
5776 case D_FREEFILE:
5777 freefile = WK_FREEFILE(wk);
5778 (*pr)("mode %x oldino %u vnode %p mp %p\n", freefile->fx_mode,
5779 freefile->fx_oldinum, freefile->fx_devvp, freefile->fx_mnt);
5780 break;
5781 case D_DIRADD:
5782 dap = WK_DIRADD(wk);
5783 (*pr)("off %ld ino %u da_un %p\n", dap->da_offset,
5784 dap->da_newinum, dap->da_un.dau_previous);
5785 break;
5786 case D_MKDIR:
5787 mkdir = WK_MKDIR(wk);
5788 (*pr)("diradd %p bp %p\n", mkdir->md_diradd, mkdir->md_buf);
5789 break;
5790 case D_DIRREM:
5791 dirrem = WK_DIRREM(wk);
5792 (*pr)("mp %p ino %u dm_un %p\n", dirrem->dm_mnt,
5793 dirrem->dm_oldinum, dirrem->dm_un.dmu_pagedep);
5794 break;
5795 case D_NEWDIRBLK:
5796 newdirblk = WK_NEWDIRBLK(wk);
5797 (*pr)("pagedep %p\n", newdirblk->db_pagedep);
5798 break;
5799 }
5800 }
5801 #endif