1 /* $OpenBSD: ffs_softdep.c,v 1.92 2007/07/11 15:32:22 millert Exp $ */
2
3 /*
4 * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
5 *
6 * The soft updates code is derived from the appendix of a University
7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 * "Soft Updates: A Solution to the Metadata Update Problem in File
9 * Systems", CSE-TR-254-95, August 1995).
10 *
11 * Further information about soft updates can be obtained from:
12 *
13 * Marshall Kirk McKusick http://www.mckusick.com/softdep/
14 * 1614 Oxford Street mckusick@mckusick.com
15 * Berkeley, CA 94709-1608 +1-510-843-9542
16 * USA
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 *
22 * 1. Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 * notice, this list of conditions and the following disclaimer in the
26 * documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
29 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
30 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
31 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
32 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
41 * $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.86 2001/02/04 16:08:18 phk Exp $
42 */
43
44 #include <sys/param.h>
45 #include <sys/buf.h>
46 #include <sys/kernel.h>
47 #include <sys/malloc.h>
48 #include <sys/mount.h>
49 #include <sys/proc.h>
50 #include <sys/pool.h>
51 #include <sys/syslog.h>
52 #include <sys/systm.h>
53 #include <sys/vnode.h>
54 #include <miscfs/specfs/specdev.h>
55 #include <ufs/ufs/dir.h>
56 #include <ufs/ufs/quota.h>
57 #include <ufs/ufs/inode.h>
58 #include <ufs/ufs/ufsmount.h>
59 #include <ufs/ffs/fs.h>
60 #include <ufs/ffs/softdep.h>
61 #include <ufs/ffs/ffs_extern.h>
62 #include <ufs/ufs/ufs_extern.h>
63
64 #define STATIC
65
66 /*
67 * Mapping of dependency structure types to malloc types.
68 */
69 #define D_PAGEDEP 0
70 #define D_INODEDEP 1
71 #define D_NEWBLK 2
72 #define D_BMSAFEMAP 3
73 #define D_ALLOCDIRECT 4
74 #define D_INDIRDEP 5
75 #define D_ALLOCINDIR 6
76 #define D_FREEFRAG 7
77 #define D_FREEBLKS 8
78 #define D_FREEFILE 9
79 #define D_DIRADD 10
80 #define D_MKDIR 11
81 #define D_DIRREM 12
82 #define D_NEWDIRBLK 13
83 #define D_LAST 13
84 /*
85 * Names of softdep types.
86 */
87 const char *softdep_typenames[] = {
88 "pagedep",
89 "inodedep",
90 "newblk",
91 "bmsafemap",
92 "allocdirect",
93 "indirdep",
94 "allocindir",
95 "freefrag",
96 "freeblks",
97 "freefile",
98 "diradd",
99 "mkdir",
100 "dirrem",
101 "newdirblk",
102 };
103 #define TYPENAME(type) \
104 ((unsigned)(type) <= D_LAST ? softdep_typenames[type] : "???")
105 /*
106 * Finding the current process.
107 */
108 #define CURPROC curproc
109 /*
110 * End system adaptation definitions.
111 */
112
113 /*
114 * Internal function prototypes.
115 */
116 STATIC void softdep_error(char *, int);
117 STATIC void drain_output(struct vnode *, int);
118 STATIC int getdirtybuf(struct buf *, int);
119 STATIC void clear_remove(struct proc *);
120 STATIC void clear_inodedeps(struct proc *);
121 STATIC int flush_pagedep_deps(struct vnode *, struct mount *,
122 struct diraddhd *);
123 STATIC int flush_inodedep_deps(struct fs *, ino_t);
124 STATIC int handle_written_filepage(struct pagedep *, struct buf *);
125 STATIC void diradd_inode_written(struct diradd *, struct inodedep *);
126 STATIC int handle_written_inodeblock(struct inodedep *, struct buf *);
127 STATIC void handle_allocdirect_partdone(struct allocdirect *);
128 STATIC void handle_allocindir_partdone(struct allocindir *);
129 STATIC void initiate_write_filepage(struct pagedep *, struct buf *);
130 STATIC void handle_written_mkdir(struct mkdir *, int);
131 STATIC void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
132 #ifdef FFS2
133 STATIC void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
134 #endif
135 STATIC void handle_workitem_freefile(struct freefile *);
136 STATIC void handle_workitem_remove(struct dirrem *);
137 STATIC struct dirrem *newdirrem(struct buf *, struct inode *,
138 struct inode *, int, struct dirrem **);
139 STATIC void free_diradd(struct diradd *);
140 STATIC void free_allocindir(struct allocindir *, struct inodedep *);
141 STATIC void free_newdirblk(struct newdirblk *);
142 STATIC int indir_trunc(struct inode *, daddr_t, int, daddr64_t, long *);
143 STATIC void deallocate_dependencies(struct buf *, struct inodedep *);
144 STATIC void free_allocdirect(struct allocdirectlst *,
145 struct allocdirect *, int);
146 STATIC int check_inode_unwritten(struct inodedep *);
147 STATIC int free_inodedep(struct inodedep *);
148 STATIC void handle_workitem_freeblocks(struct freeblks *);
149 STATIC void merge_inode_lists(struct inodedep *);
150 STATIC void setup_allocindir_phase2(struct buf *, struct inode *,
151 struct allocindir *);
152 STATIC struct allocindir *newallocindir(struct inode *, int, daddr_t,
153 daddr_t);
154 STATIC void handle_workitem_freefrag(struct freefrag *);
155 STATIC struct freefrag *newfreefrag(struct inode *, daddr_t, long);
156 STATIC void allocdirect_merge(struct allocdirectlst *,
157 struct allocdirect *, struct allocdirect *);
158 STATIC struct bmsafemap *bmsafemap_lookup(struct buf *);
159 STATIC int newblk_lookup(struct fs *, daddr_t, int,
160 struct newblk **);
161 STATIC int inodedep_lookup(struct fs *, ino_t, int, struct inodedep **);
162 STATIC int pagedep_lookup(struct inode *, daddr64_t, int, struct pagedep **);
163 STATIC void pause_timer(void *);
164 STATIC int request_cleanup(int, int);
165 STATIC int process_worklist_item(struct mount *, int);
166 STATIC void add_to_worklist(struct worklist *);
167
168 /*
169 * Exported softdep operations.
170 */
171 void softdep_disk_io_initiation(struct buf *);
172 void softdep_disk_write_complete(struct buf *);
173 void softdep_deallocate_dependencies(struct buf *);
174 void softdep_move_dependencies(struct buf *, struct buf *);
175 int softdep_count_dependencies(struct buf *bp, int, int);
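
/*
 * The entry points above are installed in the bioops vector by
 * softdep_initialize(), which is how the buffer cache calls into the
 * soft updates code.
 */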
176
177 /*
178 * Locking primitives.
179 *
180 * For a uniprocessor, all we need to do is protect against disk
181 * interrupts. For a multiprocessor, this lock would have to be
182 * a mutex. A single mutex is used throughout this file, though
183 * finer grain locking could be used if contention warranted it.
184 *
185 * For a multiprocessor, the sleep call would accept a lock and
186 * release it after the sleep processing was complete. In a uniprocessor
187 * implementation there is no such interlock, so we simply mark
188 * the places where it needs to be done with the `interlocked' form
189 * of the lock calls. Since the uniprocessor sleep already interlocks
190 * the spl, there is nothing that really needs to be done.
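 *
 * The interlocked forms are used by sema_get() below: the caller saves
 * the value returned by FREE_LOCK_INTERLOCKED() across its sleep and
 * hands it back to ACQUIRE_LOCK_INTERLOCKED() when it wakes up.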
191 */
192 #ifndef /* NOT */ DEBUG
193 STATIC struct lockit {
194 int lkt_spl;
195 } lk = { 0 };
196 #define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio()
197 #define FREE_LOCK(lk) splx((lk)->lkt_spl)
198 #define ACQUIRE_LOCK_INTERLOCKED(lk,s) (lk)->lkt_spl = (s)
199 #define FREE_LOCK_INTERLOCKED(lk) ((lk)->lkt_spl)
200
201 #else /* DEBUG */
202 STATIC struct lockit {
203 int lkt_spl;
204 pid_t lkt_held;
205 int lkt_line;
206 } lk = { 0, -1 };
207 STATIC int lockcnt;
208
209 STATIC void acquire_lock(struct lockit *, int);
210 STATIC void free_lock(struct lockit *, int);
211 STATIC void acquire_lock_interlocked(struct lockit *, int, int);
212 STATIC int free_lock_interlocked(struct lockit *, int);
213
214 #define ACQUIRE_LOCK(lk) acquire_lock(lk, __LINE__)
215 #define FREE_LOCK(lk) free_lock(lk, __LINE__)
216 #define ACQUIRE_LOCK_INTERLOCKED(lk,s) acquire_lock_interlocked(lk, (s), __LINE__)
217 #define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk, __LINE__)
218
219 STATIC void
220 acquire_lock(lk, line)
221 struct lockit *lk;
222 int line;
223 {
224 pid_t holder;
225 int original_line;
226
227 if (lk->lkt_held != -1) {
228 holder = lk->lkt_held;
229 original_line = lk->lkt_line;
230 FREE_LOCK(lk);
231 if (holder == CURPROC->p_pid)
232 panic("softdep_lock: locking against myself, acquired at line %d, relocked at line %d", original_line, line);
233 else
234 panic("softdep_lock: lock held by %d, acquired at line %d, relocked at line %d", holder, original_line, line);
235 }
236 lk->lkt_spl = splbio();
237 lk->lkt_held = CURPROC->p_pid;
238 lk->lkt_line = line;
239 lockcnt++;
240 }
241
242 STATIC void
243 free_lock(lk, line)
244 struct lockit *lk;
245 int line;
246 {
247
248 if (lk->lkt_held == -1)
249 panic("softdep_unlock: lock not held at line %d", line);
250 lk->lkt_held = -1;
251 splx(lk->lkt_spl);
252 }
253
254 STATIC void
255 acquire_lock_interlocked(lk, s, line)
256 struct lockit *lk;
257 int s;
258 int line;
259 {
260 pid_t holder;
261 int original_line;
262
263 if (lk->lkt_held != -1) {
264 holder = lk->lkt_held;
265 original_line = lk->lkt_line;
266 FREE_LOCK_INTERLOCKED(lk);
267 if (holder == CURPROC->p_pid)
268 panic("softdep_lock: locking against myself, acquired at line %d, relocked at line %d", original_line, line);
269 else
270 panic("softdep_lock: lock held by %d, acquired at line %d, relocked at line %d", holder, original_line, line);
271 }
272 lk->lkt_held = CURPROC->p_pid;
273 lk->lkt_line = line;
274 lk->lkt_spl = s;
275 lockcnt++;
276 }
277
278 STATIC int
279 free_lock_interlocked(lk, line)
280 struct lockit *lk;
281 int line;
282 {
283
284 if (lk->lkt_held == -1)
285 panic("softdep_unlock_interlocked: lock not held at line %d", line);
286 lk->lkt_held = -1;
287
288 return (lk->lkt_spl);
289 }
290 #endif /* DEBUG */
291
292 /*
293 * Place holder for real semaphores.
294 */
295 struct sema {
296 int value;
297 pid_t holder;
298 char *name;
299 int prio;
300 int timo;
301 };
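
/*
 * sema_get() returns 1 when the caller has become the holder and may
 * proceed, and 0 after it has slept waiting for the current holder; in
 * the latter case the caller is expected to redo its lookup, since
 * another process may have created the entry while we slept.  The value
 * field is only a contention count, so sema_release() wakes all sleepers
 * and lets them race for the next acquisition.
 */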
302 STATIC void sema_init(struct sema *, char *, int, int);
303 STATIC int sema_get(struct sema *, struct lockit *);
304 STATIC void sema_release(struct sema *);
305
306 STATIC void
307 sema_init(semap, name, prio, timo)
308 struct sema *semap;
309 char *name;
310 int prio, timo;
311 {
312
313 semap->holder = -1;
314 semap->value = 0;
315 semap->name = name;
316 semap->prio = prio;
317 semap->timo = timo;
318 }
319
320 STATIC int
321 sema_get(semap, interlock)
322 struct sema *semap;
323 struct lockit *interlock;
324 {
325 int s;
326
327 if (semap->value++ > 0) {
328 if (interlock != NULL)
329 s = FREE_LOCK_INTERLOCKED(interlock);
330 tsleep((caddr_t)semap, semap->prio, semap->name, semap->timo);
331 if (interlock != NULL) {
332 ACQUIRE_LOCK_INTERLOCKED(interlock, s);
333 FREE_LOCK(interlock);
334 }
335 return (0);
336 }
337 semap->holder = CURPROC->p_pid;
338 if (interlock != NULL)
339 FREE_LOCK(interlock);
340 return (1);
341 }
342
343 STATIC void
344 sema_release(semap)
345 struct sema *semap;
346 {
347
348 if (semap->value <= 0 || semap->holder != CURPROC->p_pid) {
349 #ifdef DEBUG
350 if (lk.lkt_held != -1)
351 FREE_LOCK(&lk);
352 #endif
353 panic("sema_release: not held");
354 }
355 if (--semap->value > 0) {
356 semap->value = 0;
357 wakeup(semap);
358 }
359 semap->holder = -1;
360 }
361
362 /*
363 * Memory management.
364 */
365 STATIC struct pool pagedep_pool;
366 STATIC struct pool inodedep_pool;
367 STATIC struct pool newblk_pool;
368 STATIC struct pool bmsafemap_pool;
369 STATIC struct pool allocdirect_pool;
370 STATIC struct pool indirdep_pool;
371 STATIC struct pool allocindir_pool;
372 STATIC struct pool freefrag_pool;
373 STATIC struct pool freeblks_pool;
374 STATIC struct pool freefile_pool;
375 STATIC struct pool diradd_pool;
376 STATIC struct pool mkdir_pool;
377 STATIC struct pool dirrem_pool;
378 STATIC struct pool newdirblk_pool;
379
380 static __inline void
381 softdep_free(struct worklist *item, int type)
382 {
383
384 switch (type) {
385 case D_PAGEDEP:
386 pool_put(&pagedep_pool, item);
387 break;
388
389 case D_INODEDEP:
390 pool_put(&inodedep_pool, item);
391 break;
392
393 case D_BMSAFEMAP:
394 pool_put(&bmsafemap_pool, item);
395 break;
396
397 case D_ALLOCDIRECT:
398 pool_put(&allocdirect_pool, item);
399 break;
400
401 case D_INDIRDEP:
402 pool_put(&indirdep_pool, item);
403 break;
404
405 case D_ALLOCINDIR:
406 pool_put(&allocindir_pool, item);
407 break;
408
409 case D_FREEFRAG:
410 pool_put(&freefrag_pool, item);
411 break;
412
413 case D_FREEBLKS:
414 pool_put(&freeblks_pool, item);
415 break;
416
417 case D_FREEFILE:
418 pool_put(&freefile_pool, item);
419 break;
420
421 case D_DIRADD:
422 pool_put(&diradd_pool, item);
423 break;
424
425 case D_MKDIR:
426 pool_put(&mkdir_pool, item);
427 break;
428
429 case D_DIRREM:
430 pool_put(&dirrem_pool, item);
431 break;
432
433 case D_NEWDIRBLK:
434 pool_put(&newdirblk_pool, item);
435 break;
436
437 default:
438 #ifdef DEBUG
439 if (lk.lkt_held != -1)
440 FREE_LOCK(&lk);
441 #endif
442 panic("softdep_free: unknown type %d", type);
443 }
444 }
445
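/*
 * Workitems released via WORKITEM_FREE while the softdep lock is held
 * (possibly from interrupt context) are collected on this queue; they
 * are handed back to the pool allocator later by
 * softdep_freequeue_process(), which drops the lock around each
 * pool_put().
 */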
446 struct workhead softdep_freequeue;
447
448 static __inline void
449 softdep_freequeue_add(struct worklist *item)
450 {
451 int s;
452
453 s = splbio();
454 LIST_INSERT_HEAD(&softdep_freequeue, item, wk_list);
455 splx(s);
456 }
457
458 static __inline void
459 softdep_freequeue_process(void)
460 {
461 struct worklist *wk;
462
463 splassert(IPL_BIO);
464
465 while ((wk = LIST_FIRST(&softdep_freequeue)) != NULL) {
466 LIST_REMOVE(wk, wk_list);
467 FREE_LOCK(&lk);
468 softdep_free(wk, wk->wk_type);
469 ACQUIRE_LOCK(&lk);
470 }
471 }
472
473 /*
474 * Worklist queue management.
475 * These routines require that the lock be held.
476 */
477 #ifndef /* NOT */ DEBUG
478 #define WORKLIST_INSERT(head, item) do { \
479 (item)->wk_state |= ONWORKLIST; \
480 LIST_INSERT_HEAD(head, item, wk_list); \
481 } while (0)
482 #define WORKLIST_REMOVE(item) do { \
483 (item)->wk_state &= ~ONWORKLIST; \
484 LIST_REMOVE(item, wk_list); \
485 } while (0)
486 #define WORKITEM_FREE(item, type) softdep_freequeue_add((struct worklist *)item)
487
488 #else /* DEBUG */
489 STATIC void worklist_insert(struct workhead *, struct worklist *);
490 STATIC void worklist_remove(struct worklist *);
491 STATIC void workitem_free(struct worklist *);
492
493 #define WORKLIST_INSERT(head, item) worklist_insert(head, item)
494 #define WORKLIST_REMOVE(item) worklist_remove(item)
495 #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item)
496
497 STATIC void
498 worklist_insert(head, item)
499 struct workhead *head;
500 struct worklist *item;
501 {
502
503 if (lk.lkt_held == -1)
504 panic("worklist_insert: lock not held");
505 if (item->wk_state & ONWORKLIST) {
506 FREE_LOCK(&lk);
507 panic("worklist_insert: already on list");
508 }
509 item->wk_state |= ONWORKLIST;
510 LIST_INSERT_HEAD(head, item, wk_list);
511 }
512
513 STATIC void
514 worklist_remove(item)
515 struct worklist *item;
516 {
517
518 if (lk.lkt_held == -1)
519 panic("worklist_remove: lock not held");
520 if ((item->wk_state & ONWORKLIST) == 0) {
521 FREE_LOCK(&lk);
522 panic("worklist_remove: not on list");
523 }
524 item->wk_state &= ~ONWORKLIST;
525 LIST_REMOVE(item, wk_list);
526 }
527
528 STATIC void
529 workitem_free(item)
530 struct worklist *item;
531 {
532
533 if (item->wk_state & ONWORKLIST) {
534 if (lk.lkt_held != -1)
535 FREE_LOCK(&lk);
536 panic("workitem_free: still on list");
537 }
538 softdep_freequeue_add(item);
539 }
540 #endif /* DEBUG */
541
542 /*
543 * Workitem queue management
544 */
545 STATIC struct workhead softdep_workitem_pending;
546 STATIC struct worklist *worklist_tail;
547 STATIC int num_on_worklist; /* number of worklist items to be processed */
548 STATIC int softdep_worklist_busy; /* >0 => processing, -1 => unmount in progress */
549 STATIC int softdep_worklist_req; /* serialized waiters */
550 STATIC int max_softdeps; /* maximum number of structs before slowdown */
551 STATIC int tickdelay = 2; /* number of ticks to pause during slowdown */
552 STATIC int proc_waiting; /* tracks whether we have a timeout posted */
553 STATIC int *stat_countp; /* statistic to count in proc_waiting timeout */
554 STATIC struct timeout proc_waiting_timeout;
555 STATIC struct proc *filesys_syncer; /* proc of filesystem syncer process */
556 STATIC int req_clear_inodedeps; /* syncer process flush some inodedeps */
557 #define FLUSH_INODES 1
558 STATIC int req_clear_remove; /* syncer process flush some freeblks */
559 #define FLUSH_REMOVE 2
560 /*
561 * runtime statistics
562 */
563 STATIC int stat_worklist_push; /* number of worklist cleanups */
564 STATIC int stat_blk_limit_push; /* number of times block limit neared */
565 STATIC int stat_ino_limit_push; /* number of times inode limit neared */
566 STATIC int stat_blk_limit_hit; /* number of times block slowdown imposed */
567 STATIC int stat_ino_limit_hit; /* number of times inode slowdown imposed */
568 STATIC int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
569 STATIC int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
570 STATIC int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
571 STATIC int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
572 STATIC int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
573
574 /*
575 * Add an item to the end of the work queue.
576 * This routine requires that the lock be held.
577 * This is the only routine that adds items to the list.
578 * The process_worklist_item() routine below is the only one that removes
579 * items, and it does so in order from first to last.
580 */
581 STATIC void
582 add_to_worklist(wk)
583 struct worklist *wk;
584 {
585
586 if (wk->wk_state & ONWORKLIST) {
587 #ifdef DEBUG
588 if (lk.lkt_held != -1)
589 FREE_LOCK(&lk);
590 #endif
591 panic("add_to_worklist: already on list");
592 }
593 wk->wk_state |= ONWORKLIST;
594 if (LIST_FIRST(&softdep_workitem_pending) == NULL)
595 LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
596 else
597 LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
598 worklist_tail = wk;
599 num_on_worklist += 1;
600 }
601
602 /*
603 * Process that runs once per second to handle items in the background queue.
604 *
605 * Note that we ensure that items are processed in the order in which they
606 * appear in the queue. The code below depends on this property to ensure
607 * that blocks of a file are freed before the inode itself is freed. This
608 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
609 * until all the old ones have been purged from the dependency lists.
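 *
 * The routine returns the number of items processed that belong to the
 * given mount point (matchmnt), or -1 if processing was cut short, e.g.
 * because an unmount wants exclusive use of the worklist or because we
 * have already run for a full second.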
610 */
611 int
612 softdep_process_worklist(matchmnt)
613 struct mount *matchmnt;
614 {
615 struct proc *p = CURPROC;
616 int matchcnt, loopcount;
617 struct timeval starttime;
618
619 /*
620 * First process any items on the delayed-free queue.
621 */
622 ACQUIRE_LOCK(&lk);
623 softdep_freequeue_process();
624 FREE_LOCK(&lk);
625
626 /*
627 * Record the process identifier of our caller so that we can give
628 * this process preferential treatment in request_cleanup below.
629 * We can't do this in softdep_initialize, because the syncer process
630 * may not be running yet when that is called.
631 * NOTE! This function _could_ be called with a curproc != syncerproc.
632 */
633 filesys_syncer = syncerproc;
634 matchcnt = 0;
635
636 /*
637 * There is no danger of having multiple processes run this
638 * code, but we have to single-thread it when softdep_flushfiles()
639 * is in operation to get an accurate count of the number of items
640 * related to its mount point that are in the list.
641 */
642 if (matchmnt == NULL) {
643 if (softdep_worklist_busy < 0)
644 return(-1);
645 softdep_worklist_busy += 1;
646 }
647
648 /*
649 * If requested, try removing inode or removal dependencies.
650 */
651 if (req_clear_inodedeps) {
652 clear_inodedeps(p);
653 req_clear_inodedeps -= 1;
654 wakeup_one(&proc_waiting);
655 }
656 if (req_clear_remove) {
657 clear_remove(p);
658 req_clear_remove -= 1;
659 wakeup_one(&proc_waiting);
660 }
661 loopcount = 1;
662 getmicrouptime(&starttime);
663 while (num_on_worklist > 0) {
664 matchcnt += process_worklist_item(matchmnt, 0);
665
666 /*
667 * If a umount operation wants to run the worklist
668 * accurately, abort.
669 */
670 if (softdep_worklist_req && matchmnt == NULL) {
671 matchcnt = -1;
672 break;
673 }
674
675 /*
676 * If requested, try removing inode or removal dependencies.
677 */
678 if (req_clear_inodedeps) {
679 clear_inodedeps(p);
680 req_clear_inodedeps -= 1;
681 wakeup_one(&proc_waiting);
682 }
683 if (req_clear_remove) {
684 clear_remove(p);
685 req_clear_remove -= 1;
686 wakeup_one(&proc_waiting);
687 }
688 /*
689 * We do not generally want to stop for buffer space, but if
690 * we are really being a buffer hog, we will stop and wait.
691 */
692 #if 0
693 if (loopcount++ % 128 == 0)
694 bwillwrite();
695 #endif
696 /*
697 * Never allow processing to run for more than one
698 * second. Otherwise the other syncer tasks may get
699 * excessively backlogged.
700 */
701 {
702 struct timeval diff;
703 struct timeval tv;
704
705 getmicrouptime(&tv);
706 timersub(&tv, &starttime, &diff);
707 if (diff.tv_sec != 0 && matchmnt == NULL) {
708 matchcnt = -1;
709 break;
710 }
711 }
712
713 /*
714 * Process any new items on the delayed-free queue.
715 */
716 ACQUIRE_LOCK(&lk);
717 softdep_freequeue_process();
718 FREE_LOCK(&lk);
719 }
720 if (matchmnt == NULL) {
721 softdep_worklist_busy -= 1;
722 if (softdep_worklist_req && softdep_worklist_busy == 0)
723 wakeup(&softdep_worklist_req);
724 }
725 return (matchcnt);
726 }
727
728 /*
729 * Process one item on the worklist.
730 */
731 STATIC int
732 process_worklist_item(matchmnt, flags)
733 struct mount *matchmnt;
734 int flags;
735 {
736 struct worklist *wk, *wkend;
737 struct dirrem *dirrem;
738 struct mount *mp;
739 struct vnode *vp;
740 int matchcnt = 0;
741
742 ACQUIRE_LOCK(&lk);
743 /*
744 * Normally we just process each item on the worklist in order.
745 * However, if we are in a situation where we cannot lock any
746 * inodes, we have to skip over any dirrem requests whose
747 * vnodes are resident and locked.
748 */
749 LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
750 if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
751 break;
752 dirrem = WK_DIRREM(wk);
753 vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev,
754 dirrem->dm_oldinum);
755 if (vp == NULL || !VOP_ISLOCKED(vp))
756 break;
757 }
758 if (wk == NULL) {
759 FREE_LOCK(&lk);
760 return (0);
761 }
762 /*
763 * Remove the item to be processed. If we are removing the last
764 * item on the list, we need to recalculate the tail pointer.
765 * As this happens rarely and usually when the list is short,
766 * we just run down the list to find it rather than tracking it
767 * in the above loop.
768 */
769 WORKLIST_REMOVE(wk);
770 if (wk == worklist_tail) {
771 LIST_FOREACH(wkend, &softdep_workitem_pending, wk_list)
772 if (LIST_NEXT(wkend, wk_list) == NULL)
773 break;
774 worklist_tail = wkend;
775 }
776 num_on_worklist -= 1;
777 FREE_LOCK(&lk);
778 switch (wk->wk_type) {
779
780 case D_DIRREM:
781 /* removal of a directory entry */
782 mp = WK_DIRREM(wk)->dm_mnt;
783 #if 0
784 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
785 panic("%s: dirrem on suspended filesystem",
786 "process_worklist_item");
787 #endif
788 if (mp == matchmnt)
789 matchcnt += 1;
790 handle_workitem_remove(WK_DIRREM(wk));
791 break;
792
793 case D_FREEBLKS:
794 /* releasing blocks and/or fragments from a file */
795 mp = WK_FREEBLKS(wk)->fb_mnt;
796 #if 0
797 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
798 panic("%s: freeblks on suspended filesystem",
799 "process_worklist_item");
800 #endif
801 if (mp == matchmnt)
802 matchcnt += 1;
803 handle_workitem_freeblocks(WK_FREEBLKS(wk));
804 break;
805
806 case D_FREEFRAG:
807 /* releasing a fragment when replaced as a file grows */
808 mp = WK_FREEFRAG(wk)->ff_mnt;
809 #if 0
810 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
811 panic("%s: freefrag on suspended filesystem",
812 "process_worklist_item");
813 #endif
814 if (mp == matchmnt)
815 matchcnt += 1;
816 handle_workitem_freefrag(WK_FREEFRAG(wk));
817 break;
818
819 case D_FREEFILE:
820 /* releasing an inode when its link count drops to 0 */
821 mp = WK_FREEFILE(wk)->fx_mnt;
822 #if 0
823 if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
824 panic("%s: freefile on suspended filesystem",
825 "process_worklist_item");
826 #endif
827 if (mp == matchmnt)
828 matchcnt += 1;
829 handle_workitem_freefile(WK_FREEFILE(wk));
830 break;
831
832 default:
833 panic("%s_process_worklist: Unknown type %s",
834 "softdep", TYPENAME(wk->wk_type));
835 /* NOTREACHED */
836 }
837 return (matchcnt);
838 }
839
840 /*
841 * Move dependencies from one buffer to another.
842 */
843 void
844 softdep_move_dependencies(oldbp, newbp)
845 struct buf *oldbp;
846 struct buf *newbp;
847 {
848 struct worklist *wk, *wktail;
849
850 if (LIST_FIRST(&newbp->b_dep) != NULL)
851 panic("softdep_move_dependencies: need merge code");
852 wktail = 0;
853 ACQUIRE_LOCK(&lk);
854 while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
855 LIST_REMOVE(wk, wk_list);
856 if (wktail == 0)
857 LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
858 else
859 LIST_INSERT_AFTER(wktail, wk, wk_list);
860 wktail = wk;
861 }
862 FREE_LOCK(&lk);
863 }
864
865 /*
866 * Purge the work list of all items associated with a particular mount point.
867 */
868 int
869 softdep_flushworklist(oldmnt, countp, p)
870 struct mount *oldmnt;
871 int *countp;
872 struct proc *p;
873 {
874 struct vnode *devvp;
875 int count, error = 0;
876
877 /*
878 * Await our turn to clear out the queue, then serialize access.
879 */
880 while (softdep_worklist_busy) {
881 softdep_worklist_req += 1;
882 tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0);
883 softdep_worklist_req -= 1;
884 }
885 softdep_worklist_busy = -1;
886 /*
887 * Alternately flush the block device associated with the mount
888 * point and process any dependencies that the flushing
889 * creates. We continue until no more worklist dependencies
890 * are found.
891 */
892 *countp = 0;
893 devvp = VFSTOUFS(oldmnt)->um_devvp;
894 while ((count = softdep_process_worklist(oldmnt)) > 0) {
895 *countp += count;
896 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p);
897 error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p);
898 VOP_UNLOCK(devvp, 0, p);
899 if (error)
900 break;
901 }
902 softdep_worklist_busy = 0;
903 if (softdep_worklist_req)
904 wakeup(&softdep_worklist_req);
905 return (error);
906 }
907
908 /*
909 * Flush all vnodes and worklist items associated with a specified mount point.
910 */
911 int
912 softdep_flushfiles(oldmnt, flags, p)
913 struct mount *oldmnt;
914 int flags;
915 struct proc *p;
916 {
917 int error, count, loopcnt;
918
919 /*
920 * Alternately flush the vnodes associated with the mount
921 * point and process any dependencies that the flushing
922 * creates. In theory, this loop should iterate at most twice,
923 * but we give it a few extra passes just to be sure.
924 */
925 for (loopcnt = 10; loopcnt > 0; loopcnt--) {
926 /*
927 * Do another flush in case any vnodes were brought in
928 * as part of the cleanup operations.
929 */
930 if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0)
931 break;
932 if ((error = softdep_flushworklist(oldmnt, &count, p)) != 0 ||
933 count == 0)
934 break;
935 }
936 /*
937 * If we are unmounting then it is an error to fail. If we
938 * are simply trying to downgrade to read-only, then filesystem
939 * activity can keep us busy forever, so we just fail with EBUSY.
940 */
941 if (loopcnt == 0) {
942 error = EBUSY;
943 }
944 return (error);
945 }
946
947 /*
948 * Structure hashing.
949 *
950 * There are three types of structures that can be looked up:
951 * 1) pagedep structures identified by mount point, inode number,
952 * and logical block.
953 * 2) inodedep structures identified by mount point and inode number.
954 * 3) newblk structures identified by mount point and
955 * physical block number.
956 *
957 * The "pagedep" and "inodedep" dependency structures are hashed
958 * separately from the file blocks and inodes to which they correspond.
959 * This separation helps when the in-memory copy of an inode or
960 * file block must be replaced. It also obviates the need to access
961 * an inode or file page when simply updating (or de-allocating)
962 * dependency structures. Lookup of newblk structures is needed to
963 * find newly allocated blocks when trying to associate them with
964 * their allocdirect or allocindir structure.
965 *
966 * The lookup routines optionally create and hash a new instance when
967 * an existing entry is not found.
968 */
969 #define DEPALLOC 0x0001 /* allocate structure if lookup fails */
970 #define NODELAY 0x0002 /* cannot do background work */
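
/*
 * Sketch of the typical calling pattern for the allocating lookups
 * (softdep_setup_inomapdep() below is a real example):
 *
 *	ACQUIRE_LOCK(&lk);
 *	if (inodedep_lookup(fs, inum, DEPALLOC, &inodedep) == 0) {
 *		... entry was just created; attach it to the
 *		... appropriate dependency lists
 *	}
 *	FREE_LOCK(&lk);
 */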
971
972 /*
973 * Structures and routines associated with pagedep caching.
974 */
975 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
976 u_long pagedep_hash; /* size of hash table - 1 */
977 #define PAGEDEP_HASH(mp, inum, lbn) \
978 (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
979 pagedep_hash])
980 STATIC struct sema pagedep_in_progress;
981
982 /*
983 * Look up a pagedep. Return 1 if found; return 0 if not found, or if found
984 * while allocating (DEPALLOC) but not yet attached to any buffer.
985 * If not found, allocate if DEPALLOC flag is passed.
986 * Found or allocated entry is returned in pagedeppp.
987 * This routine must be called with splbio interrupts blocked.
988 */
989 STATIC int
990 pagedep_lookup(ip, lbn, flags, pagedeppp)
991 struct inode *ip;
992 daddr64_t lbn;
993 int flags;
994 struct pagedep **pagedeppp;
995 {
996 struct pagedep *pagedep;
997 struct pagedep_hashhead *pagedephd;
998 struct mount *mp;
999 int i;
1000
1001 splassert(IPL_BIO);
1002
1003 #ifdef DEBUG
1004 if (lk.lkt_held == -1)
1005 panic("pagedep_lookup: lock not held");
1006 #endif
1007 mp = ITOV(ip)->v_mount;
1008 pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
1009 top:
1010 LIST_FOREACH(pagedep, pagedephd, pd_hash)
1011 if (ip->i_number == pagedep->pd_ino &&
1012 lbn == pagedep->pd_lbn &&
1013 mp == pagedep->pd_mnt)
1014 break;
1015 if (pagedep) {
1016 *pagedeppp = pagedep;
1017 if ((flags & DEPALLOC) != 0 &&
1018 (pagedep->pd_state & ONWORKLIST) == 0)
1019 return (0);
1020 return (1);
1021 }
1022 if ((flags & DEPALLOC) == 0) {
1023 *pagedeppp = NULL;
1024 return (0);
1025 }
1026 if (sema_get(&pagedep_in_progress, &lk) == 0) {
1027 ACQUIRE_LOCK(&lk);
1028 goto top;
1029 }
1030 pagedep = pool_get(&pagedep_pool, PR_WAITOK);
1031 bzero(pagedep, sizeof(struct pagedep));
1032 pagedep->pd_list.wk_type = D_PAGEDEP;
1033 pagedep->pd_mnt = mp;
1034 pagedep->pd_ino = ip->i_number;
1035 pagedep->pd_lbn = lbn;
1036 LIST_INIT(&pagedep->pd_dirremhd);
1037 LIST_INIT(&pagedep->pd_pendinghd);
1038 for (i = 0; i < DAHASHSZ; i++)
1039 LIST_INIT(&pagedep->pd_diraddhd[i]);
1040 ACQUIRE_LOCK(&lk);
1041 LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
1042 sema_release(&pagedep_in_progress);
1043 *pagedeppp = pagedep;
1044 return (0);
1045 }
1046
1047 /*
1048 * Structures and routines associated with inodedep caching.
1049 */
1050 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
1051 STATIC u_long inodedep_hash; /* size of hash table - 1 */
1052 STATIC long num_inodedep; /* number of inodedep allocated */
1053 #define INODEDEP_HASH(fs, inum) \
1054 (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
1055 STATIC struct sema inodedep_in_progress;
1056
1057 /*
1058 * Look up an inodedep. Return 1 if found, 0 if not found.
1059 * If not found, allocate if DEPALLOC flag is passed.
1060 * Found or allocated entry is returned in inodedeppp.
1061 * This routine must be called with splbio interrupts blocked.
1062 */
1063 STATIC int
1064 inodedep_lookup(fs, inum, flags, inodedeppp)
1065 struct fs *fs;
1066 ino_t inum;
1067 int flags;
1068 struct inodedep **inodedeppp;
1069 {
1070 struct inodedep *inodedep;
1071 struct inodedep_hashhead *inodedephd;
1072 int firsttry;
1073
1074 splassert(IPL_BIO);
1075
1076 #ifdef DEBUG
1077 if (lk.lkt_held == -1)
1078 panic("inodedep_lookup: lock not held");
1079 #endif
1080 firsttry = 1;
1081 inodedephd = INODEDEP_HASH(fs, inum);
1082 top:
1083 LIST_FOREACH(inodedep, inodedephd, id_hash)
1084 if (inum == inodedep->id_ino && fs == inodedep->id_fs)
1085 break;
1086 if (inodedep) {
1087 *inodedeppp = inodedep;
1088 return (1);
1089 }
1090 if ((flags & DEPALLOC) == 0) {
1091 *inodedeppp = NULL;
1092 return (0);
1093 }
1094 /*
1095 * If we are over our limit, try to improve the situation.
1096 */
1097 if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 &&
1098 request_cleanup(FLUSH_INODES, 1)) {
1099 firsttry = 0;
1100 goto top;
1101 }
1102 if (sema_get(&inodedep_in_progress, &lk) == 0) {
1103 ACQUIRE_LOCK(&lk);
1104 goto top;
1105 }
1106 num_inodedep += 1;
1107 inodedep = pool_get(&inodedep_pool, PR_WAITOK);
1108 inodedep->id_list.wk_type = D_INODEDEP;
1109 inodedep->id_fs = fs;
1110 inodedep->id_ino = inum;
1111 inodedep->id_state = ALLCOMPLETE;
1112 inodedep->id_nlinkdelta = 0;
1113 inodedep->id_savedino1 = NULL;
1114 inodedep->id_savedsize = -1;
1115 inodedep->id_buf = NULL;
1116 LIST_INIT(&inodedep->id_pendinghd);
1117 LIST_INIT(&inodedep->id_inowait);
1118 LIST_INIT(&inodedep->id_bufwait);
1119 TAILQ_INIT(&inodedep->id_inoupdt);
1120 TAILQ_INIT(&inodedep->id_newinoupdt);
1121 ACQUIRE_LOCK(&lk);
1122 LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
1123 sema_release(&inodedep_in_progress);
1124 *inodedeppp = inodedep;
1125 return (0);
1126 }
1127
1128 /*
1129 * Structures and routines associated with newblk caching.
1130 */
1131 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
1132 u_long newblk_hash; /* size of hash table - 1 */
1133 #define NEWBLK_HASH(fs, inum) \
1134 (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
1135 STATIC struct sema newblk_in_progress;
1136
1137 /*
1138 * Look up a newblk. Return 1 if found, 0 if not found.
1139 * If not found, allocate if DEPALLOC flag is passed.
1140 * Found or allocated entry is returned in newblkpp.
1141 */
1142 STATIC int
1143 newblk_lookup(fs, newblkno, flags, newblkpp)
1144 struct fs *fs;
1145 daddr_t newblkno;
1146 int flags;
1147 struct newblk **newblkpp;
1148 {
1149 struct newblk *newblk;
1150 struct newblk_hashhead *newblkhd;
1151
1152 newblkhd = NEWBLK_HASH(fs, newblkno);
1153 top:
1154 LIST_FOREACH(newblk, newblkhd, nb_hash)
1155 if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
1156 break;
1157 if (newblk) {
1158 *newblkpp = newblk;
1159 return (1);
1160 }
1161 if ((flags & DEPALLOC) == 0) {
1162 *newblkpp = NULL;
1163 return (0);
1164 }
1165 if (sema_get(&newblk_in_progress, 0) == 0)
1166 goto top;
1167 newblk = pool_get(&newblk_pool, PR_WAITOK);
1168 newblk->nb_state = 0;
1169 newblk->nb_fs = fs;
1170 newblk->nb_newblkno = newblkno;
1171 LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
1172 sema_release(&newblk_in_progress);
1173 *newblkpp = newblk;
1174 return (0);
1175 }
1176
1177 /*
1178 * Executed during filesystem initialization before
1179 * mounting any file systems.
1180 */
1181 void
1182 softdep_initialize()
1183 {
1184
1185 bioops.io_start = softdep_disk_io_initiation;
1186 bioops.io_complete = softdep_disk_write_complete;
1187 bioops.io_deallocate = softdep_deallocate_dependencies;
1188 bioops.io_movedeps = softdep_move_dependencies;
1189 bioops.io_countdeps = softdep_count_dependencies;
1190
1191 LIST_INIT(&mkdirlisthd);
1192 LIST_INIT(&softdep_workitem_pending);
1193 #ifdef KMEMSTATS
1194 max_softdeps = min (desiredvnodes * 8,
1195 kmemstats[M_INODEDEP].ks_limit / (2 * sizeof(struct inodedep)));
1196 #else
1197 max_softdeps = desiredvnodes * 4;
1198 #endif
1199 pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, M_WAITOK,
1200 &pagedep_hash);
1201 sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
1202 inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, M_WAITOK,
1203 &inodedep_hash);
1204 sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
1205 newblk_hashtbl = hashinit(64, M_NEWBLK, M_WAITOK, &newblk_hash);
1206 sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
1207 timeout_set(&proc_waiting_timeout, pause_timer, 0);
1208 pool_init(&pagedep_pool, sizeof(struct pagedep), 0, 0, 0,
1209 "pagedeppl", &pool_allocator_nointr);
1210 pool_init(&inodedep_pool, sizeof(struct inodedep), 0, 0, 0,
1211 "inodedeppl", &pool_allocator_nointr);
1212 pool_init(&newblk_pool, sizeof(struct newblk), 0, 0, 0,
1213 "newblkpl", &pool_allocator_nointr);
1214 pool_init(&bmsafemap_pool, sizeof(struct bmsafemap), 0, 0, 0,
1215 "bmsafemappl", &pool_allocator_nointr);
1216 pool_init(&allocdirect_pool, sizeof(struct allocdirect), 0, 0, 0,
1217 "allocdirectpl", &pool_allocator_nointr);
1218 pool_init(&indirdep_pool, sizeof(struct indirdep), 0, 0, 0,
1219 "indirdeppl", &pool_allocator_nointr);
1220 pool_init(&allocindir_pool, sizeof(struct allocindir), 0, 0, 0,
1221 "allocindirpl", &pool_allocator_nointr);
1222 pool_init(&freefrag_pool, sizeof(struct freefrag), 0, 0, 0,
1223 "freefragpl", &pool_allocator_nointr);
1224 pool_init(&freeblks_pool, sizeof(struct freeblks), 0, 0, 0,
1225 "freeblkspl", &pool_allocator_nointr);
1226 pool_init(&freefile_pool, sizeof(struct freefile), 0, 0, 0,
1227 "freefilepl", &pool_allocator_nointr);
1228 pool_init(&diradd_pool, sizeof(struct diradd), 0, 0, 0,
1229 "diraddpl", &pool_allocator_nointr);
1230 pool_init(&mkdir_pool, sizeof(struct mkdir), 0, 0, 0,
1231 "mkdirpl", &pool_allocator_nointr);
1232 pool_init(&dirrem_pool, sizeof(struct dirrem), 0, 0, 0,
1233 "dirrempl", &pool_allocator_nointr);
1234 pool_init(&newdirblk_pool, sizeof(struct newdirblk), 0, 0, 0,
1235 "newdirblkpl", &pool_allocator_nointr);
1236 }
1237
1238 /*
1239 * Called at mount time to notify the dependency code that a
1240 * filesystem wishes to use it.
1241 */
1242 int
1243 softdep_mount(devvp, mp, fs, cred)
1244 struct vnode *devvp;
1245 struct mount *mp;
1246 struct fs *fs;
1247 struct ucred *cred;
1248 {
1249 struct csum_total cstotal;
1250 struct cg *cgp;
1251 struct buf *bp;
1252 int error, cyl;
1253
1254 /*
1255 * When doing soft updates, the counters in the
1256 * superblock may have gotten out of sync, so we have
1257 * to scan the cylinder groups and recalculate them.
1258 */
1259 if ((fs->fs_flags & FS_UNCLEAN) == 0)
1260 return (0);
1261 bzero(&cstotal, sizeof cstotal);
1262 for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
1263 if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
1264 fs->fs_cgsize, cred, &bp)) != 0) {
1265 brelse(bp);
1266 return (error);
1267 }
1268 cgp = (struct cg *)bp->b_data;
1269 cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
1270 cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
1271 cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
1272 cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
1273 fs->fs_cs(fs, cyl) = cgp->cg_cs;
1274 brelse(bp);
1275 }
1276 #ifdef DEBUG
1277 if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
1278 printf("ffs_mountfs: superblock updated for soft updates\n");
1279 #endif
1280 bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
1281 return (0);
1282 }
1283
1284 /*
1285 * Protecting the freemaps (or bitmaps).
1286 *
1287 * To eliminate the need to execute fsck before mounting a file system
1288 * after a power failure, one must (conservatively) guarantee that the
1289 * on-disk copy of the bitmaps never indicates that a live inode or block is
1290 * free. So, when a block or inode is allocated, the bitmap should be
1291 * updated (on disk) before any new pointers. When a block or inode is
1292 * freed, the bitmap should not be updated until all pointers have been
1293 * reset. The latter dependency is handled by the delayed de-allocation
1294 * approach described below for block and inode de-allocation. The former
1295 * dependency is handled by calling the following procedure when a block or
1296 * inode is allocated. When an inode is allocated an "inodedep" is created
1297 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
1298 * Each "inodedep" is also inserted into the hash indexing structure so
1299 * that any additional link additions can be made dependent on the inode
1300 * allocation.
1301 *
1302 * The ufs file system maintains a number of free block counts (e.g., per
1303 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
1304 * in addition to the bitmaps. These counts are used to improve efficiency
1305 * during allocation and therefore must be consistent with the bitmaps.
1306 * There is no convenient way to guarantee post-crash consistency of these
1307 * counts with simple update ordering, for two main reasons: (1) The counts
1308 * and bitmaps for a single cylinder group block are not in the same disk
1309 * sector. If a disk write is interrupted (e.g., by power failure), one may
1310 * be written and the other not. (2) Some of the counts are located in the
1311 * superblock rather than the cylinder group block. So, we focus our soft
1312 * updates implementation on protecting the bitmaps. When mounting a
1313 * filesystem, we recompute the auxiliary counts from the bitmaps.
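 *
 * For example, once softdep_setup_inomapdep() below has run for a newly
 * allocated inode, a directory entry naming that inode will not be
 * committed to disk until the cylinder group buffer holding the inode
 * bitmap has been written.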
1314 */
1315
1316 /*
1317 * Called just after updating the cylinder group block to allocate an inode.
1318 */
1319 void
1320 softdep_setup_inomapdep(bp, ip, newinum)
1321 struct buf *bp; /* buffer for cylgroup block with inode map */
1322 struct inode *ip; /* inode related to allocation */
1323 ino_t newinum; /* new inode number being allocated */
1324 {
1325 struct inodedep *inodedep;
1326 struct bmsafemap *bmsafemap;
1327
1328 /*
1329 * Create a dependency for the newly allocated inode.
1330 * Panic if it already exists as something is seriously wrong.
1331 * Otherwise add it to the dependency list for the buffer holding
1332 * the cylinder group map from which it was allocated.
1333 */
1334 ACQUIRE_LOCK(&lk);
1335 if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC | NODELAY, &inodedep)
1336 != 0) {
1337 FREE_LOCK(&lk);
1338 panic("softdep_setup_inomapdep: found inode");
1339 }
1340 inodedep->id_buf = bp;
1341 inodedep->id_state &= ~DEPCOMPLETE;
1342 bmsafemap = bmsafemap_lookup(bp);
1343 LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
1344 FREE_LOCK(&lk);
1345 }
1346
1347 /*
1348 * Called just after updating the cylinder group block to
1349 * allocate block or fragment.
1350 */
1351 void
1352 softdep_setup_blkmapdep(bp, fs, newblkno)
1353 struct buf *bp; /* buffer for cylgroup block with block map */
1354 struct fs *fs; /* filesystem doing allocation */
1355 daddr_t newblkno; /* number of newly allocated block */
1356 {
1357 struct newblk *newblk;
1358 struct bmsafemap *bmsafemap;
1359
1360 /*
1361 * Create a dependency for the newly allocated block.
1362 * Add it to the dependency list for the buffer holding
1363 * the cylinder group map from which it was allocated.
1364 */
1365 if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
1366 panic("softdep_setup_blkmapdep: found block");
1367 ACQUIRE_LOCK(&lk);
1368 newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
1369 LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
1370 FREE_LOCK(&lk);
1371 }
1372
1373 /*
1374 * Find the bmsafemap associated with a cylinder group buffer.
1375 * If none exists, create one. The buffer must be locked when
1376 * this routine is called and this routine must be called with
1377 * splbio interrupts blocked.
1378 */
1379 STATIC struct bmsafemap *
1380 bmsafemap_lookup(bp)
1381 struct buf *bp;
1382 {
1383 struct bmsafemap *bmsafemap;
1384 struct worklist *wk;
1385
1386 splassert(IPL_BIO);
1387
1388 #ifdef DEBUG
1389 if (lk.lkt_held == -1)
1390 panic("bmsafemap_lookup: lock not held");
1391 #endif
1392 LIST_FOREACH(wk, &bp->b_dep, wk_list)
1393 if (wk->wk_type == D_BMSAFEMAP)
1394 return (WK_BMSAFEMAP(wk));
1395 FREE_LOCK(&lk);
1396 bmsafemap = pool_get(&bmsafemap_pool, PR_WAITOK);
1397 bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
1398 bmsafemap->sm_list.wk_state = 0;
1399 bmsafemap->sm_buf = bp;
1400 LIST_INIT(&bmsafemap->sm_allocdirecthd);
1401 LIST_INIT(&bmsafemap->sm_allocindirhd);
1402 LIST_INIT(&bmsafemap->sm_inodedephd);
1403 LIST_INIT(&bmsafemap->sm_newblkhd);
1404 ACQUIRE_LOCK(&lk);
1405 WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
1406 return (bmsafemap);
1407 }
1408
1409 /*
1410 * Direct block allocation dependencies.
1411 *
1412 * When a new block is allocated, the corresponding disk locations must be
1413 * initialized (with zeros or new data) before the on-disk inode points to
1414 * them. Also, the freemap from which the block was allocated must be
1415 * updated (on disk) before the inode's pointer. These two dependencies are
1416 * independent of each other and are needed for all file blocks and indirect
1417 * blocks that are pointed to directly by the inode. Just before the
1418 * "in-core" version of the inode is updated with a newly allocated block
1419 * number, a procedure (below) is called to setup allocation dependency
1420 * structures. These structures are removed when the corresponding
1421 * dependencies are satisfied or when the block allocation becomes obsolete
1422 * (i.e., the file is deleted, the block is de-allocated, or the block is a
1423 * fragment that gets upgraded). All of these cases are handled in
1424 * procedures described later.
1425 *
1426 * When a file extension causes a fragment to be upgraded, either to a larger
1427 * fragment or to a full block, the on-disk location may change (if the
1428 * previous fragment could not simply be extended). In this case, the old
1429 * fragment must be de-allocated, but not until after the inode's pointer has
1430 * been updated. In most cases, this is handled by later procedures, which
1431 * will construct a "freefrag" structure to be added to the workitem queue
1432 * when the inode update is complete (or obsolete). The main exception to
1433 * this is when an allocation occurs while a pending allocation dependency
1434 * (for the same block pointer) remains. This case is handled in the main
1435 * allocation dependency setup procedure by immediately freeing the
1436 * unreferenced fragments.
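 *
 * For example, when a write extends a file so that a fragment must be
 * reallocated as a full block at a new disk address, the old fragment
 * is described by a "freefrag" workitem that is not processed until the
 * inode pointing at the new block has been written.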
1437 */
1438 void
1439 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
1440 struct inode *ip; /* inode to which block is being added */
1441 daddr64_t lbn; /* block pointer within inode */
1442 daddr_t newblkno; /* disk block number being added */
1443 daddr_t oldblkno; /* previous block number, 0 unless frag */
1444 long newsize; /* size of new block */
1445 long oldsize; /* size of former block */
1446 struct buf *bp; /* bp for allocated block */
1447 {
1448 struct allocdirect *adp, *oldadp;
1449 struct allocdirectlst *adphead;
1450 struct bmsafemap *bmsafemap;
1451 struct inodedep *inodedep;
1452 struct pagedep *pagedep;
1453 struct newblk *newblk;
1454
1455 adp = pool_get(&allocdirect_pool, PR_WAITOK);
1456 bzero(adp, sizeof(struct allocdirect));
1457 adp->ad_list.wk_type = D_ALLOCDIRECT;
1458 adp->ad_lbn = lbn;
1459 adp->ad_newblkno = newblkno;
1460 adp->ad_oldblkno = oldblkno;
1461 adp->ad_newsize = newsize;
1462 adp->ad_oldsize = oldsize;
1463 adp->ad_state = ATTACHED;
1464 LIST_INIT(&adp->ad_newdirblk);
1465 if (newblkno == oldblkno)
1466 adp->ad_freefrag = NULL;
1467 else
1468 adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
1469
1470 if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
1471 panic("softdep_setup_allocdirect: lost block");
1472
1473 ACQUIRE_LOCK(&lk);
1474 inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
1475 adp->ad_inodedep = inodedep;
1476
1477 if (newblk->nb_state == DEPCOMPLETE) {
1478 adp->ad_state |= DEPCOMPLETE;
1479 adp->ad_buf = NULL;
1480 } else {
1481 bmsafemap = newblk->nb_bmsafemap;
1482 adp->ad_buf = bmsafemap->sm_buf;
1483 LIST_REMOVE(newblk, nb_deps);
1484 LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
1485 }
1486 LIST_REMOVE(newblk, nb_hash);
1487 pool_put(&newblk_pool, newblk);
1488
1489 if (bp == NULL) {
1490 /*
1491 * XXXUBC - Yes, I know how to fix this, but not right now.
1492 */
1493 panic("softdep_setup_allocdirect: Bonk art in the head");
1494 }
1495 WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
1496 if (lbn >= NDADDR) {
1497 /* allocating an indirect block */
1498 if (oldblkno != 0) {
1499 FREE_LOCK(&lk);
1500 panic("softdep_setup_allocdirect: non-zero indir");
1501 }
1502 } else {
1503 /*
1504 * Allocating a direct block.
1505 *
1506 * If we are allocating a directory block, then we must
1507 * allocate an associated pagedep to track additions and
1508 * deletions.
1509 */
1510 if ((DIP(ip, mode) & IFMT) == IFDIR &&
1511 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1512 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
1513 }
1514 /*
1515 * The list of allocdirects must be kept in sorted and ascending
1516 * order so that the rollback routines can quickly determine the
1517 * first uncommitted block (the size of the file stored on disk
1518 * ends at the end of the lowest committed fragment, or if there
1519 * are no fragments, at the end of the highest committed block).
1520 * Since files generally grow, the typical case is that the new
1521 * block is to be added at the end of the list. We speed this
1522 * special case by checking against the last allocdirect in the
1523 * list before laboriously traversing the list looking for the
1524 * insertion point.
1525 */
1526 adphead = &inodedep->id_newinoupdt;
1527 oldadp = TAILQ_LAST(adphead, allocdirectlst);
1528 if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
1529 /* insert at end of list */
1530 TAILQ_INSERT_TAIL(adphead, adp, ad_next);
1531 if (oldadp != NULL && oldadp->ad_lbn == lbn)
1532 allocdirect_merge(adphead, adp, oldadp);
1533 FREE_LOCK(&lk);
1534 return;
1535 }
1536 TAILQ_FOREACH(oldadp, adphead, ad_next) {
1537 if (oldadp->ad_lbn >= lbn)
1538 break;
1539 }
1540 if (oldadp == NULL) {
1541 FREE_LOCK(&lk);
1542 panic("softdep_setup_allocdirect: lost entry");
1543 }
1544 /* insert in middle of list */
1545 TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
1546 if (oldadp->ad_lbn == lbn)
1547 allocdirect_merge(adphead, adp, oldadp);
1548 FREE_LOCK(&lk);
1549 }
1550
1551 /*
1552 * Replace an old allocdirect dependency with a newer one.
1553 * This routine must be called with splbio interrupts blocked.
1554 */
1555 STATIC void
1556 allocdirect_merge(adphead, newadp, oldadp)
1557 struct allocdirectlst *adphead; /* head of list holding allocdirects */
1558 struct allocdirect *newadp; /* allocdirect being added */
1559 struct allocdirect *oldadp; /* existing allocdirect being checked */
1560 {
1561 struct worklist *wk;
1562 struct freefrag *freefrag;
1563 struct newdirblk *newdirblk;
1564
1565 splassert(IPL_BIO);
1566
1567 #ifdef DEBUG
1568 if (lk.lkt_held == -1)
1569 panic("allocdirect_merge: lock not held");
1570 #endif
1571 if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
1572 newadp->ad_oldsize != oldadp->ad_newsize ||
1573 newadp->ad_lbn >= NDADDR) {
1574 FREE_LOCK(&lk);
1575 panic("allocdirect_merge: old %d != new %d || lbn %ld >= %d",
1576 newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn,
1577 NDADDR);
1578 }
1579 newadp->ad_oldblkno = oldadp->ad_oldblkno;
1580 newadp->ad_oldsize = oldadp->ad_oldsize;
1581 /*
1582 * If the old dependency had a fragment to free or had never
1583 * previously had a block allocated, then the new dependency
1584 * can immediately post its freefrag and adopt the old freefrag.
1585 * This action is done by swapping the freefrag dependencies.
1586 * The new dependency gains the old one's freefrag, and the
1587 * old one gets the new one and then immediately puts it on
1588 * the worklist when it is freed by free_allocdirect. It is
1589 * not possible to do this swap when the old dependency had a
1590 * non-zero size but no previous fragment to free. This condition
1591 * arises when the new block is an extension of the old block.
1592 * Here, the first part of the fragment allocated to the new
1593 * dependency is part of the block currently claimed on disk by
1594 * the old dependency, so cannot legitimately be freed until the
1595 * conditions for the new dependency are fulfilled.
1596 */
1597 if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
1598 freefrag = newadp->ad_freefrag;
1599 newadp->ad_freefrag = oldadp->ad_freefrag;
1600 oldadp->ad_freefrag = freefrag;
1601 }
1602 /*
1603 * If we are tracking a new directory-block allocation,
1604 * move it from the old allocdirect to the new allocdirect.
1605 */
1606 if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
1607 newdirblk = WK_NEWDIRBLK(wk);
1608 WORKLIST_REMOVE(&newdirblk->db_list);
1609 if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL)
1610 panic("allocdirect_merge: extra newdirblk");
1611 WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
1612 }
1613 free_allocdirect(adphead, oldadp, 0);
1614 }
1615
1616 /*
1617 * Allocate a new freefrag structure if needed.
1618 */
1619 STATIC struct freefrag *
1620 newfreefrag(ip, blkno, size)
1621 struct inode *ip;
1622 daddr_t blkno;
1623 long size;
1624 {
1625 struct freefrag *freefrag;
1626 struct fs *fs;
1627
1628 if (blkno == 0)
1629 return (NULL);
1630 fs = ip->i_fs;
1631 if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
1632 panic("newfreefrag: frag size");
1633 freefrag = pool_get(&freefrag_pool, PR_WAITOK);
1634 freefrag->ff_list.wk_type = D_FREEFRAG;
1635 freefrag->ff_state = DIP(ip, uid) & ~ONWORKLIST; /* used below */
1636 freefrag->ff_inum = ip->i_number;
1637 freefrag->ff_mnt = ITOV(ip)->v_mount;
1638 freefrag->ff_devvp = ip->i_devvp;
1639 freefrag->ff_blkno = blkno;
1640 freefrag->ff_fragsize = size;
1641 return (freefrag);
1642 }
1643
1644 /*
1645 * This workitem de-allocates fragments that were replaced during
1646 * file block allocation.
1647 */
1648 STATIC void
1649 handle_workitem_freefrag(freefrag)
1650 struct freefrag *freefrag;
1651 {
1652 struct inode tip;
1653 struct ufs1_dinode dtip1;
1654
1655 tip.i_vnode = NULL;
1656 tip.i_din1 = &dtip1;
1657 tip.i_fs = VFSTOUFS(freefrag->ff_mnt)->um_fs;
1658 tip.i_ump = VFSTOUFS(freefrag->ff_mnt);
1659 tip.i_dev = freefrag->ff_devvp->v_rdev;
1660 tip.i_number = freefrag->ff_inum;
1661 tip.i_ffs1_uid = freefrag->ff_state & ~ONWORKLIST; /* set above */
1662 ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
1663 pool_put(&freefrag_pool, freefrag);
1664 }
1665
1666 /*
1667 * Indirect block allocation dependencies.
1668 *
1669 * The same dependencies that exist for a direct block also exist when
1670 * a new block is allocated and pointed to by an entry in a block of
1671 * indirect pointers. The undo/redo states described above are also
1672 * used here. Because an indirect block contains many pointers that
1673 * may have dependencies, a second copy of the entire in-memory indirect
1674 * block is kept. The buffer cache copy is always completely up-to-date.
1675 * The second copy, which is used only as a source for disk writes,
1676 * contains only the safe pointers (i.e., those that have no remaining
1677 * update dependencies). The second copy is freed when all pointers
1678 * are safe. The cache is not allowed to replace indirect blocks with
1679 * pending update dependencies. If a buffer containing an indirect
1680 * block with dependencies is written, these routines will mark it
1681 * dirty again. It can only be successfully written once all the
1682 * dependencies are removed. The ffs_fsync routine in conjunction with
1683 * softdep_sync_metadata work together to get all the dependencies
1684 * removed so that a file can be successfully written to disk. Three
1685 * procedures are used when setting up indirect block pointer
1686 * dependencies. The division is necessary because of the organization
1687 * of the "balloc" routine and because of the distinction between file
1688 * pages and file metadata blocks.
1689 */
1690
1691 /*
1692 * Allocate a new allocindir structure.
1693 */
1694 STATIC struct allocindir *
1695 newallocindir(ip, ptrno, newblkno, oldblkno)
1696 struct inode *ip; /* inode for file being extended */
1697 int ptrno; /* offset of pointer in indirect block */
1698 daddr_t newblkno; /* disk block number being added */
1699 daddr_t oldblkno; /* previous block number, 0 if none */
1700 {
1701 struct allocindir *aip;
1702
1703 aip = pool_get(&allocindir_pool, PR_WAITOK);
1704 bzero(aip,sizeof(struct allocindir));
1705 aip->ai_list.wk_type = D_ALLOCINDIR;
1706 aip->ai_state = ATTACHED;
1707 aip->ai_offset = ptrno;
1708 aip->ai_newblkno = newblkno;
1709 aip->ai_oldblkno = oldblkno;
1710 aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
1711 return (aip);
1712 }
1713
1714 /*
1715 * Called just before setting an indirect block pointer
1716 * to a newly allocated file page.
1717 */
1718 void
1719 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
1720 struct inode *ip; /* inode for file being extended */
1721 daddr64_t lbn; /* allocated block number within file */
1722 struct buf *bp; /* buffer with indirect blk referencing page */
1723 int ptrno; /* offset of pointer in indirect block */
1724 daddr_t newblkno; /* disk block number being added */
1725 daddr_t oldblkno; /* previous block number, 0 if none */
1726 struct buf *nbp; /* buffer holding allocated page */
1727 {
1728 struct allocindir *aip;
1729 struct pagedep *pagedep;
1730
1731 aip = newallocindir(ip, ptrno, newblkno, oldblkno);
1732 ACQUIRE_LOCK(&lk);
1733 /*
1734 * If we are allocating a directory page, then we must
1735 * allocate an associated pagedep to track additions and
1736 * deletions.
1737 */
1738 if ((DIP(ip, mode) & IFMT) == IFDIR &&
1739 pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
1740 WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
1741 if (nbp == NULL) {
1742 /*
1743 * XXXUBC - Yes, I know how to fix this, but not right now.
1744 */
1745 panic("softdep_setup_allocindir_page: Bonk art in the head");
1746 }
1747 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1748 FREE_LOCK(&lk);
1749 setup_allocindir_phase2(bp, ip, aip);
1750 }
1751
1752 /*
1753 * Called just before setting an indirect block pointer to a
1754 * newly allocated indirect block.
1755 */
1756 void
1757 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
1758 struct buf *nbp; /* newly allocated indirect block */
1759 struct inode *ip; /* inode for file being extended */
1760 struct buf *bp; /* indirect block referencing allocated block */
1761 int ptrno; /* offset of pointer in indirect block */
1762 daddr_t newblkno; /* disk block number being added */
1763 {
1764 struct allocindir *aip;
1765
1766 aip = newallocindir(ip, ptrno, newblkno, 0);
1767 ACQUIRE_LOCK(&lk);
1768 WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
1769 FREE_LOCK(&lk);
1770 setup_allocindir_phase2(bp, ip, aip);
1771 }
1772
1773 /*
1774 * Called to finish the allocation of the "aip" allocated
1775 * by one of the two routines above.
1776 */
1777 STATIC void
1778 setup_allocindir_phase2(bp, ip, aip)
1779 struct buf *bp; /* in-memory copy of the indirect block */
1780 struct inode *ip; /* inode for file being extended */
1781 struct allocindir *aip; /* allocindir allocated by the above routines */
1782 {
1783 struct worklist *wk;
1784 struct indirdep *indirdep, *newindirdep;
1785 struct bmsafemap *bmsafemap;
1786 struct allocindir *oldaip;
1787 struct freefrag *freefrag;
1788 struct newblk *newblk;
1789
1790 if (bp->b_lblkno >= 0)
1791 panic("setup_allocindir_phase2: not indir blk");
1792 for (indirdep = NULL, newindirdep = NULL; ; ) {
1793 ACQUIRE_LOCK(&lk);
1794 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
1795 if (wk->wk_type != D_INDIRDEP)
1796 continue;
1797 indirdep = WK_INDIRDEP(wk);
1798 break;
1799 }
1800 if (indirdep == NULL && newindirdep) {
1801 indirdep = newindirdep;
1802 WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
1803 newindirdep = NULL;
1804 }
1805 FREE_LOCK(&lk);
1806 if (indirdep) {
1807 if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
1808 &newblk) == 0)
1809 panic("setup_allocindir: lost block");
1810 ACQUIRE_LOCK(&lk);
1811 if (newblk->nb_state == DEPCOMPLETE) {
1812 aip->ai_state |= DEPCOMPLETE;
1813 aip->ai_buf = NULL;
1814 } else {
1815 bmsafemap = newblk->nb_bmsafemap;
1816 aip->ai_buf = bmsafemap->sm_buf;
1817 LIST_REMOVE(newblk, nb_deps);
1818 LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
1819 aip, ai_deps);
1820 }
1821 LIST_REMOVE(newblk, nb_hash);
1822 pool_put(&newblk_pool, newblk);
1823 aip->ai_indirdep = indirdep;
1824 /*
1825 * Check to see if there is an existing dependency
1826 * for this block. If there is, merge the old
1827 * dependency into the new one.
1828 */
1829 if (aip->ai_oldblkno == 0)
1830 oldaip = NULL;
1831 else
1833 LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
1834 if (oldaip->ai_offset == aip->ai_offset)
1835 break;
1836 freefrag = NULL;
1837 if (oldaip != NULL) {
1838 if (oldaip->ai_newblkno != aip->ai_oldblkno) {
1839 FREE_LOCK(&lk);
1840 panic("setup_allocindir_phase2: blkno");
1841 }
1842 aip->ai_oldblkno = oldaip->ai_oldblkno;
1843 freefrag = aip->ai_freefrag;
1844 aip->ai_freefrag = oldaip->ai_freefrag;
1845 oldaip->ai_freefrag = NULL;
1846 free_allocindir(oldaip, NULL);
1847 }
1848 LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
1849 if (ip->i_ump->um_fstype == UM_UFS1)
1850 ((int32_t *)indirdep->ir_savebp->b_data)
1851 [aip->ai_offset] = aip->ai_oldblkno;
1852 else
1853 ((int64_t *)indirdep->ir_savebp->b_data)
1854 [aip->ai_offset] = aip->ai_oldblkno;
1855 FREE_LOCK(&lk);
1856 if (freefrag != NULL)
1857 handle_workitem_freefrag(freefrag);
1858 }
1859 if (newindirdep) {
1860 if (indirdep->ir_savebp != NULL)
1861 brelse(newindirdep->ir_savebp);
1862 WORKITEM_FREE(newindirdep, D_INDIRDEP);
1863 }
1864 if (indirdep)
1865 break;
1866 newindirdep = pool_get(&indirdep_pool, PR_WAITOK);
1867 newindirdep->ir_list.wk_type = D_INDIRDEP;
1868 newindirdep->ir_state = ATTACHED;
1869 if (ip->i_ump->um_fstype == UM_UFS1)
1870 newindirdep->ir_state |= UFS1FMT;
1871 LIST_INIT(&newindirdep->ir_deplisthd);
1872 LIST_INIT(&newindirdep->ir_donehd);
1873 if (bp->b_blkno == bp->b_lblkno) {
1874 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
1875 NULL);
1876 }
1877 newindirdep->ir_savebp =
1878 getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
1879 #if 0
1880 BUF_KERNPROC(newindirdep->ir_savebp);
1881 #endif
1882 bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
1883 }
1884 }
1885
1886 /*
1887 * Block de-allocation dependencies.
1888 *
1889 * When blocks are de-allocated, the on-disk pointers must be nullified before
1890 * the blocks are made available for use by other files. (The true
1891 * requirement is that old pointers must be nullified before new on-disk
1892 * pointers are set. We chose this slightly more stringent requirement to
1893 * reduce complexity.) Our implementation handles this dependency by updating
1894 * the inode (or indirect block) appropriately but delaying the actual block
1895 * de-allocation (i.e., freemap and free space count manipulation) until
1896 * after the updated versions reach stable storage. After the disk is
1897 * updated, the blocks can be safely de-allocated whenever it is convenient.
1898 * This implementation handles only the common case of reducing a file's
1899 * length to zero. Other cases are handled by the conventional synchronous
1900 * write approach.
1901 *
1902 * The ffs implementation with which we worked double-checks
1903 * the state of the block pointers and file size as it reduces
1904 * a file's length. Some of this code is replicated here in our
1905 * soft updates implementation. The freeblks->fb_chkcnt field is
1906 * used to transfer a part of this information to the procedure
1907 * that eventually de-allocates the blocks.
1908 *
1909 * This routine should be called from the routine that shortens
1910 * a file's length, before the inode's size or block pointers
1911 * are modified. It will save the block pointer information for
1912 * later release and zero the inode so that the calling routine
1913 * can release it.
1914 */
1915 void
1916 softdep_setup_freeblocks(ip, length)
1917 struct inode *ip; /* The inode whose length is to be reduced */
1918 off_t length; /* The new length for the file */
1919 {
1920 struct freeblks *freeblks;
1921 struct inodedep *inodedep;
1922 struct allocdirect *adp;
1923 struct vnode *vp;
1924 struct buf *bp;
1925 struct fs *fs;
1926 int i, delay, error;
1927
1928 fs = ip->i_fs;
1929 if (length != 0)
1930 panic("softdep_setup_freeblocks: non-zero length");
1931 freeblks = pool_get(&freeblks_pool, PR_WAITOK);
1932 bzero(freeblks, sizeof(struct freeblks));
1933 freeblks->fb_list.wk_type = D_FREEBLKS;
1934 freeblks->fb_state = ATTACHED;
1935 freeblks->fb_uid = DIP(ip, uid);
1936 freeblks->fb_previousinum = ip->i_number;
1937 freeblks->fb_devvp = ip->i_devvp;
1938 freeblks->fb_mnt = ITOV(ip)->v_mount;
1939 freeblks->fb_oldsize = DIP(ip, size);
1940 freeblks->fb_newsize = length;
1941 freeblks->fb_chkcnt = DIP(ip, blocks);
1942
1943 for (i = 0; i < NDADDR; i++) {
1944 freeblks->fb_dblks[i] = DIP(ip, db[i]);
1945 DIP_ASSIGN(ip, db[i], 0);
1946 }
1947
1948 for (i = 0; i < NIADDR; i++) {
1949 freeblks->fb_iblks[i] = DIP(ip, ib[i]);
1950 DIP_ASSIGN(ip, ib[i], 0);
1951 }
1952
1953 DIP_ASSIGN(ip, blocks, 0);
1954 DIP_ASSIGN(ip, size, 0);
1955
1956 /*
1957 * Push the zero'ed inode to its disk buffer so that we are free
1958 * to delete its dependencies below. Once the dependencies are gone
1959 * the buffer can be safely released.
1960 */
1961 if ((error = bread(ip->i_devvp,
1962 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
1963 (int)fs->fs_bsize, NOCRED, &bp)) != 0)
1964 softdep_error("softdep_setup_freeblocks", error);
1965
1966 if (ip->i_ump->um_fstype == UM_UFS1)
1967 *((struct ufs1_dinode *) bp->b_data +
1968 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
1969 else
1970 *((struct ufs2_dinode *) bp->b_data +
1971 ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
1972
1973 /*
1974 * Find and eliminate any inode dependencies.
1975 */
1976 ACQUIRE_LOCK(&lk);
1977 (void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
1978 if ((inodedep->id_state & IOSTARTED) != 0) {
1979 FREE_LOCK(&lk);
1980 panic("softdep_setup_freeblocks: inode busy");
1981 }
1982 /*
1983 * Add the freeblks structure to the list of operations that
1984 * must await the zero'ed inode being written to disk. If we
1985 * still have a bitmap dependency (delay == 0), then the inode
1986 * has never been written to disk, so we can process the
1987 * freeblks below once we have deleted the dependencies.
1988 */
1989 delay = (inodedep->id_state & DEPCOMPLETE);
1990 if (delay)
1991 WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
1992 /*
1993 * Because the file length has been truncated to zero, any
1994 * pending block allocation dependency structures associated
1995 * with this inode are obsolete and can simply be de-allocated.
1996 * We must first merge the two dependency lists to get rid of
1997 * any duplicate freefrag structures, then purge the merged list.
1998 * If we still have a bitmap dependency, then the inode has never
1999 * been written to disk, so we can free any fragments without delay.
2000 */
2001 merge_inode_lists(inodedep);
2002 while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
2003 free_allocdirect(&inodedep->id_inoupdt, adp, delay);
2004 FREE_LOCK(&lk);
2005 bdwrite(bp);
2006 /*
2007 * We must wait for any I/O in progress to finish so that
2008 * all potential buffers on the dirty list will be visible.
2009 * Once they are all there, walk the list and get rid of
2010 * any dependencies.
2011 */
2012 vp = ITOV(ip);
2013 ACQUIRE_LOCK(&lk);
2014 drain_output(vp, 1);
2015 while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
2016 if (!getdirtybuf(bp, MNT_WAIT))
2017 break;
2018 (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
2019 deallocate_dependencies(bp, inodedep);
2020 bp->b_flags |= B_INVAL | B_NOCACHE;
2021 FREE_LOCK(&lk);
2022 brelse(bp);
2023 ACQUIRE_LOCK(&lk);
2024 }
2025 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
2026 (void) free_inodedep(inodedep);
2027
2028 if (delay) {
2029 freeblks->fb_state |= DEPCOMPLETE;
2030 /*
2031 * If the inode with zeroed block pointers is now on disk we
2032 * can start freeing blocks. Add freeblks to the worklist
2033 * instead of calling handle_workitem_freeblocks() directly as
2034 * it is more likely that additional IO is needed to complete
2035 * the request than in the !delay case.
2036 */
2037 if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
2038 add_to_worklist(&freeblks->fb_list);
2039 }
2040
2041 FREE_LOCK(&lk);
2042 /*
2043 * If the inode has never been written to disk (delay == 0),
2044 * then we can process the freeblks now that we have deleted
2045 * the dependencies.
2046 */
2047 if (!delay)
2048 handle_workitem_freeblocks(freeblks);
2049 }
2050
2051 /*
2052 * Reclaim any dependency structures from a buffer that is about to
2053 * be reallocated to a new vnode. The buffer must be locked, thus,
2054 * no I/O completion operations can occur while we are manipulating
2055 * its associated dependencies. The mutex is held so that other I/O's
2056 * associated with related dependencies do not occur.
2057 */
2058 STATIC void
2059 deallocate_dependencies(bp, inodedep)
2060 struct buf *bp;
2061 struct inodedep *inodedep;
2062 {
2063 struct worklist *wk;
2064 struct indirdep *indirdep;
2065 struct allocindir *aip;
2066 struct pagedep *pagedep;
2067 struct dirrem *dirrem;
2068 struct diradd *dap;
2069 int i;
2070
2071 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2072 switch (wk->wk_type) {
2073
2074 case D_INDIRDEP:
2075 indirdep = WK_INDIRDEP(wk);
2076 /*
2077 * None of the indirect pointers will ever be visible,
2078 * so they can simply be tossed. GOINGAWAY ensures
2079 * that allocated pointers will be saved in the buffer
2080 * cache until they are freed. Note that they will
2081 * only be able to be found by their physical address
2082 * since the inode mapping the logical address will
2083 * be gone. The save buffer used for the safe copy
2084 * was allocated in setup_allocindir_phase2 using
2085 * the physical address so it could be used for this
2086 * purpose. Hence we swap the safe copy with the real
2087 * copy, allowing the safe copy to be freed and holding
2088 * on to the real copy for later use in indir_trunc.
2089 */
2090 if (indirdep->ir_state & GOINGAWAY) {
2091 FREE_LOCK(&lk);
2092 panic("deallocate_dependencies: already gone");
2093 }
2094 indirdep->ir_state |= GOINGAWAY;
2095 while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
2096 free_allocindir(aip, inodedep);
2097 if (bp->b_lblkno >= 0 ||
2098 bp->b_blkno != indirdep->ir_savebp->b_lblkno) {
2099 FREE_LOCK(&lk);
2100 panic("deallocate_dependencies: not indir");
2101 }
2102 bcopy(bp->b_data, indirdep->ir_savebp->b_data,
2103 bp->b_bcount);
2104 WORKLIST_REMOVE(wk);
2105 WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
2106 continue;
2107
2108 case D_PAGEDEP:
2109 pagedep = WK_PAGEDEP(wk);
2110 /*
2111 * None of the directory additions will ever be
2112 * visible, so they can simply be tossed.
2113 */
2114 for (i = 0; i < DAHASHSZ; i++)
2115 while ((dap =
2116 LIST_FIRST(&pagedep->pd_diraddhd[i])))
2117 free_diradd(dap);
2118 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
2119 free_diradd(dap);
2120 /*
2121 * Copy any directory remove dependencies to the list
2122 * to be processed after the zero'ed inode is written.
2123 * If the inode has already been written, then they
2124 * can be dumped directly onto the work list.
2125 */
2126 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd))) {
2127 LIST_REMOVE(dirrem, dm_next);
2128 dirrem->dm_dirinum = pagedep->pd_ino;
2129 if (inodedep == NULL ||
2130 (inodedep->id_state & ALLCOMPLETE) ==
2131 ALLCOMPLETE)
2132 add_to_worklist(&dirrem->dm_list);
2133 else
2134 WORKLIST_INSERT(&inodedep->id_bufwait,
2135 &dirrem->dm_list);
2136 }
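/*
 * If a newdirblk is tracking this directory block, find it on the
 * inodedep's bufwait list and release it now.
 */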
2137 if ((pagedep->pd_state & NEWBLOCK) != 0) {
2138 LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
2139 if (wk->wk_type == D_NEWDIRBLK &&
2140 WK_NEWDIRBLK(wk)->db_pagedep ==
2141 pagedep)
2142 break;
2143 if (wk != NULL) {
2144 WORKLIST_REMOVE(wk);
2145 free_newdirblk(WK_NEWDIRBLK(wk));
2146 } else {
2147 FREE_LOCK(&lk);
2148 panic("deallocate_dependencies: "
2149 "lost pagedep");
2150 }
2151 }
2152 WORKLIST_REMOVE(&pagedep->pd_list);
2153 LIST_REMOVE(pagedep, pd_hash);
2154 WORKITEM_FREE(pagedep, D_PAGEDEP);
2155 continue;
2156
2157 case D_ALLOCINDIR:
2158 free_allocindir(WK_ALLOCINDIR(wk), inodedep);
2159 continue;
2160
2161 case D_ALLOCDIRECT:
2162 case D_INODEDEP:
2163 FREE_LOCK(&lk);
2164 panic("deallocate_dependencies: Unexpected type %s",
2165 TYPENAME(wk->wk_type));
2166 /* NOTREACHED */
2167
2168 default:
2169 FREE_LOCK(&lk);
2170 panic("deallocate_dependencies: Unknown type %s",
2171 TYPENAME(wk->wk_type));
2172 /* NOTREACHED */
2173 }
2174 }
2175 }
2176
2177 /*
2178 * Free an allocdirect. Generate a new freefrag work request if appropriate.
2179 * This routine must be called with splbio interrupts blocked.
2180 */
2181 STATIC void
2182 free_allocdirect(adphead, adp, delay)
2183 struct allocdirectlst *adphead;
2184 struct allocdirect *adp;
2185 int delay;
2186 {
2187 struct newdirblk *newdirblk;
2188 struct worklist *wk;
2189
2190 splassert(IPL_BIO);
2191
2192 #ifdef DEBUG
2193 if (lk.lkt_held == -1)
2194 panic("free_allocdirect: lock not held");
2195 #endif
2196 if ((adp->ad_state & DEPCOMPLETE) == 0)
2197 LIST_REMOVE(adp, ad_deps);
2198 TAILQ_REMOVE(adphead, adp, ad_next);
2199 if ((adp->ad_state & COMPLETE) == 0)
2200 WORKLIST_REMOVE(&adp->ad_list);
2201 if (adp->ad_freefrag != NULL) {
2202 if (delay)
2203 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2204 &adp->ad_freefrag->ff_list);
2205 else
2206 add_to_worklist(&adp->ad_freefrag->ff_list);
2207 }
2208 if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
2209 newdirblk = WK_NEWDIRBLK(wk);
2210 WORKLIST_REMOVE(&newdirblk->db_list);
2211 if (LIST_FIRST(&adp->ad_newdirblk) != NULL)
2212 panic("free_allocdirect: extra newdirblk");
2213 if (delay)
2214 WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
2215 &newdirblk->db_list);
2216 else
2217 free_newdirblk(newdirblk);
2218 }
2219 WORKITEM_FREE(adp, D_ALLOCDIRECT);
2220 }
2221
2222 /*
2223 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
2224 * This routine must be called with splbio interrupts blocked.
2225 */
2226 void
2227 free_newdirblk(newdirblk)
2228 struct newdirblk *newdirblk;
2229 {
2230 struct pagedep *pagedep;
2231 struct diradd *dap;
2232 int i;
2233
2234 splassert(IPL_BIO);
2235
2236 #ifdef DEBUG
2237 if (lk.lkt_held == -1)
2238 panic("free_newdirblk: lock not held");
2239 #endif
2240 /*
2241 * If the pagedep is still linked onto the directory buffer
2242 * dependency chain, then some of the entries on the
2243 * pd_pendinghd list may not be committed to disk yet. In
2244 * this case, we will simply clear the NEWBLOCK flag and
2245 * let the pd_pendinghd list be processed when the pagedep
2246 * is next written. If the pagedep is no longer on the buffer
2247 * dependency chain, then all the entries on the pd_pendinghd
2248 * list are committed to disk and we can free them here.
2249 */
2250 pagedep = newdirblk->db_pagedep;
2251 pagedep->pd_state &= ~NEWBLOCK;
2252 if ((pagedep->pd_state & ONWORKLIST) == 0)
2253 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
2254 free_diradd(dap);
2255 /*
2256 * If no dependencies remain, the pagedep will be freed.
2257 */
2258 for (i = 0; i < DAHASHSZ; i++)
2259 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
2260 break;
2261 if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
2262 LIST_REMOVE(pagedep, pd_hash);
2263 WORKITEM_FREE(pagedep, D_PAGEDEP);
2264 }
2265 WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2266 }
2267
2268 /*
2269 * Prepare an inode to be freed. The actual free operation is not
2270 * done until the zero'ed inode has been written to disk.
2271 */
2272 void
2273 softdep_freefile(pvp, ino, mode)
2274 struct vnode *pvp;
2275 ino_t ino;
2276 mode_t mode;
2277 {
2278 struct inode *ip = VTOI(pvp);
2279 struct inodedep *inodedep;
2280 struct freefile *freefile;
2281
2282 /*
2283 * This sets up the inode de-allocation dependency.
2284 */
2285 freefile = pool_get(&freefile_pool, PR_WAITOK);
2286 freefile->fx_list.wk_type = D_FREEFILE;
2287 freefile->fx_list.wk_state = 0;
2288 freefile->fx_mode = mode;
2289 freefile->fx_oldinum = ino;
2290 freefile->fx_devvp = ip->i_devvp;
2291 freefile->fx_mnt = ITOV(ip)->v_mount;
2292
2293 /*
2294 * If the inodedep does not exist, then the zero'ed inode has
2295 * been written to disk. If the allocated inode has never been
2296 * written to disk, then the on-disk inode is zero'ed. In either
2297 * case we can free the file immediately.
2298 */
2299 ACQUIRE_LOCK(&lk);
2300 if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
2301 check_inode_unwritten(inodedep)) {
2302 FREE_LOCK(&lk);
2303 handle_workitem_freefile(freefile);
2304 return;
2305 }
2306 WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
2307 FREE_LOCK(&lk);
2308 }
2309
2310 /*
2311 * Check to see if an inode has never been written to disk. If
2312 * so, free the inodedep and return success; otherwise return failure.
2313 * This routine must be called with splbio interrupts blocked.
2314 *
2315 * If we still have a bitmap dependency, then the inode has never
2316 * been written to disk. Drop the dependency as it is no longer
2317 * necessary since the inode is being deallocated. We set the
2318 * ALLCOMPLETE flags since the bitmap now properly shows that the
2319 * inode is not allocated. Even if the inode is actively being
2320 * written, it has been rolled back to its zero'ed state, so we
2321 * are ensured that a zero inode is what is on the disk. For short
2322 * lived files, this change will usually result in removing all the
2323 * dependencies from the inode so that it can be freed immediately.
2324 */
2325 STATIC int
2326 check_inode_unwritten(inodedep)
2327 struct inodedep *inodedep;
2328 {
2329 splassert(IPL_BIO);
2330
2331 if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
2332 LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2333 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2334 LIST_FIRST(&inodedep->id_inowait) != NULL ||
2335 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2336 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2337 inodedep->id_nlinkdelta != 0)
2338 return (0);
2339 inodedep->id_state |= ALLCOMPLETE;
2340 LIST_REMOVE(inodedep, id_deps);
2341 inodedep->id_buf = NULL;
2342 if (inodedep->id_state & ONWORKLIST)
2343 WORKLIST_REMOVE(&inodedep->id_list);
2344 if (inodedep->id_savedino1 != NULL) {
2345 FREE(inodedep->id_savedino1, M_INODEDEP);
2346 inodedep->id_savedino1 = NULL;
2347 }
2348 if (free_inodedep(inodedep) == 0) {
2349 FREE_LOCK(&lk);
2350 panic("check_inode_unwritten: busy inode");
2351 }
2352 return (1);
2353 }
2354
2355 /*
2356 * Try to free an inodedep structure. Return 1 if it could be freed.
2357 */
2358 STATIC int
2359 free_inodedep(inodedep)
2360 struct inodedep *inodedep;
2361 {
2362
2363 if ((inodedep->id_state & ONWORKLIST) != 0 ||
2364 (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
2365 LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
2366 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
2367 LIST_FIRST(&inodedep->id_inowait) != NULL ||
2368 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
2369 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
2370 inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
2371 return (0);
2372 LIST_REMOVE(inodedep, id_hash);
2373 WORKITEM_FREE(inodedep, D_INODEDEP);
2374 num_inodedep -= 1;
2375 return (1);
2376 }
2377
2378 /*
2379 * This workitem routine performs the block de-allocation.
2380 * The workitem is added to the pending list after the updated
2381 * inode block has been written to disk. As mentioned above,
2382 * checks regarding the number of blocks de-allocated (compared
2383 * to the number of blocks allocated for the file) are also
2384 * performed in this function.
2385 */
2386 STATIC void
2387 handle_workitem_freeblocks(freeblks)
2388 struct freeblks *freeblks;
2389 {
2390 struct inode tip;
2391 daddr_t bn;
2392 union {
2393 struct ufs1_dinode di1;
2394 struct ufs2_dinode di2;
2395 } di;
2396 struct fs *fs;
2397 int i, level, bsize;
2398 long nblocks, blocksreleased = 0;
2399 int error, allerror = 0;
2400 daddr64_t baselbns[NIADDR], tmpval;
2401
2402 if (VFSTOUFS(freeblks->fb_mnt)->um_fstype == UM_UFS1)
2403 tip.i_din1 = &di.di1;
2404 else
2405 tip.i_din2 = &di.di2;
2406
2407 tip.i_fs = fs = VFSTOUFS(freeblks->fb_mnt)->um_fs;
2408 tip.i_number = freeblks->fb_previousinum;
2409 tip.i_ump = VFSTOUFS(freeblks->fb_mnt);
2410 tip.i_dev = freeblks->fb_devvp->v_rdev;
2411 DIP_ASSIGN(&tip, size, freeblks->fb_oldsize);
2412 DIP_ASSIGN(&tip, uid, freeblks->fb_uid);
2413 tip.i_vnode = NULL;
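/*
 * baselbns[level] is the first logical block number mapped through
 * the indirect block at that level: NDADDR, then NDADDR + NINDIR(fs),
 * and so on.
 */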
2414 tmpval = 1;
2415 baselbns[0] = NDADDR;
2416 for (i = 1; i < NIADDR; i++) {
2417 tmpval *= NINDIR(fs);
2418 baselbns[i] = baselbns[i - 1] + tmpval;
2419 }
2420 nblocks = btodb(fs->fs_bsize);
2421 blocksreleased = 0;
2422 /*
2423 * Indirect blocks first.
2424 */
2425 for (level = (NIADDR - 1); level >= 0; level--) {
2426 if ((bn = freeblks->fb_iblks[level]) == 0)
2427 continue;
2428 if ((error = indir_trunc(&tip, fsbtodb(fs, bn), level,
2429 baselbns[level], &blocksreleased)) != 0)
2430 allerror = error;
2431 ffs_blkfree(&tip, bn, fs->fs_bsize);
2432 blocksreleased += nblocks;
2433 }
2434 /*
2435 * All direct blocks or frags.
2436 */
2437 for (i = (NDADDR - 1); i >= 0; i--) {
2438 if ((bn = freeblks->fb_dblks[i]) == 0)
2439 continue;
2440 bsize = blksize(fs, &tip, i);
2441 ffs_blkfree(&tip, bn, bsize);
2442 blocksreleased += btodb(bsize);
2443 }
2444
2445 #ifdef DIAGNOSTIC
2446 if (freeblks->fb_chkcnt != blocksreleased)
2447 printf("handle_workitem_freeblocks: block count\n");
2448 if (allerror)
2449 softdep_error("handle_workitem_freeblks", allerror);
2450 #endif /* DIAGNOSTIC */
2451 WORKITEM_FREE(freeblks, D_FREEBLKS);
2452 }
2453
2454 /*
2455 * Release blocks associated with the inode ip and stored in the indirect
2456 * block dbn. If level is greater than SINGLE, the block is an indirect block
2457 * and recursive calls to indir_trunc must be used to cleanse other indirect
2458 * blocks.
2459 */
2460 STATIC int
2461 indir_trunc(ip, dbn, level, lbn, countp)
2462 struct inode *ip;
2463 daddr_t dbn;
2464 int level;
2465 daddr64_t lbn;
2466 long *countp;
2467 {
2468 struct buf *bp;
2469 int32_t *bap1 = NULL;
2470 int64_t nb, *bap2 = NULL;
2471 struct fs *fs;
2472 struct worklist *wk;
2473 struct indirdep *indirdep;
2474 int i, lbnadd, nblocks, ufs1fmt;
2475 int error, allerror = 0;
2476
2477 fs = ip->i_fs;
2478 lbnadd = 1;
2479 for (i = level; i > 0; i--)
2480 lbnadd *= NINDIR(fs);
2481 /*
2482 * Get buffer of block pointers to be freed. This routine is not
2483 * called until the zero'ed inode has been written, so it is safe
2484 * to free blocks as they are encountered. Because the inode has
2485 * been zero'ed, calls to bmap on these blocks will fail. So, we
2486 * have to use the on-disk address and the block device for the
2487 * filesystem to look them up. If the file was deleted before its
2488 * indirect blocks were all written to disk, the routine that set
2489 * us up (deallocate_dependencies) will have arranged to leave
2490 * a complete copy of the indirect block in memory for our use.
2491 * Otherwise we have to read the blocks in from the disk.
2492 */
2493 ACQUIRE_LOCK(&lk);
2494 if ((bp = incore(ip->i_devvp, dbn)) != NULL &&
2495 (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
2496 if (wk->wk_type != D_INDIRDEP ||
2497 (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
2498 (indirdep->ir_state & GOINGAWAY) == 0) {
2499 FREE_LOCK(&lk);
2500 panic("indir_trunc: lost indirdep");
2501 }
2502 WORKLIST_REMOVE(wk);
2503 WORKITEM_FREE(indirdep, D_INDIRDEP);
2504 if (LIST_FIRST(&bp->b_dep) != NULL) {
2505 FREE_LOCK(&lk);
2506 panic("indir_trunc: dangling dep");
2507 }
2508 FREE_LOCK(&lk);
2509 } else {
2510 FREE_LOCK(&lk);
2511 error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
2512 if (error)
2513 return (error);
2514 }
2515 /*
2516 * Recursively free indirect blocks.
2517 */
2518 if (ip->i_ump->um_fstype == UM_UFS1) {
2519 ufs1fmt = 1;
2520 bap1 = (int32_t *)bp->b_data;
2521 } else {
2522 ufs1fmt = 0;
2523 bap2 = (int64_t *)bp->b_data;
2524 }
2525 nblocks = btodb(fs->fs_bsize);
2526 for (i = NINDIR(fs) - 1; i >= 0; i--) {
2527 if (ufs1fmt)
2528 nb = bap1[i];
2529 else
2530 nb = bap2[i];
2531 if (nb == 0)
2532 continue;
2533 if (level != 0) {
2534 if ((error = indir_trunc(ip, fsbtodb(fs, nb),
2535 level - 1, lbn + (i * lbnadd), countp)) != 0)
2536 allerror = error;
2537 }
2538 ffs_blkfree(ip, nb, fs->fs_bsize);
2539 *countp += nblocks;
2540 }
2541 bp->b_flags |= B_INVAL | B_NOCACHE;
2542 brelse(bp);
2543 return (allerror);
2544 }
2545
2546 /*
2547 * Free an allocindir.
2548 * This routine must be called with splbio interrupts blocked.
2549 */
2550 STATIC void
2551 free_allocindir(aip, inodedep)
2552 struct allocindir *aip;
2553 struct inodedep *inodedep;
2554 {
2555 struct freefrag *freefrag;
2556
2557 splassert(IPL_BIO);
2558
2559 #ifdef DEBUG
2560 if (lk.lkt_held == -1)
2561 panic("free_allocindir: lock not held");
2562 #endif
2563 if ((aip->ai_state & DEPCOMPLETE) == 0)
2564 LIST_REMOVE(aip, ai_deps);
2565 if (aip->ai_state & ONWORKLIST)
2566 WORKLIST_REMOVE(&aip->ai_list);
2567 LIST_REMOVE(aip, ai_next);
2568 if ((freefrag = aip->ai_freefrag) != NULL) {
2569 if (inodedep == NULL)
2570 add_to_worklist(&freefrag->ff_list);
2571 else
2572 WORKLIST_INSERT(&inodedep->id_bufwait,
2573 &freefrag->ff_list);
2574 }
2575 WORKITEM_FREE(aip, D_ALLOCINDIR);
2576 }
2577
2578 /*
2579 * Directory entry addition dependencies.
2580 *
2581 * When adding a new directory entry, the inode (with its incremented link
2582 * count) must be written to disk before the directory entry's pointer to it.
2583 * Also, if the inode is newly allocated, the corresponding freemap must be
2584 * updated (on disk) before the directory entry's pointer. These requirements
2585 * are met via undo/redo on the directory entry's pointer, which consists
2586 * simply of the inode number.
2587 *
2588 * As directory entries are added and deleted, the free space within a
2589 * directory block can become fragmented. The ufs file system will compact
2590 * a fragmented directory block to make space for a new entry. When this
2591 * occurs, the offsets of previously added entries change. Any "diradd"
2592 * dependency structures corresponding to these entries must be updated with
2593 * the new offsets.
2594 */
2595
2596 /*
2597 * This routine is called after the in-memory inode's link
2598 * count has been incremented, but before the directory entry's
2599 * pointer to the inode has been set.
2600 */
2601 int
2602 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
2603 struct buf *bp; /* buffer containing directory block */
2604 struct inode *dp; /* inode for directory */
2605 off_t diroffset; /* offset of new entry in directory */
2606 long newinum; /* inode referenced by new directory entry */
2607 struct buf *newdirbp; /* non-NULL => contents of new mkdir */
2608 int isnewblk; /* entry is in a newly allocated block */
2609 {
2610 int offset; /* offset of new entry within directory block */
2611 daddr64_t lbn; /* block in directory containing new entry */
2612 struct fs *fs;
2613 struct diradd *dap;
2614 struct allocdirect *adp;
2615 struct pagedep *pagedep;
2616 struct inodedep *inodedep;
2617 struct newdirblk *newdirblk = NULL;
2618 struct mkdir *mkdir1, *mkdir2;
2619
2621 fs = dp->i_fs;
2622 lbn = lblkno(fs, diroffset);
2623 offset = blkoff(fs, diroffset);
2624 dap = pool_get(&diradd_pool, PR_WAITOK);
2625 bzero(dap, sizeof(struct diradd));
2626 dap->da_list.wk_type = D_DIRADD;
2627 dap->da_offset = offset;
2628 dap->da_newinum = newinum;
2629 dap->da_state = ATTACHED;
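/*
 * A newdirblk is needed only when this entry starts a new directory
 * fragment in the direct block area; it lets the new block be tracked
 * by its allocdirect below.
 */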
2630 if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
2631 newdirblk = pool_get(&newdirblk_pool, PR_WAITOK);
2632 newdirblk->db_list.wk_type = D_NEWDIRBLK;
2633 newdirblk->db_state = 0;
2634 }
2635 if (newdirbp == NULL) {
2636 dap->da_state |= DEPCOMPLETE;
2637 ACQUIRE_LOCK(&lk);
2638 } else {
2639 dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
2640 mkdir1 = pool_get(&mkdir_pool, PR_WAITOK);
2641 mkdir1->md_list.wk_type = D_MKDIR;
2642 mkdir1->md_state = MKDIR_BODY;
2643 mkdir1->md_diradd = dap;
2644 mkdir2 = pool_get(&mkdir_pool, PR_WAITOK);
2645 mkdir2->md_list.wk_type = D_MKDIR;
2646 mkdir2->md_state = MKDIR_PARENT;
2647 mkdir2->md_diradd = dap;
2648 /*
2649 * Dependency on "." and ".." being written to disk.
2650 */
2651 mkdir1->md_buf = newdirbp;
2652 ACQUIRE_LOCK(&lk);
2653 LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
2654 WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
2655 FREE_LOCK(&lk);
2656 bdwrite(newdirbp);
2657 /*
2658 * Dependency on link count increase for parent directory
2659 */
2660 ACQUIRE_LOCK(&lk);
2661 if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0
2662 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
2663 dap->da_state &= ~MKDIR_PARENT;
2664 WORKITEM_FREE(mkdir2, D_MKDIR);
2665 } else {
2666 LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
2667 WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
2668 }
2669 }
2670 /*
2671 * Link into parent directory pagedep to await its being written.
2672 */
2673 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2674 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2675 dap->da_pagedep = pagedep;
2676 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
2677 da_pdlist);
2678 /*
2679 * Link into its inodedep. Put it on the id_bufwait list if the inode
2680 * is not yet written. If it is written, do the post-inode write
2681 * processing to put it on the id_pendinghd list.
2682 */
2683 (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
2684 if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
2685 diradd_inode_written(dap, inodedep);
2686 else
2687 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
2688 if (isnewblk) {
2689 /*
2690 * Directories growing into indirect blocks are rare
2691 * enough, and the frequency of new block allocation
2692 * in those cases rarer still, that we choose not
2693 * to bother tracking them. Rather we simply force the
2694 * new directory entry to disk.
2695 */
2696 if (lbn >= NDADDR) {
2697 FREE_LOCK(&lk);
2698 /*
2699 * We only have a new allocation when at the
2700 * beginning of a new block, not when we are
2701 * expanding into an existing block.
2702 */
2703 if (blkoff(fs, diroffset) == 0)
2704 return (1);
2705 return (0);
2706 }
2707 /*
2708 * We only have a new allocation when at the beginning
2709 * of a new fragment, not when we are expanding into an
2710 * existing fragment. Also, there is nothing to do if we
2711 * are already tracking this block.
2712 */
2713 if (fragoff(fs, diroffset) != 0) {
2714 FREE_LOCK(&lk);
2715 return (0);
2716 }
2717
2718 if ((pagedep->pd_state & NEWBLOCK) != 0) {
2719 WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
2720 FREE_LOCK(&lk);
2721 return (0);
2722 }
2723 /*
2724 * Find our associated allocdirect and have it track us.
2725 */
2726 if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0)
2727 panic("softdep_setup_directory_add: lost inodedep");
2728 adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
2729 if (adp == NULL || adp->ad_lbn != lbn) {
2730 FREE_LOCK(&lk);
2731 panic("softdep_setup_directory_add: lost entry");
2732 }
2733 pagedep->pd_state |= NEWBLOCK;
2734 newdirblk->db_pagedep = pagedep;
2735 WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
2736 }
2737 FREE_LOCK(&lk);
2738 return (0);
2739 }
2740
2741 /*
2742 * This procedure is called to change the offset of a directory
2743 * entry when compacting a directory block which must be owned
2744 * exclusively by the caller. Note that the actual entry movement
2745 * must be done in this procedure to ensure that no I/O completions
2746 * occur while the move is in progress.
2747 */
2748 void
2749 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
2750 struct inode *dp; /* inode for directory */
2751 caddr_t base; /* address of dp->i_offset */
2752 caddr_t oldloc; /* address of old directory location */
2753 caddr_t newloc; /* address of new directory location */
2754 int entrysize; /* size of directory entry */
2755 {
2756 int offset, oldoffset, newoffset;
2757 struct pagedep *pagedep;
2758 struct diradd *dap;
2759 daddr64_t lbn;
2760
2761 ACQUIRE_LOCK(&lk);
2762 lbn = lblkno(dp->i_fs, dp->i_offset);
2763 offset = blkoff(dp->i_fs, dp->i_offset);
2764 if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
2765 goto done;
2766 oldoffset = offset + (oldloc - base);
2767 newoffset = offset + (newloc - base);
2768
2769 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
2770 if (dap->da_offset != oldoffset)
2771 continue;
2772 dap->da_offset = newoffset;
2773 if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
2774 break;
2775 LIST_REMOVE(dap, da_pdlist);
2776 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
2777 dap, da_pdlist);
2778 break;
2779 }
2780 if (dap == NULL) {
2782 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
2783 if (dap->da_offset == oldoffset) {
2784 dap->da_offset = newoffset;
2785 break;
2786 }
2787 }
2788 }
2789 done:
2790 bcopy(oldloc, newloc, entrysize);
2791 FREE_LOCK(&lk);
2792 }
2793
2794 /*
2795 * Free a diradd dependency structure. This routine must be called
2796 * with splbio interrupts blocked.
2797 */
2798 STATIC void
2799 free_diradd(dap)
2800 struct diradd *dap;
2801 {
2802 struct dirrem *dirrem;
2803 struct pagedep *pagedep;
2804 struct inodedep *inodedep;
2805 struct mkdir *mkdir, *nextmd;
2806
2807 splassert(IPL_BIO);
2808
2809 #ifdef DEBUG
2810 if (lk.lkt_held == -1)
2811 panic("free_diradd: lock not held");
2812 #endif
2813 WORKLIST_REMOVE(&dap->da_list);
2814 LIST_REMOVE(dap, da_pdlist);
2815 if ((dap->da_state & DIRCHG) == 0) {
2816 pagedep = dap->da_pagedep;
2817 } else {
2818 dirrem = dap->da_previous;
2819 pagedep = dirrem->dm_pagedep;
2820 dirrem->dm_dirinum = pagedep->pd_ino;
2821 add_to_worklist(&dirrem->dm_list);
2822 }
2823 if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
2824 0, &inodedep) != 0)
2825 (void) free_inodedep(inodedep);
2826 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2827 for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
2828 nextmd = LIST_NEXT(mkdir, md_mkdirs);
2829 if (mkdir->md_diradd != dap)
2830 continue;
2831 dap->da_state &= ~mkdir->md_state;
2832 WORKLIST_REMOVE(&mkdir->md_list);
2833 LIST_REMOVE(mkdir, md_mkdirs);
2834 WORKITEM_FREE(mkdir, D_MKDIR);
2835 }
2836 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
2837 FREE_LOCK(&lk);
2838 panic("free_diradd: unfound ref");
2839 }
2840 }
2841 WORKITEM_FREE(dap, D_DIRADD);
2842 }
2843
2844 /*
2845 * Directory entry removal dependencies.
2846 *
2847 * When removing a directory entry, the entry's inode pointer must be
2848 * zero'ed on disk before the corresponding inode's link count is decremented
2849 * (possibly freeing the inode for re-use). This dependency is handled by
2850 * updating the directory entry but delaying the inode count reduction until
2851 * after the directory block has been written to disk. After this point, the
2852 * inode count can be decremented whenever it is convenient.
2853 */
2854
2855 /*
2856 * This routine should be called immediately after removing
2857 * a directory entry. The inode's link count should not be
2858 * decremented by the calling procedure -- the soft updates
2859 * code will do this task when it is safe.
2860 */
2861 void
2862 softdep_setup_remove(bp, dp, ip, isrmdir)
2863 struct buf *bp; /* buffer containing directory block */
2864 struct inode *dp; /* inode for the directory being modified */
2865 struct inode *ip; /* inode for directory entry being removed */
2866 int isrmdir; /* indicates if doing RMDIR */
2867 {
2868 struct dirrem *dirrem, *prevdirrem;
2869
2870 /*
2871 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
2872 */
2873 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
2874
2875 /*
2876 * If the COMPLETE flag is clear, then there were no active
2877 * entries and we want to roll back to a zeroed entry until
2878 * the new inode is committed to disk. If the COMPLETE flag is
2879 * set then we have deleted an entry that never made it to
2880 * disk. If the entry we deleted resulted from a name change,
2881 * then the old name still resides on disk. We cannot delete
2882 * its inode (returned to us in prevdirrem) until the zeroed
2883 * directory entry gets to disk. The new inode has never been
2884 * referenced on the disk, so it can be deleted immediately.
2885 */
2886 if ((dirrem->dm_state & COMPLETE) == 0) {
2887 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
2888 dm_next);
2889 FREE_LOCK(&lk);
2890 } else {
2891 if (prevdirrem != NULL)
2892 LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
2893 prevdirrem, dm_next);
2894 dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
2895 FREE_LOCK(&lk);
2896 handle_workitem_remove(dirrem);
2897 }
2898 }
2899
2900 /*
2901 * Allocate a new dirrem if appropriate and return it along with
2902 * its associated pagedep. Called without a lock, returns with lock.
2903 */
2904 STATIC long num_dirrem; /* number of dirrem allocated */
2905 STATIC struct dirrem *
2906 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
2907 struct buf *bp; /* buffer containing directory block */
2908 struct inode *dp; /* inode for the directory being modified */
2909 struct inode *ip; /* inode for directory entry being removed */
2910 int isrmdir; /* indicates if doing RMDIR */
2911 struct dirrem **prevdirremp; /* previously referenced inode, if any */
2912 {
2913 int offset;
2914 daddr64_t lbn;
2915 struct diradd *dap;
2916 struct dirrem *dirrem;
2917 struct pagedep *pagedep;
2918
2919 /*
2920 * Whiteouts have no deletion dependencies.
2921 */
2922 if (ip == NULL)
2923 panic("newdirrem: whiteout");
2924 /*
2925 * If we are over our limit, try to improve the situation.
2926 * Limiting the number of dirrem structures will also limit
2927 * the number of freefile and freeblks structures.
2928 */
2929 if (num_dirrem > max_softdeps / 2)
2930 (void) request_cleanup(FLUSH_REMOVE, 0);
2931 num_dirrem += 1;
2932 dirrem = pool_get(&dirrem_pool, PR_WAITOK);
2933 bzero(dirrem, sizeof(struct dirrem));
2934 dirrem->dm_list.wk_type = D_DIRREM;
2935 dirrem->dm_state = isrmdir ? RMDIR : 0;
2936 dirrem->dm_mnt = ITOV(ip)->v_mount;
2937 dirrem->dm_oldinum = ip->i_number;
2938 *prevdirremp = NULL;
2939
2940 ACQUIRE_LOCK(&lk);
2941 lbn = lblkno(dp->i_fs, dp->i_offset);
2942 offset = blkoff(dp->i_fs, dp->i_offset);
2943 if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
2944 WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2945 dirrem->dm_pagedep = pagedep;
2946 /*
2947 * Check for a diradd dependency for the same directory entry.
2948 * If present, then both dependencies become obsolete and can
2949 * be de-allocated. Check for an entry on both the pd_diraddhd
2950 * list and the pd_pendinghd list.
2951 */
2952
2953 LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
2954 if (dap->da_offset == offset)
2955 break;
2956 if (dap == NULL) {
2958 LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
2959 if (dap->da_offset == offset)
2960 break;
2961 if (dap == NULL)
2962 return (dirrem);
2963 }
2964 /*
2965 * Must be ATTACHED at this point.
2966 */
2967 if ((dap->da_state & ATTACHED) == 0) {
2968 FREE_LOCK(&lk);
2969 panic("newdirrem: not ATTACHED");
2970 }
2971 if (dap->da_newinum != ip->i_number) {
2972 FREE_LOCK(&lk);
2973 panic("newdirrem: inum %d should be %d",
2974 ip->i_number, dap->da_newinum);
2975 }
2976 /*
2977 * If we are deleting a changed name that never made it to disk,
2978 * then return the dirrem describing the previous inode (which
2979 * represents the inode currently referenced from this entry on disk).
2980 */
2981 if ((dap->da_state & DIRCHG) != 0) {
2982 *prevdirremp = dap->da_previous;
2983 dap->da_state &= ~DIRCHG;
2984 dap->da_pagedep = pagedep;
2985 }
2986 /*
2987 * We are deleting an entry that never made it to disk.
2988 * Mark it COMPLETE so we can delete its inode immediately.
2989 */
2990 dirrem->dm_state |= COMPLETE;
2991 free_diradd(dap);
2992 return (dirrem);
2993 }
2994
2995 /*
2996 * Directory entry change dependencies.
2997 *
2998 * Changing an existing directory entry requires that an add operation
2999 * be completed first followed by a deletion. The semantics for the addition
3000 * are identical to the description of adding a new entry above except
3001 * that the rollback is to the old inode number rather than zero. Once
3002 * the addition dependency is completed, the removal is done as described
3003 * in the removal routine above.
3004 */
3005
3006 /*
3007 * This routine should be called immediately after changing
3008 * a directory entry. The inode's link count should not be
3009 * decremented by the calling procedure -- the soft updates
3010 * code will perform this task when it is safe.
3011 */
3012 void
3013 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
3014 struct buf *bp; /* buffer containing directory block */
3015 struct inode *dp; /* inode for the directory being modified */
3016 struct inode *ip; /* inode for directory entry being removed */
3017 long newinum; /* new inode number for changed entry */
3018 int isrmdir; /* indicates if doing RMDIR */
3019 {
3020 int offset;
3021 struct diradd *dap = NULL;
3022 struct dirrem *dirrem, *prevdirrem;
3023 struct pagedep *pagedep;
3024 struct inodedep *inodedep;
3025
3026 offset = blkoff(dp->i_fs, dp->i_offset);
3027 dap = pool_get(&diradd_pool, PR_WAITOK);
3028 bzero(dap, sizeof(struct diradd));
3029 dap->da_list.wk_type = D_DIRADD;
3030 dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
3031 dap->da_offset = offset;
3032 dap->da_newinum = newinum;
3033
3034 /*
3035 * Allocate a new dirrem and ACQUIRE_LOCK.
3036 */
3037 dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
3038 pagedep = dirrem->dm_pagedep;
3039 /*
3040 * The possible values for isrmdir:
3041 * 0 - non-directory file rename
3042 * 1 - directory rename within same directory
3043 * inum - directory rename to new directory of given inode number
3044 * When renaming to a new directory, we are both deleting and
3045 * creating a new directory entry, so the link count on the new
3046 * directory should not change. Thus we do not need the followup
3047 * dirrem which is usually done in handle_workitem_remove. We set
3048 * the DIRCHG flag to tell handle_workitem_remove to skip the
3049 * followup dirrem.
3050 */
3051 if (isrmdir > 1)
3052 dirrem->dm_state |= DIRCHG;
3053
3054 /*
3055 * If the COMPLETE flag is clear, then there were no active
3056 * entries and we want to roll back to the previous inode until
3057 * the new inode is committed to disk. If the COMPLETE flag is
3058 * set, then we have deleted an entry that never made it to disk.
3059 * If the entry we deleted resulted from a name change, then the old
3060 * inode reference still resides on disk. Any rollback that we do
3061 * needs to be to that old inode (returned to us in prevdirrem). If
3062 * the entry we deleted resulted from a create, then there is
3063 * no entry on the disk, so we want to roll back to zero rather
3064 * than the uncommitted inode. In either of the COMPLETE cases we
3065 * want to immediately free the unwritten and unreferenced inode.
3066 */
3067 if ((dirrem->dm_state & COMPLETE) == 0) {
3068 dap->da_previous = dirrem;
3069 } else {
3070 if (prevdirrem != NULL) {
3071 dap->da_previous = prevdirrem;
3072 } else {
3073 dap->da_state &= ~DIRCHG;
3074 dap->da_pagedep = pagedep;
3075 }
3076 dirrem->dm_dirinum = pagedep->pd_ino;
3077 add_to_worklist(&dirrem->dm_list);
3078 }
3079 /*
3080 * Link into its inodedep. Put it on the id_bufwait list if the inode
3081 * is not yet written. If it is written, do the post-inode write
3082 * processing to put it on the id_pendinghd list.
3083 */
3084 if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
3085 (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
3086 dap->da_state |= COMPLETE;
3087 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
3088 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
3089 } else {
3090 LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
3091 dap, da_pdlist);
3092 WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
3093 }
3094 FREE_LOCK(&lk);
3095 }
3096
3097 /*
3098 * Called whenever the link count on an inode is changed.
3099 * It creates an inode dependency so that the new reference(s)
3100 * to the inode cannot be committed to disk until the updated
3101 * inode has been written.
3102 */
3103 void
3104 softdep_change_linkcnt(ip, nodelay)
3105 struct inode *ip; /* the inode with the increased link count */
3106 int nodelay; /* do background work or not */
3107 {
3108 struct inodedep *inodedep;
3109 int flags;
3110
3111 /*
3112 * If requested, do not allow background work to happen.
3113 */
3114 flags = DEPALLOC;
3115 if (nodelay)
3116 flags |= NODELAY;
3117
3118 ACQUIRE_LOCK(&lk);
3119
3120 (void) inodedep_lookup(ip->i_fs, ip->i_number, flags, &inodedep);
3121 if (DIP(ip, nlink) < ip->i_effnlink) {
3122 FREE_LOCK(&lk);
3123 panic("softdep_change_linkcnt: bad delta");
3124 }
3125
3126 inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
3127
3128 FREE_LOCK(&lk);
3129 }
3130
3131 /*
3132 * This workitem decrements the inode's link count.
3133 * If the link count reaches zero, the file is removed.
3134 */
3135 STATIC void
3136 handle_workitem_remove(dirrem)
3137 struct dirrem *dirrem;
3138 {
3139 struct proc *p = CURPROC; /* XXX */
3140 struct inodedep *inodedep;
3141 struct vnode *vp;
3142 struct inode *ip;
3143 ino_t oldinum;
3144 int error;
3145
3146 if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) {
3147 softdep_error("handle_workitem_remove: vget", error);
3148 return;
3149 }
3150 ip = VTOI(vp);
3151 ACQUIRE_LOCK(&lk);
3152 if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep))
3153 == 0) {
3154 FREE_LOCK(&lk);
3155 panic("handle_workitem_remove: lost inodedep");
3156 }
3157 /*
3158 * Normal file deletion.
3159 */
3160 if ((dirrem->dm_state & RMDIR) == 0) {
3161 DIP_ADD(ip, nlink, -1);
3162 ip->i_flag |= IN_CHANGE;
3163 if (DIP(ip, nlink) < ip->i_effnlink) {
3164 FREE_LOCK(&lk);
3165 panic("handle_workitem_remove: bad file delta");
3166 }
3167 inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
3168 FREE_LOCK(&lk);
3169 vput(vp);
3170 num_dirrem -= 1;
3171 WORKITEM_FREE(dirrem, D_DIRREM);
3172 return;
3173 }
3174 /*
3175 * Directory deletion. Decrement reference count for both the
3176 * just deleted parent directory entry and the reference for ".".
3177 * Next truncate the directory to length zero. When the
3178 * truncation completes, arrange to have the reference count on
3179 * the parent decremented to account for the loss of "..".
3180 */
3181 DIP_ADD(ip, nlink, -2);
3182 ip->i_flag |= IN_CHANGE;
3183 if (DIP(ip, nlink) < ip->i_effnlink)
3184 panic("handle_workitem_remove: bad dir delta");
3185 inodedep->id_nlinkdelta = DIP(ip, nlink) - ip->i_effnlink;
3186 FREE_LOCK(&lk);
3187 if ((error = UFS_TRUNCATE(ip, (off_t)0, 0, p->p_ucred)) != 0)
3188 softdep_error("handle_workitem_remove: truncate", error);
3189 /*
3190 * Rename a directory to a new parent. Since we are both deleting
3191 * and creating a new directory entry, the link count on the new
3192 * directory should not change. Thus we skip the followup dirrem.
3193 */
3194 if (dirrem->dm_state & DIRCHG) {
3195 vput(vp);
3196 num_dirrem -= 1;
3197 WORKITEM_FREE(dirrem, D_DIRREM);
3198 return;
3199 }
3200 /*
3201 * If the inodedep does not exist, then the zero'ed inode has
3202 * been written to disk. If the allocated inode has never been
3203 * written to disk, then the on-disk inode is zero'ed. In either
3204 * case we can remove the file immediately.
3205 */
3206 ACQUIRE_LOCK(&lk);
3207 dirrem->dm_state = 0;
3208 oldinum = dirrem->dm_oldinum;
3209 dirrem->dm_oldinum = dirrem->dm_dirinum;
3210 if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
3211 check_inode_unwritten(inodedep)) {
3212 FREE_LOCK(&lk);
3213 vput(vp);
3214 handle_workitem_remove(dirrem);
3215 return;
3216 }
3217 WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
3218 FREE_LOCK(&lk);
3219 ip->i_flag |= IN_CHANGE;
3220 UFS_UPDATE(VTOI(vp), 0);
3221 vput(vp);
3222 }
3223
3224 /*
3225 * Inode de-allocation dependencies.
3226 *
3227 * When an inode's link count is reduced to zero, it can be de-allocated. We
3228 * found it convenient to postpone de-allocation until after the inode is
3229 * written to disk with its new link count (zero). At this point, all of the
3230 * on-disk inode's block pointers are nullified and, with careful dependency
3231 * list ordering, all dependencies related to the inode will be satisfied and
3232 * the corresponding dependency structures de-allocated. So, if/when the
3233 * inode is reused, there will be no mixing of old dependencies with new
3234 * ones. This artificial dependency is set up by the block de-allocation
3235 * procedure above (softdep_setup_freeblocks) and completed by the
3236 * following procedure.
3237 */
3238 STATIC void
3239 handle_workitem_freefile(freefile)
3240 struct freefile *freefile;
3241 {
3242 struct fs *fs;
3243 struct vnode vp;
3244 struct inode tip;
3245 #ifdef DEBUG
3246 struct inodedep *idp;
3247 #endif
3248 int error;
3249
3250 fs = VFSTOUFS(freefile->fx_mnt)->um_fs;
3251 #ifdef DEBUG
3252 ACQUIRE_LOCK(&lk);
3253 error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
3254 FREE_LOCK(&lk);
3255 if (error)
3256 panic("handle_workitem_freefile: inodedep survived");
3257 #endif
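/*
 * Build a minimal inode and vnode pair so that ffs_freefile() has the
 * device and filesystem context it needs.
 */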
3258 tip.i_ump = VFSTOUFS(freefile->fx_mnt);
3259 tip.i_dev = freefile->fx_devvp->v_rdev;
3260 tip.i_fs = fs;
3261 tip.i_vnode = &vp;
3262 vp.v_data = &tip;
3263
3264 if ((error = ffs_freefile(&tip, freefile->fx_oldinum,
3265 freefile->fx_mode)) != 0) {
3266 softdep_error("handle_workitem_freefile", error);
3267 }
3268 WORKITEM_FREE(freefile, D_FREEFILE);
3269 }
3270
3271 /*
3272 * Disk writes.
3273 *
3274 * The dependency structures constructed above are most actively used when file
3275 * system blocks are written to disk. No constraints are placed on when a
3276 * block can be written, but unsatisfied update dependencies are made safe by
3277 * modifying (or replacing) the source memory for the duration of the disk
3278 * write. When the disk write completes, the memory block is again brought
3279 * up-to-date.
3280 *
3281 * In-core inode structure reclamation.
3282 *
3283 * Because there are a finite number of "in-core" inode structures, they are
3284 * reused regularly. By transferring all inode-related dependencies to the
3285 * in-memory inode block and indexing them separately (via "inodedep"s), we
3286 * can allow "in-core" inode structures to be reused at any time and avoid
3287 * any increase in contention.
3288 *
3289 * Called just before entering the device driver to initiate a new disk I/O.
3290 * The buffer must be locked, thus, no I/O completion operations can occur
3291 * while we are manipulating its associated dependencies.
3292 */
3293 void
3294 softdep_disk_io_initiation(bp)
3295 struct buf *bp; /* structure describing disk write to occur */
3296 {
3297 struct worklist *wk, *nextwk;
3298 struct indirdep *indirdep;
3299 struct inodedep *inodedep;
3300 struct buf *sbp;
3301
3302 /*
3303 * We only care about write operations. There should never
3304 * be dependencies for reads.
3305 */
3306 if (bp->b_flags & B_READ)
3307 panic("softdep_disk_io_initiation: read");
3308
3309 ACQUIRE_LOCK(&lk);
3310
3311 /*
3312 * Do any necessary pre-I/O processing.
3313 */
3314 for (wk = LIST_FIRST(&bp->b_dep); wk; wk = nextwk) {
3315 nextwk = LIST_NEXT(wk, wk_list);
3316 switch (wk->wk_type) {
3317
3318 case D_PAGEDEP:
3319 initiate_write_filepage(WK_PAGEDEP(wk), bp);
3320 continue;
3321
3322 case D_INODEDEP:
3323 inodedep = WK_INODEDEP(wk);
3324 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
3325 initiate_write_inodeblock_ufs1(inodedep, bp);
3326 #ifdef FFS2
3327 else
3328 initiate_write_inodeblock_ufs2(inodedep, bp);
3329 #endif
3330 continue;
3331
3332 case D_INDIRDEP:
3333 indirdep = WK_INDIRDEP(wk);
3334 if (indirdep->ir_state & GOINGAWAY)
3335 panic("disk_io_initiation: indirdep gone");
3336 /*
3337 * If there are no remaining dependencies, this
3338 * will be writing the real pointers, so the
3339 * dependency can be freed.
3340 */
3341 if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
3342 sbp = indirdep->ir_savebp;
3343 sbp->b_flags |= B_INVAL | B_NOCACHE;
3344 /* inline expand WORKLIST_REMOVE(wk); */
3345 wk->wk_state &= ~ONWORKLIST;
3346 LIST_REMOVE(wk, wk_list);
3347 WORKITEM_FREE(indirdep, D_INDIRDEP);
3348 FREE_LOCK(&lk);
3349 brelse(sbp);
3350 ACQUIRE_LOCK(&lk);
3351 continue;
3352 }
3353 /*
3354 * Replace up-to-date version with safe version.
3355 */
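/*
 * The current contents are stashed in ir_saveddata and the indirdep
 * is marked UNDONE; the write completion handling puts the up-to-date
 * copy back after this write finishes.
 */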
3356 FREE_LOCK(&lk);
3357 indirdep->ir_saveddata = malloc(bp->b_bcount,
3358 M_INDIRDEP, M_WAITOK);
3359 ACQUIRE_LOCK(&lk);
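/*
 * Remember the up-to-date pointers in ir_saveddata and write
 * the saved safe copy instead; softdep_disk_write_complete()
 * copies the real pointers back once the write has finished.
 */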
3360 indirdep->ir_state &= ~ATTACHED;
3361 indirdep->ir_state |= UNDONE;
3362 bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
3363 bcopy(indirdep->ir_savebp->b_data, bp->b_data,
3364 bp->b_bcount);
3365 continue;
3366
3367 case D_MKDIR:
3368 case D_BMSAFEMAP:
3369 case D_ALLOCDIRECT:
3370 case D_ALLOCINDIR:
3371 continue;
3372
3373 default:
3374 FREE_LOCK(&lk);
3375 panic("handle_disk_io_initiation: Unexpected type %s",
3376 TYPENAME(wk->wk_type));
3377 /* NOTREACHED */
3378 }
3379 }
3380
3381 FREE_LOCK(&lk);
3382 }
3383
3384 /*
3385 * Called from within the procedure above to deal with unsatisfied
3386  * allocation dependencies in a directory. The buffer must be locked;
3387 * thus, no I/O completion operations can occur while we are
3388 * manipulating its associated dependencies.
3389 */
3390 STATIC void
3391 initiate_write_filepage(pagedep, bp)
3392 struct pagedep *pagedep;
3393 struct buf *bp;
3394 {
3395 struct diradd *dap;
3396 struct direct *ep;
3397 int i;
3398
3399 if (pagedep->pd_state & IOSTARTED) {
3400 /*
3401 * This can only happen if there is a driver that does not
3402 * understand chaining. Here biodone will reissue the call
3403 * to strategy for the incomplete buffers.
3404 */
3405 printf("initiate_write_filepage: already started\n");
3406 return;
3407 }
3408 pagedep->pd_state |= IOSTARTED;
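/*
 * Roll back every uncommitted entry: a changed entry reverts
 * to its previous inode number, a newly added entry is cleared.
 * handle_written_filepage() reinstalls the new inode numbers
 * after the write completes.
 */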
3409 for (i = 0; i < DAHASHSZ; i++) {
3410 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
3411 ep = (struct direct *)
3412 ((char *)bp->b_data + dap->da_offset);
3413 if (ep->d_ino != dap->da_newinum) {
3414 FREE_LOCK(&lk);
3415 panic("%s: dir inum %d != new %d",
3416 "initiate_write_filepage",
3417 ep->d_ino, dap->da_newinum);
3418 }
3419 if (dap->da_state & DIRCHG)
3420 ep->d_ino = dap->da_previous->dm_oldinum;
3421 else
3422 ep->d_ino = 0;
3423 dap->da_state &= ~ATTACHED;
3424 dap->da_state |= UNDONE;
3425 }
3426 }
3427 }
3428
3429 /*
3430 * Called from within the procedure above to deal with unsatisfied
3431 * allocation dependencies in an inodeblock. The buffer must be
3432  * locked; thus, no I/O completion operations can occur while we
3433 * are manipulating its associated dependencies.
3434 */
3435 STATIC void
3436 initiate_write_inodeblock_ufs1(inodedep, bp)
3437 struct inodedep *inodedep;
3438 struct buf *bp; /* The inode block */
3439 {
3440 struct allocdirect *adp, *lastadp;
3441 struct ufs1_dinode *dp;
3442 struct fs *fs;
3443 #ifdef DIAGNOSTIC
3444 daddr64_t prevlbn = 0;
3445 int32_t d1, d2;
3446 #endif
3447 int i, deplist;
3448
3449 if (inodedep->id_state & IOSTARTED) {
3450 FREE_LOCK(&lk);
3451 panic("initiate_write_inodeblock: already started");
3452 }
3453 inodedep->id_state |= IOSTARTED;
3454 fs = inodedep->id_fs;
3455 dp = (struct ufs1_dinode *)bp->b_data +
3456 ino_to_fsbo(fs, inodedep->id_ino);
3457 /*
3458 * If the bitmap is not yet written, then the allocated
3459 * inode cannot be written to disk.
3460 */
3461 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3462 if (inodedep->id_savedino1 != NULL) {
3463 FREE_LOCK(&lk);
3464 panic("initiate_write_inodeblock: already doing I/O");
3465 }
3466 FREE_LOCK(&lk);
3467 MALLOC(inodedep->id_savedino1, struct ufs1_dinode *,
3468 sizeof(struct ufs1_dinode), M_INODEDEP, M_WAITOK);
3469 ACQUIRE_LOCK(&lk);
3470 *inodedep->id_savedino1 = *dp;
3471 bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
3472 return;
3473 }
3474 /*
3475 * If no dependencies, then there is nothing to roll back.
3476 */
3477 inodedep->id_savedsize = dp->di_size;
3478 if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3479 return;
3480 /*
3481 * Set the dependencies to busy.
3482 */
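/*
 * deplist is consulted only under DIAGNOSTIC: it is a bitmap,
 * indexed by logical block number, of the pointers that have
 * pending dependencies, letting the rollback code below assert
 * that every pointer it clears is covered by a dependency.
 */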
3483 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3484 adp = TAILQ_NEXT(adp, ad_next)) {
3485 #ifdef DIAGNOSTIC
3486 if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3487 FREE_LOCK(&lk);
3488 panic("softdep_write_inodeblock: lbn order");
3489 }
3490 prevlbn = adp->ad_lbn;
3491 if (adp->ad_lbn < NDADDR &&
3492 (d1 = dp->di_db[adp->ad_lbn]) != (d2 = adp->ad_newblkno)) {
3493 FREE_LOCK(&lk);
3494 panic("%s: direct pointer #%ld mismatch %d != %d",
3495 "softdep_write_inodeblock", adp->ad_lbn, d1, d2);
3496 }
3497 if (adp->ad_lbn >= NDADDR &&
3498 (d1 = dp->di_ib[adp->ad_lbn - NDADDR]) !=
3499 (d2 = adp->ad_newblkno)) {
3500 FREE_LOCK(&lk);
3501 panic("%s: indirect pointer #%ld mismatch %d != %d",
3502 "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
3503 d1, d2);
3504 }
3505 deplist |= 1 << adp->ad_lbn;
3506 if ((adp->ad_state & ATTACHED) == 0) {
3507 FREE_LOCK(&lk);
3508 panic("softdep_write_inodeblock: Unknown state 0x%x",
3509 adp->ad_state);
3510 }
3511 #endif /* DIAGNOSTIC */
3512 adp->ad_state &= ~ATTACHED;
3513 adp->ad_state |= UNDONE;
3514 }
3515 /*
3516 * The on-disk inode cannot claim to be any larger than the last
3517 * fragment that has been written. Otherwise, the on-disk inode
3518  * might have fragments that were not the last block in the file,
3519 * which would corrupt the filesystem.
3520 */
3521 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3522 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3523 if (adp->ad_lbn >= NDADDR)
3524 break;
3525 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3526 /* keep going until hitting a rollback to a frag */
3527 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3528 continue;
3529 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3530 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3531 #ifdef DIAGNOSTIC
3532 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3533 FREE_LOCK(&lk);
3534 panic("softdep_write_inodeblock: lost dep1");
3535 }
3536 #endif /* DIAGNOSTIC */
3537 dp->di_db[i] = 0;
3538 }
3539 for (i = 0; i < NIADDR; i++) {
3540 #ifdef DIAGNOSTIC
3541 if (dp->di_ib[i] != 0 &&
3542 (deplist & ((1 << NDADDR) << i)) == 0) {
3543 FREE_LOCK(&lk);
3544 panic("softdep_write_inodeblock: lost dep2");
3545 }
3546 #endif /* DIAGNOSTIC */
3547 dp->di_ib[i] = 0;
3548 }
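/*
 * The size has been rolled back to the fragment and all later
 * pointers have been cleared, so no further rollback is needed.
 */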
3549 return;
3550 }
3551 /*
3552 * If we have zero'ed out the last allocated block of the file,
3553 * roll back the size to the last currently allocated block.
3554  * We know that this last allocated block is full-sized, as
3555 * we already checked for fragments in the loop above.
3556 */
3557 if (lastadp != NULL &&
3558 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3559 for (i = lastadp->ad_lbn; i >= 0; i--)
3560 if (dp->di_db[i] != 0)
3561 break;
3562 dp->di_size = (i + 1) * fs->fs_bsize;
3563 }
3564 /*
3565 * The only dependencies are for indirect blocks.
3566 *
3567 * The file size for indirect block additions is not guaranteed.
3568 * Such a guarantee would be non-trivial to achieve. The conventional
3569 * synchronous write implementation also does not make this guarantee.
3570 * Fsck should catch and fix discrepancies. Arguably, the file size
3571 * can be over-estimated without destroying integrity when the file
3572 * moves into the indirect blocks (i.e., is large). If we want to
3573 * postpone fsck, we are stuck with this argument.
3574 */
3575 for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3576 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3577 }
3578
3579 #ifdef FFS2
3580 /*
3581 * Version of initiate_write_inodeblock that handles FFS2 dinodes.
3582 */
3583 STATIC void
3584 initiate_write_inodeblock_ufs2(inodedep, bp)
3585 struct inodedep *inodedep;
3586 struct buf *bp; /* The inode block */
3587 {
3588 struct allocdirect *adp, *lastadp;
3589 struct ufs2_dinode *dp;
3590 struct fs *fs = inodedep->id_fs;
3591 #ifdef DIAGNOSTIC
3592 daddr64_t prevlbn = -1, d1, d2;
3593 #endif
3594 int deplist, i;
3595
3596 if (inodedep->id_state & IOSTARTED)
3597 panic("initiate_write_inodeblock_ufs2: already started");
3598 inodedep->id_state |= IOSTARTED;
3599 fs = inodedep->id_fs;
3600 dp = (struct ufs2_dinode *)bp->b_data +
3601 ino_to_fsbo(fs, inodedep->id_ino);
3602 /*
3603 * If the bitmap is not yet written, then the allocated
3604 * inode cannot be written to disk.
3605 */
3606 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
3607 if (inodedep->id_savedino2 != NULL)
3608 panic("initiate_write_inodeblock_ufs2: I/O underway");
3609 MALLOC(inodedep->id_savedino2, struct ufs2_dinode *,
3610 sizeof(struct ufs2_dinode), M_INODEDEP, M_WAITOK);
3611 *inodedep->id_savedino2 = *dp;
3612 bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
3613 return;
3614 }
3615 /*
3616 * If no dependencies, then there is nothing to roll back.
3617 */
3618 inodedep->id_savedsize = dp->di_size;
3619 if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
3620 return;
3621
3622 #ifdef notyet
3623 inodedep->id_savedextsize = dp->di_extsize;
3624 if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL &&
3625 TAILQ_FIRST(&inodedep->id_extupdt) == NULL)
3626 return;
3627 /*
3628 * Set the ext data dependencies to busy.
3629 */
3630 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
3631 adp = TAILQ_NEXT(adp, ad_next)) {
3632 #ifdef DIAGNOSTIC
3633 if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3634 FREE_LOCK(&lk);
3635 panic("softdep_write_inodeblock: lbn order");
3636 }
3637 prevlbn = adp->ad_lbn;
3638 if ((d1 = dp->di_extb[adp->ad_lbn]) !=
3639 (d2 = adp->ad_newblkno)) {
3640 FREE_LOCK(&lk);
3641 panic("%s: direct pointer #%ld mismatch %ld != %ld",
3642 "softdep_write_inodeblock", adp->ad_lbn, d1, d2);
3643 }
3644 deplist |= 1 << adp->ad_lbn;
3645 if ((adp->ad_state & ATTACHED) == 0) {
3646 FREE_LOCK(&lk);
3647 panic("softdep_write_inodeblock: Unknown state 0x%x",
3648 adp->ad_state);
3649 }
3650 #endif /* DIAGNOSTIC */
3651 adp->ad_state &= ~ATTACHED;
3652 adp->ad_state |= UNDONE;
3653 }
3654 /*
3655 * The on-disk inode cannot claim to be any larger than the last
3656 * fragment that has been written. Otherwise, the on-disk inode
3657 * might have fragments that were not the last block in the ext
3658  * data, which would corrupt the filesystem.
3659 */
3660 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
3661 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3662 dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
3663 /* keep going until hitting a rollback to a frag */
3664 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3665 continue;
3666 dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3667 for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
3668 #ifdef DIAGNOSTIC
3669 if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) {
3670 FREE_LOCK(&lk);
3671 panic("softdep_write_inodeblock: lost dep1");
3672 }
3673 #endif /* DIAGNOSTIC */
3674 dp->di_extb[i] = 0;
3675 }
3676 lastadp = NULL;
3677 break;
3678 }
3679 /*
3680 * If we have zero'ed out the last allocated block of the ext
3681 * data, roll back the size to the last currently allocated block.
3682  * We know that this last allocated block is full-sized, as
3683 * we already checked for fragments in the loop above.
3684 */
3685 if (lastadp != NULL &&
3686 dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3687 for (i = lastadp->ad_lbn; i >= 0; i--)
3688 if (dp->di_extb[i] != 0)
3689 break;
3690 dp->di_extsize = (i + 1) * fs->fs_bsize;
3691 }
3692 #endif /* notyet */
3693
3694 /*
3695 * Set the file data dependencies to busy.
3696 */
3697 for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3698 adp = TAILQ_NEXT(adp, ad_next)) {
3699 #ifdef DIAGNOSTIC
3700 if (deplist != 0 && prevlbn >= adp->ad_lbn) {
3701 FREE_LOCK(&lk);
3702 panic("softdep_write_inodeblock: lbn order");
3703 }
3704 prevlbn = adp->ad_lbn;
3705 if (adp->ad_lbn < NDADDR &&
3706 (d1 = dp->di_db[adp->ad_lbn]) != (d2 = adp->ad_newblkno)) {
3707 FREE_LOCK(&lk);
3708 panic("%s: direct pointer #%ld mismatch %ld != %ld",
3709 "softdep_write_inodeblock", adp->ad_lbn, d1, d2);
3710 }
3711 if (adp->ad_lbn >= NDADDR &&
3712 (d1 = dp->di_ib[adp->ad_lbn - NDADDR]) !=
3713 (d2 = adp->ad_newblkno)) {
3714 FREE_LOCK(&lk);
3715 panic("%s: indirect pointer #%ld mismatch %ld != %ld",
3716 "softdep_write_inodeblock", adp->ad_lbn - NDADDR,
3717 d1, d2);
3718 }
3719 deplist |= 1 << adp->ad_lbn;
3720 if ((adp->ad_state & ATTACHED) == 0) {
3721 FREE_LOCK(&lk);
3722 panic("softdep_write_inodeblock: Unknown state 0x%x",
3723 adp->ad_state);
3724 }
3725 #endif /* DIAGNOSTIC */
3726 adp->ad_state &= ~ATTACHED;
3727 adp->ad_state |= UNDONE;
3728 }
3729 /*
3730 * The on-disk inode cannot claim to be any larger than the last
3731 * fragment that has been written. Otherwise, the on-disk inode
3732  * might have fragments that were not the last block in the file,
3733 * which would corrupt the filesystem.
3734 */
3735 for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
3736 lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
3737 if (adp->ad_lbn >= NDADDR)
3738 break;
3739 dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
3740 /* keep going until hitting a rollback to a frag */
3741 if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
3742 continue;
3743 dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
3744 for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
3745 #ifdef DIAGNOSTIC
3746 if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) {
3747 FREE_LOCK(&lk);
3748 panic("softdep_write_inodeblock: lost dep2");
3749 }
3750 #endif /* DIAGNOSTIC */
3751 dp->di_db[i] = 0;
3752 }
3753 for (i = 0; i < NIADDR; i++) {
3754 #ifdef DIAGNOSTIC
3755 if (dp->di_ib[i] != 0 &&
3756 (deplist & ((1 << NDADDR) << i)) == 0) {
3757 FREE_LOCK(&lk);
3758 panic("softdep_write_inodeblock: lost dep3");
3759 }
3760 #endif /* DIAGNOSTIC */
3761 dp->di_ib[i] = 0;
3762 }
3763 return;
3764 }
3765 /*
3766 * If we have zero'ed out the last allocated block of the file,
3767 * roll back the size to the last currently allocated block.
3768  * We know that this last allocated block is full-sized, as
3769 * we already checked for fragments in the loop above.
3770 */
3771 if (lastadp != NULL &&
3772 dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
3773 for (i = lastadp->ad_lbn; i >= 0; i--)
3774 if (dp->di_db[i] != 0)
3775 break;
3776 dp->di_size = (i + 1) * fs->fs_bsize;
3777 }
3778 /*
3779 * The only dependencies are for indirect blocks.
3780 *
3781 * The file size for indirect block additions is not guaranteed.
3782 * Such a guarantee would be non-trivial to achieve. The conventional
3783 * synchronous write implementation also does not make this guarantee.
3784 * Fsck should catch and fix discrepancies. Arguably, the file size
3785 * can be over-estimated without destroying integrity when the file
3786 * moves into the indirect blocks (i.e., is large). If we want to
3787 * postpone fsck, we are stuck with this argument.
3788 */
3789 for (; adp; adp = TAILQ_NEXT(adp, ad_next))
3790 dp->di_ib[adp->ad_lbn - NDADDR] = 0;
3791 }
3792 #endif /* FFS2 */
3793
3794 /*
3795 * This routine is called during the completion interrupt
3796 * service routine for a disk write (from the procedure called
3797 * by the device driver to inform the file system caches of
3798 * a request completion). It should be called early in this
3799 * procedure, before the block is made available to other
3800 * processes or other routines are called.
3801 */
3802 void
3803 softdep_disk_write_complete(bp)
3804 struct buf *bp; /* describes the completed disk write */
3805 {
3806 struct worklist *wk;
3807 struct workhead reattach;
3808 struct newblk *newblk;
3809 struct allocindir *aip;
3810 struct allocdirect *adp;
3811 struct indirdep *indirdep;
3812 struct inodedep *inodedep;
3813 struct bmsafemap *bmsafemap;
3814
3815 /*
3816 * If an error occurred while doing the write, then the data
3817 * has not hit the disk and the dependencies cannot be unrolled.
3818 */
3819 if ((bp->b_flags & B_ERROR) && !(bp->b_flags & B_INVAL))
3820 return;
3821
3822 #ifdef DEBUG
3823 if (lk.lkt_held != -1)
3824 panic("softdep_disk_write_complete: lock is held");
3825 lk.lkt_held = -2;
3826 #endif
3827 LIST_INIT(&reattach);
3828 while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
3829 WORKLIST_REMOVE(wk);
3830 switch (wk->wk_type) {
3831
3832 case D_PAGEDEP:
3833 if (handle_written_filepage(WK_PAGEDEP(wk), bp))
3834 WORKLIST_INSERT(&reattach, wk);
3835 continue;
3836
3837 case D_INODEDEP:
3838 if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
3839 WORKLIST_INSERT(&reattach, wk);
3840 continue;
3841
3842 case D_BMSAFEMAP:
3843 bmsafemap = WK_BMSAFEMAP(wk);
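/*
 * The cylinder group bitmap has reached the disk, so every
 * newblk, allocdirect, allocindir and inodedep hanging off this
 * bmsafemap now has its bitmap (DEPCOMPLETE) dependency satisfied.
 */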
3844 while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
3845 newblk->nb_state |= DEPCOMPLETE;
3846 newblk->nb_bmsafemap = NULL;
3847 LIST_REMOVE(newblk, nb_deps);
3848 }
3849 while ((adp =
3850 LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
3851 adp->ad_state |= DEPCOMPLETE;
3852 adp->ad_buf = NULL;
3853 LIST_REMOVE(adp, ad_deps);
3854 handle_allocdirect_partdone(adp);
3855 }
3856 while ((aip =
3857 LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
3858 aip->ai_state |= DEPCOMPLETE;
3859 aip->ai_buf = NULL;
3860 LIST_REMOVE(aip, ai_deps);
3861 handle_allocindir_partdone(aip);
3862 }
3863 while ((inodedep =
3864 LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
3865 inodedep->id_state |= DEPCOMPLETE;
3866 LIST_REMOVE(inodedep, id_deps);
3867 inodedep->id_buf = NULL;
3868 }
3869 WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
3870 continue;
3871
3872 case D_MKDIR:
3873 handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
3874 continue;
3875
3876 case D_ALLOCDIRECT:
3877 adp = WK_ALLOCDIRECT(wk);
3878 adp->ad_state |= COMPLETE;
3879 handle_allocdirect_partdone(adp);
3880 continue;
3881
3882 case D_ALLOCINDIR:
3883 aip = WK_ALLOCINDIR(wk);
3884 aip->ai_state |= COMPLETE;
3885 handle_allocindir_partdone(aip);
3886 continue;
3887
3888 case D_INDIRDEP:
3889 indirdep = WK_INDIRDEP(wk);
3890 if (indirdep->ir_state & GOINGAWAY)
3891 panic("disk_write_complete: indirdep gone");
3892 bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
3893 free(indirdep->ir_saveddata, M_INDIRDEP);
3894 indirdep->ir_saveddata = 0;
3895 indirdep->ir_state &= ~UNDONE;
3896 indirdep->ir_state |= ATTACHED;
3897 while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
3898 handle_allocindir_partdone(aip);
3899 if (aip == LIST_FIRST(&indirdep->ir_donehd))
3900 panic("disk_write_complete: not gone");
3901 }
3902 WORKLIST_INSERT(&reattach, wk);
3903 if ((bp->b_flags & B_DELWRI) == 0)
3904 stat_indir_blk_ptrs++;
3905 buf_dirty(bp);
3906 continue;
3907
3908 default:
3909 panic("handle_disk_write_complete: Unknown type %s",
3910 TYPENAME(wk->wk_type));
3911 /* NOTREACHED */
3912 }
3913 }
3914 /*
3915 * Reattach any requests that must be redone.
3916 */
3917 while ((wk = LIST_FIRST(&reattach)) != NULL) {
3918 WORKLIST_REMOVE(wk);
3919 WORKLIST_INSERT(&bp->b_dep, wk);
3920 }
3921 #ifdef DEBUG
3922 if (lk.lkt_held != -2)
3923 panic("softdep_disk_write_complete: lock lost");
3924 lk.lkt_held = -1;
3925 #endif
3926 }
3927
3928 /*
3929 * Called from within softdep_disk_write_complete above. Note that
3930 * this routine is always called from interrupt level with further
3931 * splbio interrupts blocked.
3932 */
3933 STATIC void
3934 handle_allocdirect_partdone(adp)
3935 struct allocdirect *adp; /* the completed allocdirect */
3936 {
3937 struct allocdirect *listadp;
3938 struct inodedep *inodedep;
3939 long bsize, delay;
3940
3941 splassert(IPL_BIO);
3942
3943 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3944 return;
3945 if (adp->ad_buf != NULL)
3946 panic("handle_allocdirect_partdone: dangling dep");
3947
3948 /*
3949 * The on-disk inode cannot claim to be any larger than the last
3950 * fragment that has been written. Otherwise, the on-disk inode
3951  * might have fragments that were not the last block in the file,
3952 * which would corrupt the filesystem. Thus, we cannot free any
3953  * allocdirects after one whose ad_oldblkno claims a fragment, as
3954 * these blocks must be rolled back to zero before writing the inode.
3955 * We check the currently active set of allocdirects in id_inoupdt.
3956 */
3957 inodedep = adp->ad_inodedep;
3958 bsize = inodedep->id_fs->fs_bsize;
3959 TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) {
3960 /* found our block */
3961 if (listadp == adp)
3962 break;
3963 /* continue if ad_oldlbn is not a fragment */
3964 if (listadp->ad_oldsize == 0 ||
3965 listadp->ad_oldsize == bsize)
3966 continue;
3967 /* hit a fragment */
3968 return;
3969 }
3970 /*
3971 * If we have reached the end of the current list without
3972 * finding the just finished dependency, then it must be
3973 * on the future dependency list. Future dependencies cannot
3974 * be freed until they are moved to the current list.
3975 */
3976 if (listadp == NULL) {
3977 #ifdef DEBUG
3978 TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next)
3979 /* found our block */
3980 if (listadp == adp)
3981 break;
3982 if (listadp == NULL)
3983 panic("handle_allocdirect_partdone: lost dep");
3984 #endif /* DEBUG */
3985 return;
3986 }
3987 /*
3988 * If we have found the just finished dependency, then free
3989 * it along with anything that follows it that is complete.
3990 * If the inode still has a bitmap dependency, then it has
3991 * never been written to disk, hence the on-disk inode cannot
3992 * reference the old fragment so we can free it without delay.
3993 */
3994 delay = (inodedep->id_state & DEPCOMPLETE);
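/*
 * With DEPCOMPLETE set the bitmap has been written, so the
 * on-disk inode may still reference the old blocks; in that case
 * free_allocdirect() is told to delay freeing them. Otherwise
 * they can be released immediately.
 */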
3995 for (; adp; adp = listadp) {
3996 listadp = TAILQ_NEXT(adp, ad_next);
3997 if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
3998 return;
3999 free_allocdirect(&inodedep->id_inoupdt, adp, delay);
4000 }
4001 }
4002
4003 /*
4004 * Called from within softdep_disk_write_complete above. Note that
4005 * this routine is always called from interrupt level with further
4006 * splbio interrupts blocked.
4007 */
4008 STATIC void
4009 handle_allocindir_partdone(aip)
4010 struct allocindir *aip; /* the completed allocindir */
4011 {
4012 struct indirdep *indirdep;
4013
4014 splassert(IPL_BIO);
4015
4016 if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
4017 return;
4018 if (aip->ai_buf != NULL)
4019 panic("handle_allocindir_partdone: dangling dependency");
4020 indirdep = aip->ai_indirdep;
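/*
 * If the indirect block is currently rolled back (UNDONE), park
 * this allocindir on the done list; it is processed again from
 * softdep_disk_write_complete() once the block is re-attached.
 */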
4021 if (indirdep->ir_state & UNDONE) {
4022 LIST_REMOVE(aip, ai_next);
4023 LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
4024 return;
4025 }
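/*
 * Commit the new block number into the saved indirect block,
 * using 32-bit pointers for UFS1 and 64-bit pointers for UFS2.
 */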
4026 if (indirdep->ir_state & UFS1FMT)
4027 ((int32_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4028 aip->ai_newblkno;
4029 else
4030 ((int64_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
4031 aip->ai_newblkno;
4032 LIST_REMOVE(aip, ai_next);
4033 if (aip->ai_freefrag != NULL)
4034 add_to_worklist(&aip->ai_freefrag->ff_list);
4035 WORKITEM_FREE(aip, D_ALLOCINDIR);
4036 }
4037
4038 /*
4039 * Called from within softdep_disk_write_complete above to restore
4040 * in-memory inode block contents to their most up-to-date state. Note
4041 * that this routine is always called from interrupt level with further
4042 * splbio interrupts blocked.
4043 */
4044 STATIC int
4045 handle_written_inodeblock(inodedep, bp)
4046 struct inodedep *inodedep;
4047 struct buf *bp; /* buffer containing the inode block */
4048 {
4049 struct worklist *wk, *filefree;
4050 struct allocdirect *adp, *nextadp;
4051 struct ufs1_dinode *dp1 = NULL;
4052 struct ufs2_dinode *dp2 = NULL;
4053 int hadchanges, fstype;
4054
4055 splassert(IPL_BIO);
4056
4057 if ((inodedep->id_state & IOSTARTED) == 0)
4058 panic("handle_written_inodeblock: not started");
4059 inodedep->id_state &= ~IOSTARTED;
4060
4061 if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
4062 fstype = UM_UFS1;
4063 dp1 = (struct ufs1_dinode *) bp->b_data +
4064 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4065 } else {
4066 fstype = UM_UFS2;
4067 dp2 = (struct ufs2_dinode *) bp->b_data +
4068 ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
4069 }
4070
4071 /*
4072  * If we had to roll back the inode allocation because of
4073 * bitmaps being incomplete, then simply restore it.
4074 * Keep the block dirty so that it will not be reclaimed until
4075 * all associated dependencies have been cleared and the
4076 * corresponding updates written to disk.
4077 */
4078 if (inodedep->id_savedino1 != NULL) {
4079 if (fstype == UM_UFS1)
4080 *dp1 = *inodedep->id_savedino1;
4081 else
4082 *dp2 = *inodedep->id_savedino2;
4083 FREE(inodedep->id_savedino1, M_INODEDEP);
4084 inodedep->id_savedino1 = NULL;
4085 if ((bp->b_flags & B_DELWRI) == 0)
4086 stat_inode_bitmap++;
4087 buf_dirty(bp);
4088 return (1);
4089 }
4090 inodedep->id_state |= COMPLETE;
4091 /*
4092 * Roll forward anything that had to be rolled back before
4093 * the inode could be updated.
4094 */
4095 hadchanges = 0;
4096 for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
4097 nextadp = TAILQ_NEXT(adp, ad_next);
4098 if (adp->ad_state & ATTACHED)
4099 panic("handle_written_inodeblock: new entry");
4100 if (fstype == UM_UFS1) {
4101 if (adp->ad_lbn < NDADDR) {
4102 if (dp1->di_db[adp->ad_lbn] != adp->ad_oldblkno)
4103 panic("%s: %s #%ld mismatch %d != %d",
4104 "handle_written_inodeblock",
4105 "direct pointer", adp->ad_lbn,
4106 dp1->di_db[adp->ad_lbn],
4107 adp->ad_oldblkno);
4108 dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
4109 } else {
4110 if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
4111 panic("%s: %s #%ld allocated as %d",
4112 "handle_written_inodeblock",
4113 "indirect pointer",
4114 adp->ad_lbn - NDADDR,
4115 dp1->di_ib[adp->ad_lbn - NDADDR]);
4116 dp1->di_ib[adp->ad_lbn - NDADDR] =
4117 adp->ad_newblkno;
4118 }
4119 } else {
4120 if (adp->ad_lbn < NDADDR) {
4121 if (dp2->di_db[adp->ad_lbn] != adp->ad_oldblkno)
4122 panic("%s: %s #%ld mismatch %d != %d",
4123 "handle_written_inodeblock",
4124 "direct pointer", adp->ad_lbn,
4125 dp2->di_db[adp->ad_lbn],
4126 adp->ad_oldblkno);
4127 dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
4128 } else {
4129 if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
4130 panic("%s: %s #%ld allocated as %d",
4131 "handle_written_inodeblock",
4132 "indirect pointer",
4133 adp->ad_lbn - NDADDR,
4134 dp2->di_ib[adp->ad_lbn - NDADDR]);
4135 dp2->di_ib[adp->ad_lbn - NDADDR] =
4136 adp->ad_newblkno;
4137 }
4138 }
4139 adp->ad_state &= ~UNDONE;
4140 adp->ad_state |= ATTACHED;
4141 hadchanges = 1;
4142 }
4143 if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
4144 stat_direct_blk_ptrs++;
4145 /*
4146 * Reset the file size to its most up-to-date value.
4147 */
4148 if (inodedep->id_savedsize == -1)
4149 panic("handle_written_inodeblock: bad size");
4150
4151 if (fstype == UM_UFS1) {
4152 if (dp1->di_size != inodedep->id_savedsize) {
4153 dp1->di_size = inodedep->id_savedsize;
4154 hadchanges = 1;
4155 }
4156 } else {
4157 if (dp2->di_size != inodedep->id_savedsize) {
4158 dp2->di_size = inodedep->id_savedsize;
4159 hadchanges = 1;
4160 }
4161 }
4162 inodedep->id_savedsize = -1;
4163 /*
4164 * If there were any rollbacks in the inode block, then it must be
4165  * marked dirty so that it will eventually get written back in
4166 * its correct form.
4167 */
4168 if (hadchanges)
4169 buf_dirty(bp);
4170 /*
4171 * Process any allocdirects that completed during the update.
4172 */
4173 if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
4174 handle_allocdirect_partdone(adp);
4175 /*
4176 * Process deallocations that were held pending until the
4177 * inode had been written to disk. Freeing of the inode
4178 * is delayed until after all blocks have been freed to
4179 * avoid creation of new <vfsid, inum, lbn> triples
4180 * before the old ones have been deleted.
4181 */
4182 filefree = NULL;
4183 while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
4184 WORKLIST_REMOVE(wk);
4185 switch (wk->wk_type) {
4186
4187 case D_FREEFILE:
4188 /*
4189 * We defer adding filefree to the worklist until
4190 * all other additions have been made to ensure
4191 * that it will be done after all the old blocks
4192 * have been freed.
4193 */
4194 if (filefree != NULL)
4195 panic("handle_written_inodeblock: filefree");
4196 filefree = wk;
4197 continue;
4198
4199 case D_MKDIR:
4200 handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
4201 continue;
4202
4203 case D_DIRADD:
4204 diradd_inode_written(WK_DIRADD(wk), inodedep);
4205 continue;
4206
4207 case D_FREEBLKS:
4208 wk->wk_state |= COMPLETE;
4209 if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE)
4210 continue;
4211 /* FALLTHROUGH */
4212 case D_FREEFRAG:
4213 case D_DIRREM:
4214 add_to_worklist(wk);
4215 continue;
4216
4217 case D_NEWDIRBLK:
4218 free_newdirblk(WK_NEWDIRBLK(wk));
4219 continue;
4220
4221 default:
4222 panic("handle_written_inodeblock: Unknown type %s",
4223 TYPENAME(wk->wk_type));
4224 /* NOTREACHED */
4225 }
4226 }
4227 if (filefree != NULL) {
4228 if (free_inodedep(inodedep) == 0)
4229 panic("handle_written_inodeblock: live inodedep");
4230 add_to_worklist(filefree);
4231 return (0);
4232 }
4233
4234 /*
4235 * If no outstanding dependencies, free it.
4236 */
4237 if (free_inodedep(inodedep) || TAILQ_FIRST(&inodedep->id_inoupdt) == 0)
4238 return (0);
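/*
 * A non-zero return asks the caller to leave this inodedep
 * attached to the buffer so that it is reprocessed after the
 * next write of the block.
 */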
4239 return (hadchanges);
4240 }
4241
4242 /*
4243 * Process a diradd entry after its dependent inode has been written.
4244 * This routine must be called with splbio interrupts blocked.
4245 */
4246 STATIC void
4247 diradd_inode_written(dap, inodedep)
4248 struct diradd *dap;
4249 struct inodedep *inodedep;
4250 {
4251 struct pagedep *pagedep;
4252
4253 splassert(IPL_BIO);
4254
4255 dap->da_state |= COMPLETE;
4256 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4257 if (dap->da_state & DIRCHG)
4258 pagedep = dap->da_previous->dm_pagedep;
4259 else
4260 pagedep = dap->da_pagedep;
4261 LIST_REMOVE(dap, da_pdlist);
4262 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4263 }
4264 WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
4265 }
4266
4267 /*
4268 * Handle the completion of a mkdir dependency.
4269 */
4270 STATIC void
4271 handle_written_mkdir(mkdir, type)
4272 struct mkdir *mkdir;
4273 int type;
4274 {
4275 struct diradd *dap;
4276 struct pagedep *pagedep;
4277
4278 splassert(IPL_BIO);
4279
4280 if (mkdir->md_state != type)
4281 panic("handle_written_mkdir: bad type");
4282 dap = mkdir->md_diradd;
4283 dap->da_state &= ~type;
4284 if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
4285 dap->da_state |= DEPCOMPLETE;
4286 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4287 if (dap->da_state & DIRCHG)
4288 pagedep = dap->da_previous->dm_pagedep;
4289 else
4290 pagedep = dap->da_pagedep;
4291 LIST_REMOVE(dap, da_pdlist);
4292 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
4293 }
4294 LIST_REMOVE(mkdir, md_mkdirs);
4295 WORKITEM_FREE(mkdir, D_MKDIR);
4296 }
4297
4298 /*
4299 * Called from within softdep_disk_write_complete above.
4300 * A write operation was just completed. Removed inodes can
4301 * now be freed and associated block pointers may be committed.
4302 * Note that this routine is always called from interrupt level
4303 * with further splbio interrupts blocked.
4304 */
4305 STATIC int
4306 handle_written_filepage(pagedep, bp)
4307 struct pagedep *pagedep;
4308 struct buf *bp; /* buffer containing the written page */
4309 {
4310 struct dirrem *dirrem;
4311 struct diradd *dap, *nextdap;
4312 struct direct *ep;
4313 int i, chgs;
4314
4315 splassert(IPL_BIO);
4316
4317 if ((pagedep->pd_state & IOSTARTED) == 0)
4318 panic("handle_written_filepage: not started");
4319 pagedep->pd_state &= ~IOSTARTED;
4320 /*
4321 * Process any directory removals that have been committed.
4322 */
4323 while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
4324 LIST_REMOVE(dirrem, dm_next);
4325 dirrem->dm_dirinum = pagedep->pd_ino;
4326 add_to_worklist(&dirrem->dm_list);
4327 }
4328 /*
4329 * Free any directory additions that have been committed.
4330 * If it is a newly allocated block, we have to wait until
4331 * the on-disk directory inode claims the new block.
4332 */
4333 if ((pagedep->pd_state & NEWBLOCK) == 0)
4334 while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
4335 free_diradd(dap);
4336 /*
4337 * Uncommitted directory entries must be restored.
4338 */
4339 for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
4340 for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
4341 dap = nextdap) {
4342 nextdap = LIST_NEXT(dap, da_pdlist);
4343 if (dap->da_state & ATTACHED)
4344 panic("handle_written_filepage: attached");
4345 ep = (struct direct *)
4346 ((char *)bp->b_data + dap->da_offset);
4347 ep->d_ino = dap->da_newinum;
4348 dap->da_state &= ~UNDONE;
4349 dap->da_state |= ATTACHED;
4350 chgs = 1;
4351 /*
4352 * If the inode referenced by the directory has
4353 * been written out, then the dependency can be
4354 * moved to the pending list.
4355 */
4356 if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
4357 LIST_REMOVE(dap, da_pdlist);
4358 LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
4359 da_pdlist);
4360 }
4361 }
4362 }
4363 /*
4364 * If there were any rollbacks in the directory, then it must be
4365  * marked dirty so that it will eventually get written back in
4366 * its correct form.
4367 */
4368 if (chgs) {
4369 if ((bp->b_flags & B_DELWRI) == 0)
4370 stat_dir_entry++;
4371 buf_dirty(bp);
4372 return (1);
4373 }
4374 /*
4375 * If we are not waiting for a new directory block to be
4376 * claimed by its inode, then the pagedep will be freed.
4377 * Otherwise it will remain to track any new entries on
4378 * the page in case they are fsync'ed.
4379 */
4380 if ((pagedep->pd_state & NEWBLOCK) == 0) {
4381 LIST_REMOVE(pagedep, pd_hash);
4382 WORKITEM_FREE(pagedep, D_PAGEDEP);
4383 }
4384 return (0);
4385 }
4386
4387 /*
4388 * Writing back in-core inode structures.
4389 *
4390 * The file system only accesses an inode's contents when it occupies an
4391 * "in-core" inode structure. These "in-core" structures are separate from
4392 * the page frames used to cache inode blocks. Only the latter are
4393 * transferred to/from the disk. So, when the updated contents of the
4394 * "in-core" inode structure are copied to the corresponding in-memory inode
4395 * block, the dependencies are also transferred. The following procedure is
4396 * called when copying a dirty "in-core" inode to a cached inode block.
4397 */
4398
4399 /*
4400 * Called when an inode is loaded from disk. If the effective link count
4401 * differed from the actual link count when it was last flushed, then we
4402 * need to ensure that the correct effective link count is put back.
4403 */
4404 void
4405 softdep_load_inodeblock(ip)
4406 struct inode *ip; /* the "in_core" copy of the inode */
4407 {
4408 struct inodedep *inodedep;
4409
4410 /*
4411 * Check for alternate nlink count.
4412 */
4413 ip->i_effnlink = DIP(ip, nlink);
4414 ACQUIRE_LOCK(&lk);
4415 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
4416 FREE_LOCK(&lk);
4417 return;
4418 }
4419 ip->i_effnlink -= inodedep->id_nlinkdelta;
4420 FREE_LOCK(&lk);
4421 }
4422
4423 /*
4424 * This routine is called just before the "in-core" inode
4425 * information is to be copied to the in-memory inode block.
4426 * Recall that an inode block contains several inodes. If
4427 * the force flag is set, then the dependencies will be
4428 * cleared so that the update can always be made. Note that
4429 * the buffer is locked when this routine is called, so we
4430 * will never be in the middle of writing the inode block
4431 * to disk.
4432 */
4433 void
4434 softdep_update_inodeblock(ip, bp, waitfor)
4435 struct inode *ip; /* the "in_core" copy of the inode */
4436 struct buf *bp; /* the buffer containing the inode block */
4437 int waitfor; /* nonzero => update must be allowed */
4438 {
4439 struct inodedep *inodedep;
4440 struct worklist *wk;
4441 int error, gotit;
4442
4443 /*
4444 * If the effective link count is not equal to the actual link
4445 * count, then we must track the difference in an inodedep while
4446 * the inode is (potentially) tossed out of the cache. Otherwise,
4447 * if there is no existing inodedep, then there are no dependencies
4448 * to track.
4449 */
4450 ACQUIRE_LOCK(&lk);
4451 if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
4452 FREE_LOCK(&lk);
4453 if (ip->i_effnlink != DIP(ip, nlink))
4454 panic("softdep_update_inodeblock: bad link count");
4455 return;
4456 }
4457 if (inodedep->id_nlinkdelta != DIP(ip, nlink) - ip->i_effnlink) {
4458 FREE_LOCK(&lk);
4459 panic("softdep_update_inodeblock: bad delta");
4460 }
4461 /*
4462 * Changes have been initiated. Anything depending on these
4463 * changes cannot occur until this inode has been written.
4464 */
4465 inodedep->id_state &= ~COMPLETE;
4466 if ((inodedep->id_state & ONWORKLIST) == 0)
4467 WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
4468 /*
4469 * Any new dependencies associated with the incore inode must
4470 * now be moved to the list associated with the buffer holding
4471  * the in-memory copy of the inode. Once merged, process any
4472 * allocdirects that are completed by the merger.
4473 */
4474 merge_inode_lists(inodedep);
4475 if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
4476 handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
4477 /*
4478 * Now that the inode has been pushed into the buffer, the
4479 * operations dependent on the inode being written to disk
4480 * can be moved to the id_bufwait so that they will be
4481 * processed when the buffer I/O completes.
4482 */
4483 while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
4484 WORKLIST_REMOVE(wk);
4485 WORKLIST_INSERT(&inodedep->id_bufwait, wk);
4486 }
4487 /*
4488 * Newly allocated inodes cannot be written until the bitmap
4489  * that allocates them has been written (indicated by
4490 * DEPCOMPLETE being set in id_state). If we are doing a
4491 * forced sync (e.g., an fsync on a file), we force the bitmap
4492 * to be written so that the update can be done.
4493 */
4494 if ((inodedep->id_state & DEPCOMPLETE) != 0 || waitfor == 0) {
4495 FREE_LOCK(&lk);
4496 return;
4497 }
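/*
 * Force the cylinder group buffer holding the inode's bitmap to
 * disk so that the DEPCOMPLETE dependency clears and the inode
 * itself can then be written.
 */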
4498 bp = inodedep->id_buf;
4499 gotit = getdirtybuf(bp, MNT_WAIT);
4500 FREE_LOCK(&lk);
4501 if (gotit && (error = bwrite(bp)) != 0)
4502 softdep_error("softdep_update_inodeblock: bwrite", error);
4503 if ((inodedep->id_state & DEPCOMPLETE) == 0)
4504 panic("softdep_update_inodeblock: update failed");
4505 }
4506
4507 /*
4508 * Merge the new inode dependency list (id_newinoupdt) into the old
4509 * inode dependency list (id_inoupdt). This routine must be called
4510 * with splbio interrupts blocked.
4511 */
4512 STATIC void
4513 merge_inode_lists(inodedep)
4514 struct inodedep *inodedep;
4515 {
4516 struct allocdirect *listadp, *newadp;
4517
4518 splassert(IPL_BIO);
4519
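/*
 * Both lists are ordered by logical block number; merge the new
 * list into the old one in a single pass, combining entries for
 * the same block with allocdirect_merge().
 */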
4520 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
4521 for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
4522 if (listadp->ad_lbn < newadp->ad_lbn) {
4523 listadp = TAILQ_NEXT(listadp, ad_next);
4524 continue;
4525 }
4526 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
4527 TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
4528 if (listadp->ad_lbn == newadp->ad_lbn) {
4529 allocdirect_merge(&inodedep->id_inoupdt, newadp,
4530 listadp);
4531 listadp = newadp;
4532 }
4533 newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
4534 }
4535 while ((newadp = TAILQ_FIRST(&inodedep->id_newinoupdt)) != NULL) {
4536 TAILQ_REMOVE(&inodedep->id_newinoupdt, newadp, ad_next);
4537 TAILQ_INSERT_TAIL(&inodedep->id_inoupdt, newadp, ad_next);
4538 }
4539 }
4540
4541 /*
4542 * If we are doing an fsync, then we must ensure that any directory
4543 * entries for the inode have been written after the inode gets to disk.
4544 */
4545 int
4546 softdep_fsync(vp)
4547 struct vnode *vp; /* the "in_core" copy of the inode */
4548 {
4549 struct inodedep *inodedep;
4550 struct pagedep *pagedep;
4551 struct worklist *wk;
4552 struct diradd *dap;
4553 struct mount *mnt;
4554 struct vnode *pvp;
4555 struct inode *ip;
4556 struct inode *pip;
4557 struct buf *bp;
4558 struct fs *fs;
4559 struct proc *p = CURPROC; /* XXX */
4560 int error, flushparent;
4561 ino_t parentino;
4562 daddr64_t lbn;
4563
4564 ip = VTOI(vp);
4565 fs = ip->i_fs;
4566 ACQUIRE_LOCK(&lk);
4567 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
4568 FREE_LOCK(&lk);
4569 return (0);
4570 }
4571 if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
4572 LIST_FIRST(&inodedep->id_bufwait) != NULL ||
4573 TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
4574 TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) {
4575 FREE_LOCK(&lk);
4576 panic("softdep_fsync: pending ops");
4577 }
4578 for (error = 0, flushparent = 0; ; ) {
4579 if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
4580 break;
4581 if (wk->wk_type != D_DIRADD) {
4582 FREE_LOCK(&lk);
4583 panic("softdep_fsync: Unexpected type %s",
4584 TYPENAME(wk->wk_type));
4585 }
4586 dap = WK_DIRADD(wk);
4587 /*
4588 * Flush our parent if this directory entry has a MKDIR_PARENT
4589 * dependency or is contained in a newly allocated block.
4590 */
4591 if (dap->da_state & DIRCHG)
4592 pagedep = dap->da_previous->dm_pagedep;
4593 else
4594 pagedep = dap->da_pagedep;
4595 mnt = pagedep->pd_mnt;
4596 parentino = pagedep->pd_ino;
4597 lbn = pagedep->pd_lbn;
4598 if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) {
4599 FREE_LOCK(&lk);
4600 panic("softdep_fsync: dirty");
4601 }
4602 if ((dap->da_state & MKDIR_PARENT) ||
4603 (pagedep->pd_state & NEWBLOCK))
4604 flushparent = 1;
4605 else
4606 flushparent = 0;
4607 /*
4608 * If we are being fsync'ed as part of vgone'ing this vnode,
4609 * then we will not be able to release and recover the
4610 * vnode below, so we just have to give up on writing its
4611 * directory entry out. It will eventually be written, just
4612 * not now, but then the user was not asking to have it
4613 * written, so we are not breaking any promises.
4614 */
4615 if (vp->v_flag & VXLOCK)
4616 break;
4617 /*
4618 * We prevent deadlock by always fetching inodes from the
4619 * root, moving down the directory tree. Thus, when fetching
4620 * our parent directory, we must unlock ourselves before
4621 * requesting the lock on our parent. See the comment in
4622 * ufs_lookup for details on possible races.
4623 */
4624 FREE_LOCK(&lk);
4625 VOP_UNLOCK(vp, 0, p);
4626 error = VFS_VGET(mnt, parentino, &pvp);
4627 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
4628 if (error != 0)
4629 return (error);
4630 /*
4631 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
4632 * that are contained in direct blocks will be resolved by
4633 * doing a UFS_UPDATE. Pagedeps contained in indirect blocks
4634 * may require a complete sync'ing of the directory. So, we
4635 * try the cheap and fast UFS_UPDATE first, and if that fails,
4636 * then we do the slower VOP_FSYNC of the directory.
4637 */
4638 pip = VTOI(pvp);
4639 if (flushparent) {
4640 error = UFS_UPDATE(pip, MNT_WAIT);
4641 if (error) {
4642 vput(pvp);
4643 return (error);
4644 }
4645 if (pagedep->pd_state & NEWBLOCK) {
4646 error = VOP_FSYNC(pvp, p->p_ucred, MNT_WAIT, p);
4647 if (error) {
4648 vput(pvp);
4649 return (error);
4650 }
4651 }
4652 }
4653 /*
4654 * Flush directory page containing the inode's name.
4655 */
4656 error = bread(pvp, lbn, fs->fs_bsize, p->p_ucred, &bp);
4657 if (error == 0) {
4658 bp->b_bcount = blksize(fs, pip, lbn);
4659 error = bwrite(bp);
4660 } else
4661 brelse(bp);
4662 vput(pvp);
4663 if (error != 0)
4664 return (error);
4665 ACQUIRE_LOCK(&lk);
4666 if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
4667 break;
4668 }
4669 FREE_LOCK(&lk);
4670 return (0);
4671 }
4672
4673 /*
4674 * Flush all the dirty bitmaps associated with the block device
4675 * before flushing the rest of the dirty blocks so as to reduce
4676 * the number of dependencies that will have to be rolled back.
4677 */
4678 void
4679 softdep_fsync_mountdev(vp, waitfor)
4680 struct vnode *vp;
4681 int waitfor;
4682 {
4683 struct buf *bp, *nbp;
4684 struct worklist *wk;
4685
4686 if (!vn_isdisk(vp, NULL))
4687 panic("softdep_fsync_mountdev: vnode not a disk");
4688 ACQUIRE_LOCK(&lk);
4689 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
4690 nbp = LIST_NEXT(bp, b_vnbufs);
4691 /*
4692 * If it is already scheduled, skip to the next buffer.
4693 */
4694 if (bp->b_flags & B_BUSY)
4695 continue;
4696 bp->b_flags |= B_BUSY;
4697
4698 if ((bp->b_flags & B_DELWRI) == 0) {
4699 FREE_LOCK(&lk);
4700 panic("softdep_fsync_mountdev: not dirty");
4701 }
4702 /*
4703 * We are only interested in bitmaps with outstanding
4704 * dependencies.
4705 */
4706 if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
4707 wk->wk_type != D_BMSAFEMAP) {
4708 bp->b_flags &= ~B_BUSY;
4709 continue;
4710 }
4711 bremfree(bp);
4712 FREE_LOCK(&lk);
4713 (void) bawrite(bp);
4714 ACQUIRE_LOCK(&lk);
4715 /*
4716 * Since we may have slept during the I/O, we need
4717 * to start from a known point.
4718 */
4719 nbp = LIST_FIRST(&vp->v_dirtyblkhd);
4720 }
4721 if (waitfor == MNT_WAIT)
4722 drain_output(vp, 1);
4723 FREE_LOCK(&lk);
4724 }
4725
4726 /*
4727 * This routine is called when we are trying to synchronously flush a
4728 * file. This routine must eliminate any filesystem metadata dependencies
4729 * so that the syncing routine can succeed by pushing the dirty blocks
4730 * associated with the file. If any I/O errors occur, they are returned.
4731 */
4732 int
4733 softdep_sync_metadata(ap)
4734 struct vop_fsync_args /* {
4735 struct vnode *a_vp;
4736 struct ucred *a_cred;
4737 int a_waitfor;
4738 struct proc *a_p;
4739 } */ *ap;
4740 {
4741 struct vnode *vp = ap->a_vp;
4742 struct pagedep *pagedep;
4743 struct allocdirect *adp;
4744 struct allocindir *aip;
4745 struct buf *bp, *nbp;
4746 struct worklist *wk;
4747 int i, error, waitfor;
4748
4749 /*
4750 * Check whether this vnode is involved in a filesystem
4751 * that is doing soft dependency processing.
4752 */
4753 if (!vn_isdisk(vp, NULL)) {
4754 if (!DOINGSOFTDEP(vp))
4755 return (0);
4756 } else
4757 if (vp->v_specmountpoint == NULL ||
4758 (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP) == 0)
4759 return (0);
4760 /*
4761 * Ensure that any direct block dependencies have been cleared.
4762 */
4763 ACQUIRE_LOCK(&lk);
4764 if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
4765 FREE_LOCK(&lk);
4766 return (error);
4767 }
4768 /*
4769 * For most files, the only metadata dependencies are the
4770 * cylinder group maps that allocate their inode or blocks.
4771 * The block allocation dependencies can be found by traversing
4772 * the dependency lists for any buffers that remain on their
4773 * dirty buffer list. The inode allocation dependency will
4774 * be resolved when the inode is updated with MNT_WAIT.
4775 * This work is done in two passes. The first pass grabs most
4776 * of the buffers and begins asynchronously writing them. The
4777 * only way to wait for these asynchronous writes is to sleep
4778 * on the filesystem vnode which may stay busy for a long time
4779 * if the filesystem is active. So, instead, we make a second
4780 * pass over the dependencies blocking on each write. In the
4781 * usual case we will be blocking against a write that we
4782 * initiated, so when it is done the dependency will have been
4783 * resolved. Thus the second pass is expected to end quickly.
4784 */
4785 waitfor = MNT_NOWAIT;
4786 top:
4787 /*
4788 * We must wait for any I/O in progress to finish so that
4789 * all potential buffers on the dirty list will be visible.
4790 */
4791 drain_output(vp, 1);
4792 bp = LIST_FIRST(&vp->v_dirtyblkhd);
4793 if (getdirtybuf(bp, MNT_WAIT) == 0) {
4794 FREE_LOCK(&lk);
4795 return (0);
4796 }
4797 loop:
4798 /*
4799 * As we hold the buffer locked, none of its dependencies
4800 * will disappear.
4801 */
4802 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
4803 switch (wk->wk_type) {
4804
4805 case D_ALLOCDIRECT:
4806 adp = WK_ALLOCDIRECT(wk);
4807 if (adp->ad_state & DEPCOMPLETE)
4808 break;
4809 nbp = adp->ad_buf;
4810 if (getdirtybuf(nbp, waitfor) == 0)
4811 break;
4812 FREE_LOCK(&lk);
4813 if (waitfor == MNT_NOWAIT) {
4814 bawrite(nbp);
4815 } else if ((error = VOP_BWRITE(nbp)) != 0) {
4816 bawrite(bp);
4817 return (error);
4818 }
4819 ACQUIRE_LOCK(&lk);
4820 break;
4821
4822 case D_ALLOCINDIR:
4823 aip = WK_ALLOCINDIR(wk);
4824 if (aip->ai_state & DEPCOMPLETE)
4825 break;
4826 nbp = aip->ai_buf;
4827 if (getdirtybuf(nbp, waitfor) == 0)
4828 break;
4829 FREE_LOCK(&lk);
4830 if (waitfor == MNT_NOWAIT) {
4831 bawrite(nbp);
4832 } else if ((error = VOP_BWRITE(nbp)) != 0) {
4833 bawrite(bp);
4834 return (error);
4835 }
4836 ACQUIRE_LOCK(&lk);
4837 break;
4838
4839 case D_INDIRDEP:
4840 restart:
4841
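/*
 * Locking or writing a child buffer may sleep, during which the
 * dependency list can change, so rescan it from the head after
 * every iteration that drops the lock.
 */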
4842 LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
4843 if (aip->ai_state & DEPCOMPLETE)
4844 continue;
4845 nbp = aip->ai_buf;
4846 if (getdirtybuf(nbp, MNT_WAIT) == 0)
4847 goto restart;
4848 FREE_LOCK(&lk);
4849 if ((error = VOP_BWRITE(nbp)) != 0) {
4850 bawrite(bp);
4851 return (error);
4852 }
4853 ACQUIRE_LOCK(&lk);
4854 goto restart;
4855 }
4856 break;
4857
4858 case D_INODEDEP:
4859 if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
4860 WK_INODEDEP(wk)->id_ino)) != 0) {
4861 FREE_LOCK(&lk);
4862 bawrite(bp);
4863 return (error);
4864 }
4865 break;
4866
4867 case D_PAGEDEP:
4868 /*
4869 * We are trying to sync a directory that may
4870  * have dependencies on its own metadata
4871  * and/or on the inodes of any
4872 * recently allocated files. We walk its diradd
4873 * lists pushing out the associated inode.
4874 */
4875 pagedep = WK_PAGEDEP(wk);
4876 for (i = 0; i < DAHASHSZ; i++) {
4877 if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
4878 continue;
4879 if ((error =
4880 flush_pagedep_deps(vp, pagedep->pd_mnt,
4881 &pagedep->pd_diraddhd[i]))) {
4882 FREE_LOCK(&lk);
4883 bawrite(bp);
4884 return (error);
4885 }
4886 }
4887 break;
4888
4889 case D_MKDIR:
4890 /*
4891 * This case should never happen if the vnode has
4892 * been properly sync'ed. However, if this function
4893 * is used at a place where the vnode has not yet
4894 * been sync'ed, this dependency can show up. So,
4895 * rather than panic, just flush it.
4896 */
4897 nbp = WK_MKDIR(wk)->md_buf;
4898 if (getdirtybuf(nbp, waitfor) == 0)
4899 break;
4900 FREE_LOCK(&lk);
4901 if (waitfor == MNT_NOWAIT) {
4902 bawrite(nbp);
4903 } else if ((error = VOP_BWRITE(nbp)) != 0) {
4904 bawrite(bp);
4905 return (error);
4906 }
4907 ACQUIRE_LOCK(&lk);
4908 break;
4909
4910 case D_BMSAFEMAP:
4911 /*
4912 * This case should never happen if the vnode has
4913 * been properly sync'ed. However, if this function
4914 * is used at a place where the vnode has not yet
4915 * been sync'ed, this dependency can show up. So,
4916 * rather than panic, just flush it.
4917 */
4918 nbp = WK_BMSAFEMAP(wk)->sm_buf;
4919 if (getdirtybuf(nbp, waitfor) == 0)
4920 break;
4921 FREE_LOCK(&lk);
4922 if (waitfor == MNT_NOWAIT) {
4923 bawrite(nbp);
4924 } else if ((error = VOP_BWRITE(nbp)) != 0) {
4925 bawrite(bp);
4926 return (error);
4927 }
4928 ACQUIRE_LOCK(&lk);
4929 break;
4930
4931 default:
4932 FREE_LOCK(&lk);
4933 panic("softdep_sync_metadata: Unknown type %s",
4934 TYPENAME(wk->wk_type));
4935 /* NOTREACHED */
4936 }
4937 }
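/*
 * Lock the next buffer on the dirty list before writing this
 * one so that our place in the list survives dropping the
 * softdep lock around the write.
 */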
4938 nbp = LIST_NEXT(bp, b_vnbufs);
4939 getdirtybuf(nbp, MNT_WAIT);
4940 FREE_LOCK(&lk);
4941 bawrite(bp);
4942 ACQUIRE_LOCK(&lk);
4943 if (nbp != NULL) {
4944 bp = nbp;
4945 goto loop;
4946 }
4947 /*
4948  * The brief unlock is to allow any pent-up dependency
4949 * processing to be done. Then proceed with the second pass.
4950 */
4951 if (waitfor == MNT_NOWAIT) {
4952 waitfor = MNT_WAIT;
4953 FREE_LOCK(&lk);
4954 ACQUIRE_LOCK(&lk);
4955 goto top;
4956 }
4957
4958 /*
4959 * If we have managed to get rid of all the dirty buffers,
4960 * then we are done. For certain directories and block
4961 * devices, we may need to do further work.
4962 *
4963 * We must wait for any I/O in progress to finish so that
4964 * all potential buffers on the dirty list will be visible.
4965 */
4966 drain_output(vp, 1);
4967 if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
4968 FREE_LOCK(&lk);
4969 return (0);
4970 }
4971
4972 FREE_LOCK(&lk);
4973 /*
4974 * If we are trying to sync a block device, some of its buffers may
4975 * contain metadata that cannot be written until the contents of some
4976 * partially written files have been written to disk. The only easy
4977 * way to accomplish this is to sync the entire filesystem (luckily
4978 * this happens rarely).
4979 */
4980 if (vn_isdisk(vp, NULL) &&
4981 vp->v_specmountpoint && !VOP_ISLOCKED(vp) &&
4982 (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred,
4983 ap->a_p)) != 0)
4984 return (error);
4985 return (0);
4986 }
4987
4988 /*
4989 * Flush the dependencies associated with an inodedep.
4990 * Called with splbio blocked.
4991 */
4992 STATIC int
4993 flush_inodedep_deps(fs, ino)
4994 struct fs *fs;
4995 ino_t ino;
4996 {
4997 struct inodedep *inodedep;
4998 struct allocdirect *adp;
4999 int error, waitfor;
5000 struct buf *bp;
5001
5002 splassert(IPL_BIO);
5003
5004 /*
5005 * This work is done in two passes. The first pass grabs most
5006 * of the buffers and begins asynchronously writing them. The
5007 * only way to wait for these asynchronous writes is to sleep
5008 * on the filesystem vnode which may stay busy for a long time
5009 * if the filesystem is active. So, instead, we make a second
5010 * pass over the dependencies blocking on each write. In the
5011 * usual case we will be blocking against a write that we
5012 * initiated, so when it is done the dependency will have been
5013 * resolved. Thus the second pass is expected to end quickly.
5014 * We give a brief window at the top of the loop to allow
5015 * any pending I/O to complete.
5016 */
5017 for (waitfor = MNT_NOWAIT; ; ) {
5018 FREE_LOCK(&lk);
5019 ACQUIRE_LOCK(&lk);
5020 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
5021 return (0);
5022 TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) {
5023 if (adp->ad_state & DEPCOMPLETE)
5024 continue;
5025 bp = adp->ad_buf;
5026 if (getdirtybuf(bp, waitfor) == 0) {
5027 if (waitfor == MNT_NOWAIT)
5028 continue;
5029 break;
5030 }
5031 FREE_LOCK(&lk);
5032 if (waitfor == MNT_NOWAIT) {
5033 bawrite(bp);
5034 } else if ((error = VOP_BWRITE(bp)) != 0) {
5035 ACQUIRE_LOCK(&lk);
5036 return (error);
5037 }
5038 ACQUIRE_LOCK(&lk);
5039 break;
5040 }
5041 if (adp != NULL)
5042 continue;
5043 TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) {
5044 if (adp->ad_state & DEPCOMPLETE)
5045 continue;
5046 bp = adp->ad_buf;
5047 if (getdirtybuf(bp, waitfor) == 0) {
5048 if (waitfor == MNT_NOWAIT)
5049 continue;
5050 break;
5051 }
5052 FREE_LOCK(&lk);
5053 if (waitfor == MNT_NOWAIT) {
5054 bawrite(bp);
5055 } else if ((error = VOP_BWRITE(bp)) != 0) {
5056 ACQUIRE_LOCK(&lk);
5057 return (error);
5058 }
5059 ACQUIRE_LOCK(&lk);
5060 break;
5061 }
5062 if (adp != NULL)
5063 continue;
5064 /*
5065  * If we have just completed the second pass, we are done; otherwise, start pass 2.
5066 */
5067 if (waitfor == MNT_WAIT)
5068 break;
5069 waitfor = MNT_WAIT;
5070 }
5071 /*
5072 * Try freeing inodedep in case all dependencies have been removed.
5073 */
5074 if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
5075 (void) free_inodedep(inodedep);
5076 return (0);
5077 }
5078
5079 /*
5080 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
5081 * Called with splbio blocked.
5082 */
5083 STATIC int
5084 flush_pagedep_deps(pvp, mp, diraddhdp)
5085 struct vnode *pvp;
5086 struct mount *mp;
5087 struct diraddhd *diraddhdp;
5088 {
5089 struct proc *p = CURPROC; /* XXX */
5090 struct worklist *wk;
5091 struct inodedep *inodedep;
5092 struct ufsmount *ump;
5093 struct diradd *dap;
5094 struct vnode *vp;
5095 int gotit, error = 0;
5096 struct buf *bp;
5097 ino_t inum;
5098
5099 splassert(IPL_BIO);
5100
5101 ump = VFSTOUFS(mp);
5102 while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
5103 /*
5104 * Flush ourselves if this directory entry
5105 * has a MKDIR_PARENT dependency.
5106 */
5107 if (dap->da_state & MKDIR_PARENT) {
5108 FREE_LOCK(&lk);
5109 if ((error = UFS_UPDATE(VTOI(pvp), MNT_WAIT)))
5110 break;
5111 ACQUIRE_LOCK(&lk);
5112 /*
5113 * If that cleared dependencies, go on to next.
5114 */
5115 if (dap != LIST_FIRST(diraddhdp))
5116 continue;
5117 if (dap->da_state & MKDIR_PARENT) {
5118 FREE_LOCK(&lk);
5119 panic("flush_pagedep_deps: MKDIR_PARENT");
5120 }
5121 }
5122 /*
5123 * A newly allocated directory must have its "." and
5124 * ".." entries written out before its name can be
5125 * committed in its parent. We do not want or need
5126 * the full semantics of a synchronous VOP_FSYNC as
5127 * that may end up here again, once for each directory
5128 * level in the filesystem. Instead, we push the blocks
5129 * and wait for them to clear. We have to fsync twice
5130 * because the first call may choose to defer blocks
5131 * that still have dependencies, but deferral will
5132 * happen at most once.
5133 */
5134 inum = dap->da_newinum;
5135 if (dap->da_state & MKDIR_BODY) {
5136 FREE_LOCK(&lk);
5137 if ((error = VFS_VGET(mp, inum, &vp)) != 0)
5138 break;
5139 if ((error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) ||
5140 (error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) {
5141 vput(vp);
5142 break;
5143 }
5144 drain_output(vp, 0);
5145 /*
5146 * If first block is still dirty with a D_MKDIR
5147 * dependency then it needs to be written now.
5148 */
5149 for (;;) {
5150 error = 0;
5151 ACQUIRE_LOCK(&lk);
5152 bp = incore(vp, 0);
5153 if (bp == NULL) {
5154 FREE_LOCK(&lk);
5155 break;
5156 }
5157 LIST_FOREACH(wk, &bp->b_dep, wk_list)
5158 if (wk->wk_type == D_MKDIR)
5159 break;
5160 if (wk) {
5161 gotit = getdirtybuf(bp, MNT_WAIT);
5162 FREE_LOCK(&lk);
5163 if (gotit && (error = bwrite(bp)) != 0)
5164 break;
5165 } else
5166 FREE_LOCK(&lk);
5167 break;
5168 }
5169 vput(vp);
5170 /* Flushing of first block failed */
5171 if (error)
5172 break;
5173 ACQUIRE_LOCK(&lk);
5174 /*
5175 * If that cleared dependencies, go on to next.
5176 */
5177 if (dap != LIST_FIRST(diraddhdp))
5178 continue;
5179 if (dap->da_state & MKDIR_BODY) {
5180 FREE_LOCK(&lk);
5181 panic("flush_pagedep_deps: MKDIR_BODY");
5182 }
5183 }
5184 /*
5185 * Flush the inode on which the directory entry depends.
5186 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
5187 * the only remaining dependency is that the updated inode
5188 * count must get pushed to disk. The inode has already
5189 * been pushed into its inode buffer (via VOP_UPDATE) at
5190 * the time of the reference count change. So we need only
5191 * locate that buffer, ensure that there will be no rollback
5192 * caused by a bitmap dependency, then write the inode buffer.
5193 */
5194 if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) {
5195 FREE_LOCK(&lk);
5196 panic("flush_pagedep_deps: lost inode");
5197 }
5198 /*
5199 * If the inode still has bitmap dependencies,
5200 * push them to disk.
5201 */
5202 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5203 bp = inodedep->id_buf;
5204 gotit = getdirtybuf(bp, MNT_WAIT);
5205 FREE_LOCK(&lk);
5206 if (gotit && (error = bwrite(bp)) != 0)
5207 break;
5208 ACQUIRE_LOCK(&lk);
5209 if (dap != LIST_FIRST(diraddhdp))
5210 continue;
5211 }
5212 /*
5213 * If the inode is still sitting in a buffer waiting
5214 * to be written, push it to disk.
5215 */
5216 FREE_LOCK(&lk);
5217 if ((error = bread(ump->um_devvp,
5218 fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
5219 (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
5220 brelse(bp);
5221 break;
5222 }
5223 if ((error = bwrite(bp)) != 0)
5224 break;
5225 ACQUIRE_LOCK(&lk);
5226 /*
5227 * If we have failed to get rid of all the dependencies
5228 * then something is seriously wrong.
5229 */
5230 if (dap == LIST_FIRST(diraddhdp)) {
5231 FREE_LOCK(&lk);
5232 panic("flush_pagedep_deps: flush failed");
5233 }
5234 }
5235 if (error)
5236 ACQUIRE_LOCK(&lk);
5237 return (error);
5238 }
5239
5240 /*
5241 * A large burst of file addition or deletion activity can drive the
5242 * memory load excessively high. First attempt to slow things down
5243 * using the techniques below. If that fails, this routine requests
5244 * the offending operations to fall back to running synchronously
5245 * until the memory load returns to a reasonable level.
5246 */
5247 int
5248 softdep_slowdown(vp)
5249 struct vnode *vp;
5250 {
5251 int max_softdeps_hard;
5252
5253 max_softdeps_hard = max_softdeps * 11 / 10;
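	/*
	 * For example, if max_softdeps were 8192, max_softdeps_hard would
	 * be 9011, so the slowdown below kicks in once num_dirrem reaches
	 * 4505 or num_inodedep reaches 9011.
	 */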
5254 if (num_dirrem < max_softdeps_hard / 2 &&
5255 num_inodedep < max_softdeps_hard)
5256 return (0);
5257 stat_sync_limit_hit += 1;
5258 return (1);
5259 }
5260
5261 /*
5262 * If memory utilization has gotten too high, deliberately slow things
5263 * down and speed up the I/O processing.
5264 */
5265 STATIC int
5266 request_cleanup(resource, islocked)
5267 int resource;
5268 int islocked;
5269 {
5270 struct proc *p = CURPROC;
5271 int s;
5272
5273 /*
5274 * We never hold up the filesystem syncer process, nor a process that
5274 * is already servicing the worklist (P_SOFTDEP set).
5275 */
5276 if (p == filesys_syncer || (p->p_flag & P_SOFTDEP))
5277 return (0);
5278 /*
5279 * First check to see if the work list has gotten backlogged.
5280 * If it has, co-opt this process to help clean up two entries.
5281 * Because this process may hold inodes locked, we cannot
5282 * handle any remove requests that might block on a locked
5283 * inode as that could lead to deadlock. We set P_SOFTDEP
5284 * to avoid recursively processing the worklist.
5285 */
5286 if (num_on_worklist > max_softdeps / 10) {
5287 atomic_setbits_int(&p->p_flag, P_SOFTDEP);
5288 if (islocked)
5289 FREE_LOCK(&lk);
5290 process_worklist_item(NULL, LK_NOWAIT);
5291 process_worklist_item(NULL, LK_NOWAIT);
5292 atomic_clearbits_int(&p->p_flag, P_SOFTDEP);
5293 stat_worklist_push += 2;
5294 if (islocked)
5295 ACQUIRE_LOCK(&lk);
5296 return (1);
5297 }
5298 /*
5299 * Next, we attempt to speed up the syncer process. If that
5300 * is successful, then we allow the process to continue.
5301 */
5302 if (speedup_syncer())
5303 return (0);
5304 /*
5305 * If we are resource constrained on inode dependencies, try
5306 * flushing some dirty inodes. Otherwise, we are constrained
5307 * by file deletions, so try accelerating flushes of directories
5308 * with removal dependencies. We would like to do the cleanup
5309 * here, but we probably hold an inode locked at this point and
5310 * that might deadlock against one that we try to clean. So,
5311 * the best that we can do is request the syncer daemon to do
5312 * the cleanup for us.
5313 */
5314 switch (resource) {
5315
5316 case FLUSH_INODES:
5317 stat_ino_limit_push += 1;
5318 req_clear_inodedeps += 1;
5319 stat_countp = &stat_ino_limit_hit;
5320 break;
5321
5322 case FLUSH_REMOVE:
5323 stat_blk_limit_push += 1;
5324 req_clear_remove += 1;
5325 stat_countp = &stat_blk_limit_hit;
5326 break;
5327
5328 default:
5329 if (islocked)
5330 FREE_LOCK(&lk);
5331 panic("request_cleanup: unknown type");
5332 }
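	/*
	 * The req_clear_inodedeps and req_clear_remove requests set above
	 * are presumably serviced by the worklist-processing code, which
	 * calls clear_inodedeps() or clear_remove() on the syncer's behalf.
	 */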
5333 /*
5334 * Hopefully the syncer daemon will catch up and awaken us.
5335 * We wait at most tickdelay clock ticks before proceeding in any case.
5336 */
5337 if (islocked == 0)
5338 ACQUIRE_LOCK(&lk);
5339 proc_waiting += 1;
5340 if (!timeout_pending(&proc_waiting_timeout))
5341 timeout_add(&proc_waiting_timeout, tickdelay > 2 ? tickdelay : 2);
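	/*
	 * tickdelay is expressed in clock ticks; clamping it to a minimum
	 * of two ticks guarantees that pause_timer() fires in the future
	 * and eventually releases the waiters below.
	 */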
5342
5343 s = FREE_LOCK_INTERLOCKED(&lk);
5344 (void) tsleep((caddr_t)&proc_waiting, PPAUSE, "softupdate", 0);
5345 ACQUIRE_LOCK_INTERLOCKED(&lk, s);
5346 proc_waiting -= 1;
5347 if (islocked == 0)
5348 FREE_LOCK(&lk);
5349 return (1);
5350 }
5351
5352 /*
5353 * Awaken one of the processes pausing in request_cleanup and
5354 * reschedule the timer if other processes are still waiting.
5355 */
5356 void
5357 pause_timer(arg)
5358 void *arg;
5359 {
5360
5361 *stat_countp += 1;
5362 wakeup_one(&proc_waiting);
5363 if (proc_waiting > 0)
5364 timeout_add(&proc_waiting_timeout, tickdelay > 2 ? tickdelay : 2);
5365 }
5366
5367 /*
5368 * Flush out a directory with at least one removal dependency in an effort to
5369 * reduce the number of dirrem, freefile, and freeblks dependency structures.
5370 */
5371 STATIC void
5372 clear_remove(p)
5373 struct proc *p;
5374 {
5375 struct pagedep_hashhead *pagedephd;
5376 struct pagedep *pagedep;
5377 static int next = 0;
5378 struct mount *mp;
5379 struct vnode *vp;
5380 int error, cnt;
5381 ino_t ino;
5382
5383 ACQUIRE_LOCK(&lk);
5384 for (cnt = 0; cnt < pagedep_hash; cnt++) {
5385 pagedephd = &pagedep_hashtbl[next++];
5386 if (next >= pagedep_hash)
5387 next = 0;
5388 LIST_FOREACH(pagedep, pagedephd, pd_hash) {
5389 if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
5390 continue;
5391 mp = pagedep->pd_mnt;
5392 ino = pagedep->pd_ino;
5393 #if 0
5394 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5395 continue;
5396 #endif
5397 FREE_LOCK(&lk);
5398 if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
5399 softdep_error("clear_remove: vget", error);
5400 #if 0
5401 vn_finished_write(mp);
5402 #endif
5403 return;
5404 }
5405 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
5406 softdep_error("clear_remove: fsync", error);
5407 drain_output(vp, 0);
5408 vput(vp);
5409 #if 0
5410 vn_finished_write(mp);
5411 #endif
5412 return;
5413 }
5414 }
5415 FREE_LOCK(&lk);
5416 }
5417
5418 /*
5419 * Clear out a block of dirty inodes in an effort to reduce
5420 * the number of inodedep dependency structures.
5421 */
5422 STATIC void
5423 clear_inodedeps(p)
5424 struct proc *p;
5425 {
5426 struct inodedep_hashhead *inodedephd;
5427 struct inodedep *inodedep;
5428 static int next = 0;
5429 struct mount *mp;
5430 struct vnode *vp;
5431 struct fs *fs;
5432 int error, cnt;
5433 ino_t firstino, lastino, ino;
5434
5435 ACQUIRE_LOCK(&lk);
5436 /*
5437 * Pick an inode dependency to be cleared, cycling round-robin
5437 * through the hash chains.
5438 * We will then gather up all the inodes in its block
5439 * that have dependencies and flush them out.
5440 */
5441 for (cnt = 0; cnt < inodedep_hash; cnt++) {
5442 inodedephd = &inodedep_hashtbl[next++];
5443 if (next >= inodedep_hash)
5444 next = 0;
5445 if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
5446 break;
5447 }
5448 if (inodedep == NULL) {
5449 FREE_LOCK(&lk);
5450 return;
5451 }
5452 /*
5453 * Ugly code to find mount point given pointer to superblock.
5454 */
5455 fs = inodedep->id_fs;
5456 CIRCLEQ_FOREACH(mp, &mountlist, mnt_list)
5457 if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
5458 break;
5459 /*
5460 * Find the last inode in the block with dependencies.
5461 */
5462 firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
5463 for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
5464 if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
5465 break;
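	/*
	 * For example, with INOPB(fs) == 64 (always a power of two) and
	 * id_ino == 200, the mask above gives firstino == 192, and lastino
	 * scans down from 255 for the last inode in the block that still
	 * has an inodedep.
	 */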
5466 /*
5467 * Asynchronously push all but the last inode with dependencies.
5468 * Synchronously push the last inode with dependencies to ensure
5469 * that the inode block gets written to free up the inodedeps.
5470 */
5471 for (ino = firstino; ino <= lastino; ino++) {
5472 if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
5473 continue;
5474 FREE_LOCK(&lk);
5475 #if 0
5476 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
5477 continue;
5478 #endif
5479 if ((error = VFS_VGET(mp, ino, &vp)) != 0) {
5480 softdep_error("clear_inodedeps: vget", error);
5481 #if 0
5482 vn_finished_write(mp);
5483 #endif
5484 return;
5485 }
5486 if (ino == lastino) {
5487 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_WAIT, p)))
5488 softdep_error("clear_inodedeps: fsync1", error);
5489 } else {
5490 if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)))
5491 softdep_error("clear_inodedeps: fsync2", error);
5492 drain_output(vp, 0);
5493 }
5494 vput(vp);
5495 #if 0
5496 vn_finished_write(mp);
5497 #endif
5498 ACQUIRE_LOCK(&lk);
5499 }
5500 FREE_LOCK(&lk);
5501 }
5502
5503 /*
5504 * Function to determine if the buffer has outstanding dependencies
5505 * that will cause a roll-back if the buffer is written. If wantcount
5506 * is set, return number of dependencies, otherwise just yes or no.
5507 */
5508 int
5509 softdep_count_dependencies(bp, wantcount, islocked)
5510 struct buf *bp;
5511 int wantcount;
5512 int islocked;
5513 {
5514 struct worklist *wk;
5515 struct inodedep *inodedep;
5516 struct indirdep *indirdep;
5517 struct allocindir *aip;
5518 struct pagedep *pagedep;
5519 struct diradd *dap;
5520 int i, retval;
5521
5522 retval = 0;
5523 if (!islocked)
5524 ACQUIRE_LOCK(&lk);
5525 LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5526 switch (wk->wk_type) {
5527
5528 case D_INODEDEP:
5529 inodedep = WK_INODEDEP(wk);
5530 if ((inodedep->id_state & DEPCOMPLETE) == 0) {
5531 /* bitmap allocation dependency */
5532 retval += 1;
5533 if (!wantcount)
5534 goto out;
5535 }
5536 if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
5537 /* direct block pointer dependency */
5538 retval += 1;
5539 if (!wantcount)
5540 goto out;
5541 }
5542 continue;
5543
5544 case D_INDIRDEP:
5545 indirdep = WK_INDIRDEP(wk);
5546
5547 LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
5548 /* indirect block pointer dependency */
5549 retval += 1;
5550 if (!wantcount)
5551 goto out;
5552 }
5553 continue;
5554
5555 case D_PAGEDEP:
5556 pagedep = WK_PAGEDEP(wk);
5557 for (i = 0; i < DAHASHSZ; i++) {
5558
5559 LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
5560 /* directory entry dependency */
5561 retval += 1;
5562 if (!wantcount)
5563 goto out;
5564 }
5565 }
5566 continue;
5567
5568 case D_BMSAFEMAP:
5569 case D_ALLOCDIRECT:
5570 case D_ALLOCINDIR:
5571 case D_MKDIR:
5572 /* never a dependency on these blocks */
5573 continue;
5574
5575 default:
5576 if (!islocked)
5577 FREE_LOCK(&lk);
5578 panic("softdep_check_for_rollback: Unexpected type %s",
5579 TYPENAME(wk->wk_type));
5580 /* NOTREACHED */
5581 }
5582 }
5583 out:
5584 if (!islocked)
5585 FREE_LOCK(&lk);
5586 return retval;
5587 }
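/*
 * A caller deciding whether an immediate write of a buffer would be rolled
 * back could, hypothetically, use the yes/no form:
 *
 *	if (softdep_count_dependencies(bp, 0, 0) > 0)
 *		(defer the write and pick another buffer)
 */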
5588
5589 /*
5590 * Acquire exclusive access to a buffer.
5591 * Must be called with splbio blocked.
5592 * Return 1 if buffer was acquired.
5593 */
5594 STATIC int
5595 getdirtybuf(bp, waitfor)
5596 struct buf *bp;
5597 int waitfor;
5598 {
5599 int s;
5600
5601 if (bp == NULL)
5602 return (0);
5603
5604 splassert(IPL_BIO);
5605
5606 for (;;) {
5607 if ((bp->b_flags & B_BUSY) == 0)
5608 break;
5609 if (waitfor != MNT_WAIT)
5610 return (0);
5611 bp->b_flags |= B_WANTED;
5612 s = FREE_LOCK_INTERLOCKED(&lk);
5613 tsleep((caddr_t)bp, PRIBIO + 1, "sdsdty", 0);
5614 ACQUIRE_LOCK_INTERLOCKED(&lk, s);
5615 }
5616 if ((bp->b_flags & B_DELWRI) == 0)
5617 return (0);
5618 bremfree(bp);
5619 bp->b_flags |= B_BUSY;
5620 return (1);
5621 }
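/*
 * Typical use, as in the flush routines above: acquire the buffer while
 * holding the softdep lock, drop the lock, then write the buffer out:
 *
 *	gotit = getdirtybuf(bp, MNT_WAIT);
 *	FREE_LOCK(&lk);
 *	if (gotit && (error = bwrite(bp)) != 0)
 *		break;
 */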
5622
5623 /*
5624 * Wait for pending output on a vnode to complete.
5625 * Must be called with vnode locked.
5626 */
5627 STATIC void
5628 drain_output(vp, islocked)
5629 struct vnode *vp;
5630 int islocked;
5631 {
5632 int s;
5633
5634 if (!islocked)
5635 ACQUIRE_LOCK(&lk);
5636
5637 splassert(IPL_BIO);
5638
5639 while (vp->v_numoutput) {
5640 vp->v_bioflag |= VBIOWAIT;
5641 s = FREE_LOCK_INTERLOCKED(&lk);
5642 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "drain_output", 0);
5643 ACQUIRE_LOCK_INTERLOCKED(&lk, s);
5644 }
5645 if (!islocked)
5646 FREE_LOCK(&lk);
5647 }
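/*
 * The wakeup waited for above presumably comes from the biodone()/vwakeup()
 * path, which decrements v_numoutput and, once it reaches zero with
 * VBIOWAIT set, issues a wakeup on &vp->v_numoutput.
 */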
5648
5649 /*
5650 * Called whenever a buffer that is being invalidated or reallocated
5651 * contains dependencies. This should only happen if an I/O error has
5652 * occurred. The routine is called with the buffer locked.
5653 */
5654 void
5655 softdep_deallocate_dependencies(bp)
5656 struct buf *bp;
5657 {
5658
5659 if ((bp->b_flags & B_ERROR) == 0)
5660 panic("softdep_deallocate_dependencies: dangling deps");
5661 softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
5662 panic("softdep_deallocate_dependencies: unrecovered I/O error");
5663 }
5664
5665 /*
5666 * Function to handle asynchronous write errors in the filesystem.
5667 */
5668 void
5669 softdep_error(func, error)
5670 char *func;
5671 int error;
5672 {
5673
5674 /* XXX should do something better! */
5675 printf("%s: got error %d while accessing filesystem\n", func, error);
5676 }
5677
5678 #ifdef DDB
5679 #include <machine/db_machdep.h>
5680 #include <ddb/db_interface.h>
5681 #include <ddb/db_output.h>
5682
5683 void
5684 softdep_print(struct buf *bp, int full, int (*pr)(const char *, ...))
5685 {
5686 struct worklist *wk;
5687
5688 (*pr)(" deps:\n");
5689 LIST_FOREACH(wk, &bp->b_dep, wk_list)
5690 worklist_print(wk, full, pr);
5691 }
5692
5693 void
5694 worklist_print(struct worklist *wk, int full, int (*pr)(const char *, ...))
5695 {
5696 struct pagedep *pagedep;
5697 struct inodedep *inodedep;
5698 struct newblk *newblk;
5699 struct bmsafemap *bmsafemap;
5700 struct allocdirect *adp;
5701 struct indirdep *indirdep;
5702 struct allocindir *aip;
5703 struct freefrag *freefrag;
5704 struct freeblks *freeblks;
5705 struct freefile *freefile;
5706 struct diradd *dap;
5707 struct mkdir *mkdir;
5708 struct dirrem *dirrem;
5709 struct newdirblk *newdirblk;
5710 char prefix[33];
5711 int i;
5712
5713 for (prefix[i = 2 * MIN(16, full)] = '\0'; i--; prefix[i] = ' ')
5714 ;
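	/*
	 * The loop above builds "prefix" as a string of 2 * MIN(16, full)
	 * spaces, used to indent this entry and the continuation lines
	 * printed below.
	 */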
5715
5716 (*pr)("%s%s(%p) state %b\n%s", prefix, TYPENAME(wk->wk_type), wk,
5717 wk->wk_state, DEP_BITS, prefix);
5718 switch (wk->wk_type) {
5719 case D_PAGEDEP:
5720 pagedep = WK_PAGEDEP(wk);
5721 (*pr)("mount %p ino %u lbn %lld\n", pagedep->pd_mnt,
5722 pagedep->pd_ino, pagedep->pd_lbn);
5723 break;
5724 case D_INODEDEP:
5725 inodedep = WK_INODEDEP(wk);
5726 (*pr)("fs %p ino %u nlinkdelta %u dino %p\n"
5727 "%s bp %p savsz %lld\n", inodedep->id_fs,
5728 inodedep->id_ino, inodedep->id_nlinkdelta,
5729 inodedep->id_un.idu_savedino1,
5730 prefix, inodedep->id_buf, inodedep->id_savedsize);
5731 break;
5732 case D_NEWBLK:
5733 newblk = WK_NEWBLK(wk);
5734 (*pr)("fs %p newblk %d state %d bmsafemap %p\n",
5735 newblk->nb_fs, newblk->nb_newblkno, newblk->nb_state,
5736 newblk->nb_bmsafemap);
5737 break;
5738 case D_BMSAFEMAP:
5739 bmsafemap = WK_BMSAFEMAP(wk);
5740 (*pr)("buf %p\n", bmsafemap->sm_buf);
5741 break;
5742 case D_ALLOCDIRECT:
5743 adp = WK_ALLOCDIRECT(wk);
5744 (*pr)("lbn %lld newlbk %d oldblk %d newsize %lu olsize %lu\n"
5745 "%s bp %p inodedep %p freefrag %p\n", adp->ad_lbn,
5746 adp->ad_newblkno, adp->ad_oldblkno, adp->ad_newsize,
5747 adp->ad_oldsize,
5748 prefix, adp->ad_buf, adp->ad_inodedep, adp->ad_freefrag);
5749 break;
5750 case D_INDIRDEP:
5751 indirdep = WK_INDIRDEP(wk);
5752 (*pr)("savedata %p savebp %p\n", indirdep->ir_saveddata,
5753 indirdep->ir_savebp);
5754 break;
5755 case D_ALLOCINDIR:
5756 aip = WK_ALLOCINDIR(wk);
5757 (*pr)("off %d newblk %d oldblk %d freefrag %p\n"
5758 "%s indirdep %p buf %p\n", aip->ai_offset,
5759 aip->ai_newblkno, aip->ai_oldblkno, aip->ai_freefrag,
5760 prefix, aip->ai_indirdep, aip->ai_buf);
5761 break;
5762 case D_FREEFRAG:
5763 freefrag = WK_FREEFRAG(wk);
5764 (*pr)("vnode %p mp %p blkno %d fsize %ld ino %u\n",
5765 freefrag->ff_devvp, freefrag->ff_mnt, freefrag->ff_blkno,
5766 freefrag->ff_fragsize, freefrag->ff_inum);
5767 break;
5768 case D_FREEBLKS:
5769 freeblks = WK_FREEBLKS(wk);
5770 (*pr)("previno %u devvp %p mp %p oldsz %lld newsz %lld\n"
5771 "%s chkcnt %d uid %d\n", freeblks->fb_previousinum,
5772 freeblks->fb_devvp, freeblks->fb_mnt, freeblks->fb_oldsize,
5773 freeblks->fb_newsize,
5774 prefix, freeblks->fb_chkcnt, freeblks->fb_uid);
5775 break;
5776 case D_FREEFILE:
5777 freefile = WK_FREEFILE(wk);
5778 (*pr)("mode %x oldino %u vnode %p mp %p\n", freefile->fx_mode,
5779 freefile->fx_oldinum, freefile->fx_devvp, freefile->fx_mnt);
5780 break;
5781 case D_DIRADD:
5782 dap = WK_DIRADD(wk);
5783 (*pr)("off %ld ino %u da_un %p\n", dap->da_offset,
5784 dap->da_newinum, dap->da_un.dau_previous);
5785 break;
5786 case D_MKDIR:
5787 mkdir = WK_MKDIR(wk);
5788 (*pr)("diradd %p bp %p\n", mkdir->md_diradd, mkdir->md_buf);
5789 break;
5790 case D_DIRREM:
5791 dirrem = WK_DIRREM(wk);
5792 (*pr)("mp %p ino %u dm_un %p\n", dirrem->dm_mnt,
5793 dirrem->dm_oldinum, dirrem->dm_un.dmu_pagedep);
5794 break;
5795 case D_NEWDIRBLK:
5796 newdirblk = WK_NEWDIRBLK(wk);
5797 (*pr)("pagedep %p\n", newdirblk->db_pagedep);
5798 break;
5799 }
5800 }
5801 #endif