root/uvm/uvm_swap.c


DEFINITIONS

This source file includes the following definitions:
  1. uvm_swap_init
  2. uvm_swap_initcrypt_all
  3. uvm_swap_initcrypt
  4. uvm_swap_allocpages
  5. uvm_swap_freepages
  6. uvm_swap_markdecrypt
  7. uvm_swap_needdecrypt
  8. swaplist_insert
  9. swaplist_find
  10. swaplist_trim
  11. swapdrum_add
  12. swapdrum_getsdp
  13. sys_swapctl
  14. swap_on
  15. swap_off
  16. swread
  17. swwrite
  18. swstrategy
  19. sw_reg_strategy
  20. sw_reg_start
  21. sw_reg_iodone
  22. uvm_swap_alloc
  23. uvm_swap_markbad
  24. uvm_swap_free
  25. uvm_swap_put
  26. uvm_swap_get
  27. uvm_swap_io
  28. swapmount

    1 /*      $OpenBSD: uvm_swap.c,v 1.72 2007/06/18 21:51:15 pedro Exp $     */
    2 /*      $NetBSD: uvm_swap.c,v 1.40 2000/11/17 11:39:39 mrg Exp $        */
    3 
    4 /*
    5  * Copyright (c) 1995, 1996, 1997 Matthew R. Green
    6  * All rights reserved.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  * 3. The name of the author may not be used to endorse or promote products
   17  *    derived from this software without specific prior written permission.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
   20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
   23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   25  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   27  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   29  * SUCH DAMAGE.
   30  *
   31  * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
   32  * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
   33  */
   34 
   35 #include <sys/param.h>
   36 #include <sys/systm.h>
   37 #include <sys/buf.h>
   38 #include <sys/conf.h>
   39 #include <sys/proc.h>
   40 #include <sys/namei.h>
   41 #include <sys/disklabel.h>
   42 #include <sys/errno.h>
   43 #include <sys/kernel.h>
   44 #include <sys/malloc.h>
   45 #include <sys/vnode.h>
   46 #include <sys/file.h>
   47 #include <sys/extent.h>
   48 #include <sys/mount.h>
   49 #include <sys/pool.h>
   50 #include <sys/syscallargs.h>
   51 #include <sys/swap.h>
   52 
   53 #include <uvm/uvm.h>
   54 #ifdef UVM_SWAP_ENCRYPT
   55 #include <sys/syslog.h>
   56 #endif
   57 
   58 #include <miscfs/specfs/specdev.h>
   59 
   60 /*
   61  * uvm_swap.c: manage configuration and i/o to swap space.
   62  */
   63 
   64 /*
   65  * swap space is managed in the following way:
   66  * 
   67  * each swap partition or file is described by a "swapdev" structure.
   68  * each "swapdev" structure contains a "swapent" structure which contains
   69  * information that is passed up to the user (via system calls).
   70  *
   71  * each swap partition is assigned a "priority" (int) which controls
   72  * swap partition usage.
   73  *
   74  * the system maintains a global data structure describing all swap
   75  * partitions/files.   there is a sorted LIST of "swappri" structures
   76  * which describe "swapdev"'s at that priority.   this LIST is headed
   77  * by the "swap_priority" global var.    each "swappri" contains a 
   78  * CIRCLEQ of "swapdev" structures at that priority.
   79  *
   80  * locking:
   81  *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
   82  *    system call and prevents the swap priority list from changing
   83  *    while we are in the middle of a system call (e.g. SWAP_STATS).
   84  *  - uvm.swap_data_lock (simple_lock): this lock protects all swap data
   85  *    structures including the priority list, the swapdev structures,
   86  *    and the swapmap extent.
   87  *
   88  * each swap device has the following info:
   89  *  - swap device in use (could be disabled, preventing future use)
   90  *  - swap enabled (allows new allocations on swap)
   91  *  - map info in /dev/drum
   92  *  - vnode pointer
   93  * for swap files only:
   94  *  - block size
   95  *  - max byte count in buffer
   96  *  - buffer
   97  *  - credentials to use when doing i/o to file
   98  *
   99  * userland controls and configures swap with the swapctl(2) system call.
   100  * sys_swapctl() performs the following operations:
  101  *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
  102  *  [2] SWAP_STATS: given a pointer to an array of swapent structures 
  103  *      (passed in via "arg") of a size passed in via "misc" ... we load
  104  *      the current swap config into the array.
  105  *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
  106  *      priority in "misc", start swapping on it.
  107  *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
  108  *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
  109  *      "misc")
  110  */
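
/*
 * A minimal userland sketch of the SWAP_NSWAP and SWAP_STATS flow
 * described above, assuming the swapctl(2) library call and the
 * swapent field names from <sys/swap.h>; error handling is trimmed.
 * Fenced with #if 0 since it is not kernel code:
 */
#if 0
#include <sys/types.h>
#include <sys/swap.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
        struct swapent *sep;
        int i, nswap;

        /* [1] SWAP_NSWAP: number of configured swap devices */
        if ((nswap = swapctl(SWAP_NSWAP, NULL, 0)) < 1)
                return (0);

        /* [2] SWAP_STATS: load the current swap config into our array */
        if ((sep = calloc(nswap, sizeof(*sep))) == NULL)
                return (1);
        nswap = swapctl(SWAP_STATS, sep, nswap);
        for (i = 0; i < nswap; i++)
                printf("%s: %d/%d blocks in use, priority %d\n",
                    sep[i].se_path, sep[i].se_inuse, sep[i].se_nblks,
                    sep[i].se_priority);
        free(sep);
        return (0);
}
#endif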
  111 
  112 /*
  113  * swapdev: describes a single swap partition/file
  114  *
  115  * note the following should be true:
  116  * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
  117  * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
  118  */
  119 struct swapdev {
  120         struct swapent  swd_se;
  121 #define swd_dev         swd_se.se_dev           /* device id */
  122 #define swd_flags       swd_se.se_flags         /* flags:inuse/enable/fake */
  123 #define swd_priority    swd_se.se_priority      /* our priority */
   124 #define swd_inuse       swd_se.se_inuse         /* blocks in use */
   125 #define swd_nblks       swd_se.se_nblks         /* total blocks */
  126         char                    *swd_path;      /* saved pathname of device */
  127         int                     swd_pathlen;    /* length of pathname */
  128         int                     swd_npages;     /* #pages we can use */
  129         int                     swd_npginuse;   /* #pages in use */
  130         int                     swd_npgbad;     /* #pages bad */
  131         int                     swd_drumoffset; /* page0 offset in drum */
  132         int                     swd_drumsize;   /* #pages in drum */
  133         struct extent           *swd_ex;        /* extent for this swapdev */
  134         char                    swd_exname[12]; /* name of extent above */
  135         struct vnode            *swd_vp;        /* backing vnode */
  136         CIRCLEQ_ENTRY(swapdev)  swd_next;       /* priority circleq */
  137 
  138         int                     swd_bsize;      /* blocksize (bytes) */
  139         int                     swd_maxactive;  /* max active i/o reqs */
  140         struct buf              swd_tab;        /* buffer list */
  141         struct ucred            *swd_cred;      /* cred for file access */
  142 #ifdef UVM_SWAP_ENCRYPT
  143 #define SWD_KEY_SHIFT           7               /* One key per 0.5 MByte */
  144 #define SWD_KEY(x,y)            &((x)->swd_keys[((y) - (x)->swd_drumoffset) >> SWD_KEY_SHIFT])
  145 
  146 #define SWD_DCRYPT_SHIFT        5
  147 #define SWD_DCRYPT_BITS         32
  148 #define SWD_DCRYPT_MASK         (SWD_DCRYPT_BITS - 1)
  149 #define SWD_DCRYPT_OFF(x)       ((x) >> SWD_DCRYPT_SHIFT)
  150 #define SWD_DCRYPT_BIT(x)       ((x) & SWD_DCRYPT_MASK)
  151 #define SWD_DCRYPT_SIZE(x)      (SWD_DCRYPT_OFF((x) + SWD_DCRYPT_MASK) * sizeof(u_int32_t))
  152         u_int32_t               *swd_decrypt;   /* bitmap for decryption */
  153         struct swap_key         *swd_keys;      /* keys for different parts */
  154         int                     swd_nkeys;      /* active keys */
  155 #endif
  156 };
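
/*
 * A worked example of the UVM_SWAP_ENCRYPT bookkeeping above
 * (illustrative figures, assuming 4KB pages): a 256MB swap partition
 * has npages = 65536.  SWD_DCRYPT_SIZE(65536) = ((65536 + 31) >> 5) *
 * sizeof(u_int32_t) = 2048 * 4 = 8192 bytes of decrypt bitmap; that is
 * the "8KB for a 256MB partition" figure quoted in uvm_swap_initcrypt()
 * below.  One swap_key covers 1 << SWD_KEY_SHIFT = 128 pages, i.e.
 * 0.5 MByte, so the same partition needs 65536 >> 7 = 512 keys.
 */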
  157 
  158 /*
  159  * swap device priority entry; the list is kept sorted on `spi_priority'.
  160  */
  161 struct swappri {
  162         int                     spi_priority;     /* priority */
  163         CIRCLEQ_HEAD(spi_swapdev, swapdev)      spi_swapdev;
  164         /* circleq of swapdevs at this priority */
  165         LIST_ENTRY(swappri)     spi_swappri;      /* global list of pri's */
  166 };
  167 
  168 /*
  169  * The following two structures are used to keep track of data transfers
  170  * on swap devices associated with regular files.
  171  * NOTE: this code is more or less a copy of vnd.c; we use the same
   172  * structure names here to ease porting.
  173  */
  174 struct vndxfer {
  175         struct buf      *vx_bp;         /* Pointer to parent buffer */
  176         struct swapdev  *vx_sdp;
  177         int             vx_error;
  178         int             vx_pending;     /* # of pending aux buffers */
  179         int             vx_flags;
  180 #define VX_BUSY         1
  181 #define VX_DEAD         2
  182 };
  183 
  184 struct vndbuf {
  185         struct buf      vb_buf;
  186         struct vndxfer  *vb_xfer;
  187 };
  188 
  189 
  190 /*
   191  * We keep a pool of vndbufs and a pool of vndxfer structures.
  192  */
  193 struct pool vndxfer_pool;
  194 struct pool vndbuf_pool;
  195 
  196 #define getvndxfer(vnx) do {                                            \
  197         int s = splbio();                                               \
  198         vnx = pool_get(&vndxfer_pool, PR_WAITOK);                       \
  199         splx(s);                                                        \
  200 } while (0)
  201 
   202 #define putvndxfer(vnx) do {                                            \
   203         pool_put(&vndxfer_pool, (void *)(vnx));                         \
   204 } while (0)
  205 
  206 #define getvndbuf(vbp)  do {                                            \
  207         int s = splbio();                                               \
  208         vbp = pool_get(&vndbuf_pool, PR_WAITOK);                        \
  209         splx(s);                                                        \
  210 } while (0)
  211 
   212 #define putvndbuf(vbp) do {                                             \
   213         pool_put(&vndbuf_pool, (void *)(vbp));                          \
   214 } while (0)
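
/*
 * The get macros above raise to splbio() around pool_get() because the
 * same pools are also touched from block-I/O context: sw_reg_strategy()
 * below calls putvndbuf() and putvndxfer() while at splbio, and
 * sw_reg_iodone() runs at I/O completion time.  Allocating at low spl
 * without blocking interrupts would race with those frees.
 */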
  215 
  216 /* /dev/drum */
  217 bdev_decl(sw);
  218 cdev_decl(sw);
  219 
  220 /*
  221  * local variables
  222  */
  223 static struct extent *swapmap;          /* controls the mapping of /dev/drum */
  224 
  225 /* list of all active swap devices [by priority] */
  226 LIST_HEAD(swap_priority, swappri);
  227 static struct swap_priority swap_priority;
  228 
  229 /* locks */
  230 struct rwlock swap_syscall_lock = RWLOCK_INITIALIZER("swplk");
  231 
  232 /*
  233  * prototypes
  234  */
  235 static void              swapdrum_add(struct swapdev *, int);
  236 static struct swapdev   *swapdrum_getsdp(int);
  237 
  238 static struct swapdev   *swaplist_find(struct vnode *, int);
  239 static void              swaplist_insert(struct swapdev *, 
  240                                              struct swappri *, int);
  241 static void              swaplist_trim(void);
  242 
  243 static int swap_on(struct proc *, struct swapdev *);
  244 static int swap_off(struct proc *, struct swapdev *);
  245 
  246 static void sw_reg_strategy(struct swapdev *, struct buf *, int);
  247 static void sw_reg_iodone(struct buf *);
  248 static void sw_reg_start(struct swapdev *);
  249 
  250 static int uvm_swap_io(struct vm_page **, int, int, int);
  251 
  252 static void swapmount(void);
  253 
  254 #ifdef UVM_SWAP_ENCRYPT
  255 /* for swap encrypt */
  256 boolean_t uvm_swap_allocpages(struct vm_page **, int);
  257 void uvm_swap_markdecrypt(struct swapdev *, int, int, int);
  258 boolean_t uvm_swap_needdecrypt(struct swapdev *, int);
  259 void uvm_swap_initcrypt(struct swapdev *, int);
  260 #endif
  261 
  262 /*
  263  * uvm_swap_init: init the swap system data structures and locks
  264  *
  265  * => called at boot time from init_main.c after the filesystems 
  266  *      are brought up (which happens after uvm_init())
  267  */
  268 void
  269 uvm_swap_init()
  270 {
  271         UVMHIST_FUNC("uvm_swap_init");
  272 
  273         UVMHIST_CALLED(pdhist);
  274         /*
  275          * first, init the swap list, its counter, and its lock.
  276          * then get a handle on the vnode for /dev/drum by using
   277          * its dev_t number ("swapdev", from MD conf.c).
  278          */
  279 
  280         LIST_INIT(&swap_priority);
  281         uvmexp.nswapdev = 0;
  282         simple_lock_init(&uvm.swap_data_lock);
  283 
  284         if (!swapdev_vp && bdevvp(swapdev, &swapdev_vp))
  285                 panic("uvm_swap_init: can't get vnode for swap device");
  286 
  287         /*
  288          * create swap block resource map to map /dev/drum.   the range
  289          * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
  290          * that block 0 is reserved (used to indicate an allocation 
  291          * failure, or no allocation).
  292          */
  293         swapmap = extent_create("swapmap", 1, INT_MAX,
  294                                 M_VMSWAP, 0, 0, EX_NOWAIT);
  295         if (swapmap == 0)
  296                 panic("uvm_swap_init: extent_create failed");
  297 
  298         /*
  299          * allocate pools for structures used for swapping to files.
  300          */
  301 
  302 
  303         pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
  304             NULL);
  305 
  306         pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
  307             NULL);
  308 
  309         /*
  310          * Setup the initial swap partition
  311          */
  312         swapmount();
  313 
  314         /*
  315          * done!
  316          */
  317         UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
  318 }
  319 
  320 #ifdef UVM_SWAP_ENCRYPT
  321 void
  322 uvm_swap_initcrypt_all(void)
  323 {
  324         struct swapdev *sdp;
  325         struct swappri *spp;
  326 
  327         simple_lock(&uvm.swap_data_lock);
  328 
  329         LIST_FOREACH(spp, &swap_priority, spi_swappri) {
  330                 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next)
  331                         if (sdp->swd_decrypt == NULL)
  332                                 uvm_swap_initcrypt(sdp, sdp->swd_npages);
  333         }
  334         simple_unlock(&uvm.swap_data_lock);
  335 }
  336 
  337 void
  338 uvm_swap_initcrypt(struct swapdev *sdp, int npages)
  339 {
  340         /*
   341          * keep track of whether a page needs to be decrypted when we
   342          * get it back from the swap device.
   343          * We cannot defer the malloc: if we are doing ASYNC puts,
   344          * we may not call malloc with M_WAITOK.  This consumes only
   345          * 8KB of memory for a 256MB swap partition.
  346          */
  347         sdp->swd_decrypt = malloc(SWD_DCRYPT_SIZE(npages), M_VMSWAP, M_WAITOK);
  348         memset(sdp->swd_decrypt, 0, SWD_DCRYPT_SIZE(npages));
  349         sdp->swd_keys = malloc((npages >> SWD_KEY_SHIFT) * sizeof(struct swap_key),
  350                                M_VMSWAP, M_WAITOK);
  351         memset(sdp->swd_keys, 0, (npages >> SWD_KEY_SHIFT) * sizeof(struct swap_key));
  352         sdp->swd_nkeys = 0;
  353 }
  354 
  355 boolean_t
  356 uvm_swap_allocpages(struct vm_page **pps, int npages)
  357 {
  358         int i, s;
  359         int minus, reserve;
  360         boolean_t fail;
  361 
  362         /* Estimate if we will succeed */
  363         s = uvm_lock_fpageq();
  364 
  365         minus = uvmexp.free - npages;
  366         reserve = uvmexp.reserve_kernel;
   367         fail = minus < reserve;
  368 
  369         uvm_unlock_fpageq(s);
  370 
  371         if (fail)
  372                 return FALSE;
  373 
  374         /* Get new pages */
  375         for (i = 0; i < npages; i++) {
  376                 pps[i] = uvm_pagealloc(NULL, 0, NULL, 0);
  377                 if (pps[i] == NULL)
  378                         break;
  379         }
  380 
  381         /* On failure free and return */
  382         if (i < npages) {
  383                 uvm_swap_freepages(pps, i);
  384                 return FALSE;
  385         }
  386 
  387         return TRUE;
  388 }
  389 
  390 void
  391 uvm_swap_freepages(struct vm_page **pps, int npages)
  392 {
  393         int i;
  394 
  395         uvm_lock_pageq();
  396         for (i = 0; i < npages; i++)
  397                 uvm_pagefree(pps[i]);
  398         uvm_unlock_pageq();
  399 }
  400 
  401 /*
  402  * Mark pages on the swap device for later decryption
  403  */
  404 
  405 void
  406 uvm_swap_markdecrypt(struct swapdev *sdp, int startslot, int npages,
  407                      int decrypt)
  408 {
  409         int pagestart, i;
  410         int off, bit;
  411         
  412         if (!sdp)
  413                 return;
  414 
  415         pagestart = startslot - sdp->swd_drumoffset;
  416         for (i = 0; i < npages; i++, pagestart++) {
  417                 off = SWD_DCRYPT_OFF(pagestart);
  418                 bit = SWD_DCRYPT_BIT(pagestart);
  419                 if (decrypt)
  420                         /* pages read need decryption */
  421                         sdp->swd_decrypt[off] |= 1 << bit;
  422                 else 
  423                         /* pages read do not need decryption */
  424                         sdp->swd_decrypt[off] &= ~(1 << bit);
  425         }
  426 }
  427 
  428 /*
  429  * Check if the page that we got from disk needs to be decrypted
  430  */
  431 
  432 boolean_t
  433 uvm_swap_needdecrypt(struct swapdev *sdp, int off)
  434 {
  435         if (!sdp)
  436                 return FALSE;
  437 
  438         off -= sdp->swd_drumoffset;
  439         return sdp->swd_decrypt[SWD_DCRYPT_OFF(off)] & (1 << SWD_DCRYPT_BIT(off)) ?
  440                 TRUE : FALSE;
  441 }
  442 #endif /* UVM_SWAP_ENCRYPT */
  443 /*
  444  * swaplist functions: functions that operate on the list of swap
  445  * devices on the system.
  446  */
  447 
  448 /*
  449  * swaplist_insert: insert swap device "sdp" into the global list
  450  *
  451  * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
  452  * => caller must provide a newly malloc'd swappri structure (we will
   453  *      FREE it if we don't need it... this is to prevent malloc from
   454  *      blocking here while adding swap)
  455  */
  456 static void
  457 swaplist_insert(sdp, newspp, priority)
  458         struct swapdev *sdp;
  459         struct swappri *newspp;
  460         int priority;
  461 {
  462         struct swappri *spp, *pspp;
  463         UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);
  464 
  465         /*
  466          * find entry at or after which to insert the new device.
  467          */
  468         for (pspp = NULL, spp = LIST_FIRST(&swap_priority); spp != NULL;
  469              spp = LIST_NEXT(spp, spi_swappri)) {
  470                 if (priority <= spp->spi_priority)
  471                         break;
  472                 pspp = spp;
  473         }
  474 
  475         /*
  476          * new priority?
  477          */
  478         if (spp == NULL || spp->spi_priority != priority) {
  479                 spp = newspp;  /* use newspp! */
  480                 UVMHIST_LOG(pdhist, "created new swappri = %ld",
  481                             priority, 0, 0, 0);
  482 
  483                 spp->spi_priority = priority;
  484                 CIRCLEQ_INIT(&spp->spi_swapdev);
  485 
  486                 if (pspp)
  487                         LIST_INSERT_AFTER(pspp, spp, spi_swappri);
  488                 else
  489                         LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
  490         } else {
  491                 /* we don't need a new priority structure, free it */
  492                 FREE(newspp, M_VMSWAP);
  493         }
  494 
  495         /*
  496          * priority found (or created).   now insert on the priority's
  497          * circleq list and bump the total number of swapdevs.
  498          */
  499         sdp->swd_priority = priority;
  500         CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
  501         uvmexp.nswapdev++;
  502 }
  503 
  504 /*
  505  * swaplist_find: find and optionally remove a swap device from the
  506  *      global list.
  507  *
  508  * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
  509  * => we return the swapdev we found (and removed)
  510  */
  511 static struct swapdev *
  512 swaplist_find(vp, remove)
  513         struct vnode *vp;
  514         boolean_t remove;
  515 {
  516         struct swapdev *sdp;
  517         struct swappri *spp;
  518 
  519         /*
  520          * search the lists for the requested vp
  521          */
  522         for (spp = LIST_FIRST(&swap_priority); spp != NULL;
  523              spp = LIST_NEXT(spp, spi_swappri)) {
  524                 for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
  525                      sdp != (void *)&spp->spi_swapdev;
  526                      sdp = CIRCLEQ_NEXT(sdp, swd_next))
  527                         if (sdp->swd_vp == vp) {
  528                                 if (remove) {
  529                                         CIRCLEQ_REMOVE(&spp->spi_swapdev,
  530                                             sdp, swd_next);
  531                                         uvmexp.nswapdev--;
  532                                 }
  533                                 return(sdp);
  534                         }
  535         }
  536         return (NULL);
  537 }
  538 
  539 
  540 /*
  541  * swaplist_trim: scan priority list for empty priority entries and kill
  542  *      them.
  543  *
  544  * => caller must hold both swap_syscall_lock and uvm.swap_data_lock
  545  */
  546 static void
  547 swaplist_trim()
  548 {
  549         struct swappri *spp, *nextspp;
  550 
  551         for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
  552                 nextspp = LIST_NEXT(spp, spi_swappri);
  553                 if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
  554                     (void *)&spp->spi_swapdev)
  555                         continue;
  556                 LIST_REMOVE(spp, spi_swappri);
  557                 free(spp, M_VMSWAP);
  558         }
  559 }
  560 
  561 /*
  562  * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
  563  *
  564  * => caller must hold swap_syscall_lock
  565  * => uvm.swap_data_lock should be unlocked (we may sleep)
  566  */
  567 static void
  568 swapdrum_add(sdp, npages)
  569         struct swapdev *sdp;
  570         int     npages;
  571 {
  572         u_long result;
  573 
  574         if (extent_alloc(swapmap, npages, EX_NOALIGN, 0, EX_NOBOUNDARY,
  575             EX_WAITOK, &result))
  576                 panic("swapdrum_add");
  577 
  578         sdp->swd_drumoffset = result;
  579         sdp->swd_drumsize = npages;
  580 }
  581 
  582 /*
  583  * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
  584  *      to the "swapdev" that maps that section of the drum.
  585  *
  586  * => each swapdev takes one big contig chunk of the drum
  587  * => caller must hold uvm.swap_data_lock
  588  */
  589 static struct swapdev *
  590 swapdrum_getsdp(pgno)
  591         int pgno;
  592 {
  593         struct swapdev *sdp;
  594         struct swappri *spp;
  595         
  596         for (spp = LIST_FIRST(&swap_priority); spp != NULL;
  597              spp = LIST_NEXT(spp, spi_swappri))
  598                 for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
  599                      sdp != (void *)&spp->spi_swapdev;
  600                      sdp = CIRCLEQ_NEXT(sdp, swd_next))
  601                         if (pgno >= sdp->swd_drumoffset &&
  602                             pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
  603                                 return sdp;
  604                         }
  605         return NULL;
  606 }
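
/*
 * A worked example of the drum mapping (hypothetical figures): if a
 * swapdev was added with swd_drumoffset = 1024 and swd_drumsize = 8192,
 * then drum page 1030 belongs to it and corresponds to page 1030 -
 * 1024 = 6 on the device itself.  Block 0 of the drum is never handed
 * out (see the extent created in uvm_swap_init()), so a slot number of
 * 0 can always mean "no allocation".
 */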
  607 
  608 
  609 /*
  610  * sys_swapctl: main entry point for swapctl(2) system call
  611  *      [with two helper functions: swap_on and swap_off]
  612  */
  613 int
  614 sys_swapctl(p, v, retval)
  615         struct proc *p;
  616         void *v;
  617         register_t *retval;
  618 {
  619         struct sys_swapctl_args /* {
  620                 syscallarg(int) cmd;
  621                 syscallarg(void *) arg;
  622                 syscallarg(int) misc;
  623         } */ *uap = (struct sys_swapctl_args *)v;
  624         struct vnode *vp;
  625         struct nameidata nd;
  626         struct swappri *spp;
  627         struct swapdev *sdp;
  628         struct swapent *sep;
  629         char    userpath[MAXPATHLEN];
  630         size_t  len;
  631         int     count, error, misc;
  632         int     priority;
  633         UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);
  634 
  635         misc = SCARG(uap, misc);
  636 
  637         /*
  638          * ensure serialized syscall access by grabbing the swap_syscall_lock
  639          */
  640         rw_enter_write(&swap_syscall_lock);
  641         
  642         /*
  643          * we handle the non-priv NSWAP and STATS request first.
  644          *
  645          * SWAP_NSWAP: return number of config'd swap devices 
  646          * [can also be obtained with uvmexp sysctl]
  647          */
  648         if (SCARG(uap, cmd) == SWAP_NSWAP) {
  649                 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%ld", uvmexp.nswapdev,
  650                     0, 0, 0);
  651                 *retval = uvmexp.nswapdev;
  652                 error = 0;
  653                 goto out;
  654         }
  655 
  656         /*
  657          * SWAP_STATS: get stats on current # of configured swap devs
  658          *
  659          * note that the swap_priority list can't change as long 
  660          * as we are holding the swap_syscall_lock.  we don't want
  661          * to grab the uvm.swap_data_lock because we may fault&sleep during 
  662          * copyout() and we don't want to be holding that lock then!
  663          */
  664         if (SCARG(uap, cmd) == SWAP_STATS
  665 #if defined(COMPAT_13)
  666             || SCARG(uap, cmd) == SWAP_OSTATS
  667 #endif
  668             ) {
  669                 sep = (struct swapent *)SCARG(uap, arg);
  670                 count = 0;
  671 
  672                 for (spp = LIST_FIRST(&swap_priority); spp != NULL;
  673                     spp = LIST_NEXT(spp, spi_swappri)) {
  674                         for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
  675                              sdp != (void *)&spp->spi_swapdev && misc-- > 0;
  676                              sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
  677                                 sdp->swd_inuse = 
  678                                     btodb((u_int64_t)sdp->swd_npginuse <<
  679                                     PAGE_SHIFT);
  680                                 error = copyout(&sdp->swd_se, sep,
  681                                     sizeof(struct swapent));
  682 
  683                                 /* now copy out the path if necessary */
  684 #if defined(COMPAT_13)
  685                                 if (error == 0 && SCARG(uap, cmd) == SWAP_STATS)
  686 #else
  687                                 if (error == 0)
  688 #endif
  689                                         error = copyout(sdp->swd_path,
  690                                             &sep->se_path, sdp->swd_pathlen);
  691 
  692                                 if (error)
  693                                         goto out;
  694                                 count++;
  695 #if defined(COMPAT_13)
  696                                 if (SCARG(uap, cmd) == SWAP_OSTATS)
   697                                         sep = (void *)((struct oswapent *)sep + 1);
  698                                 else
  699 #endif
  700                                         sep++;
  701                         }
  702                 }
  703 
  704                 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
  705 
  706                 *retval = count;
  707                 error = 0;
  708                 goto out;
  709         } 
  710 
  711         /*
  712          * all other requests require superuser privs.   verify.
  713          */
  714         if ((error = suser(p, 0)))
  715                 goto out;
  716 
  717         /*
  718          * at this point we expect a path name in arg.   we will
  719          * use namei() to gain a vnode reference (vref), and lock
  720          * the vnode (VOP_LOCK).
  721          *
  722          * XXX: a NULL arg means use the root vnode pointer (e.g. for
  723          * miniroot)
  724          */
  725         if (SCARG(uap, arg) == NULL) {
  726                 vp = rootvp;            /* miniroot */
  727                 if (vget(vp, LK_EXCLUSIVE, p)) {
  728                         error = EBUSY;
  729                         goto out;
  730                 }
  731                 if (SCARG(uap, cmd) == SWAP_ON &&
  732                     copystr("miniroot", userpath, sizeof userpath, &len))
  733                         panic("swapctl: miniroot copy failed");
  734         } else {
  735                 int     space;
  736                 char    *where;
  737 
  738                 if (SCARG(uap, cmd) == SWAP_ON) {
  739                         if ((error = copyinstr(SCARG(uap, arg), userpath,
  740                             sizeof userpath, &len)))
  741                                 goto out;
  742                         space = UIO_SYSSPACE;
  743                         where = userpath;
  744                 } else {
  745                         space = UIO_USERSPACE;
  746                         where = (char *)SCARG(uap, arg);
  747                 }
  748                 NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, p);
  749                 if ((error = namei(&nd)))
  750                         goto out;
  751                 vp = nd.ni_vp;
  752         }
  753         /* note: "vp" is referenced and locked */
  754 
  755         error = 0;              /* assume no error */
  756         switch(SCARG(uap, cmd)) {
  757 
  758         case SWAP_DUMPDEV:
  759                 if (vp->v_type != VBLK) {
  760                         error = ENOTBLK;
  761                         break;
  762                 }
  763                 dumpdev = vp->v_rdev;
  764                 break;
  765 
  766         case SWAP_CTL:
  767                 /*
  768                  * get new priority, remove old entry (if any) and then
  769                  * reinsert it in the correct place.  finally, prune out
  770                  * any empty priority structures.
  771                  */
  772                 priority = SCARG(uap, misc);
  773                 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
  774                 simple_lock(&uvm.swap_data_lock);
  775                 if ((sdp = swaplist_find(vp, 1)) == NULL) {
  776                         error = ENOENT;
  777                 } else {
  778                         swaplist_insert(sdp, spp, priority);
  779                         swaplist_trim();
  780                 }
  781                 simple_unlock(&uvm.swap_data_lock);
  782                 if (error)
  783                         free(spp, M_VMSWAP);
  784                 break;
  785 
  786         case SWAP_ON:
  787 
  788                 /*
  789                  * check for duplicates.   if none found, then insert a
  790                  * dummy entry on the list to prevent someone else from
  791                  * trying to enable this device while we are working on
  792                  * it.
  793                  */
  794 
  795                 priority = SCARG(uap, misc);
  796                 simple_lock(&uvm.swap_data_lock);
  797                 if ((sdp = swaplist_find(vp, 0)) != NULL) {
  798                         error = EBUSY;
  799                         simple_unlock(&uvm.swap_data_lock);
  800                         break;
  801                 }
  802                 sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
  803                 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
  804                 memset(sdp, 0, sizeof(*sdp));
  805                 sdp->swd_flags = SWF_FAKE;      /* placeholder only */
  806                 sdp->swd_vp = vp;
  807                 sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
  808 
  809                 /*
  810                  * XXX Is NFS elaboration necessary?
  811                  */
  812                 if (vp->v_type == VREG) {
  813                         sdp->swd_cred = crdup(p->p_ucred);
  814                 }
  815 
  816                 swaplist_insert(sdp, spp, priority);
  817                 simple_unlock(&uvm.swap_data_lock);
  818 
  819                 sdp->swd_pathlen = len;
  820                 sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
  821                 if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
  822                         panic("swapctl: copystr");
  823 
  824                 /*
  825                  * we've now got a FAKE placeholder in the swap list.
  826                  * now attempt to enable swap on it.  if we fail, undo
  827                  * what we've done and kill the fake entry we just inserted.
  828                  * if swap_on is a success, it will clear the SWF_FAKE flag
  829                  */
  830 
  831                 if ((error = swap_on(p, sdp)) != 0) {
  832                         simple_lock(&uvm.swap_data_lock);
  833                         (void) swaplist_find(vp, 1);  /* kill fake entry */
  834                         swaplist_trim();
  835                         simple_unlock(&uvm.swap_data_lock);
  836                         if (vp->v_type == VREG) {
  837                                 crfree(sdp->swd_cred);
  838                         }
  839                         free(sdp->swd_path, M_VMSWAP);
  840                         free(sdp, M_VMSWAP);
  841                         break;
  842                 }
  843                 break;
  844 
  845         case SWAP_OFF:
  846                 simple_lock(&uvm.swap_data_lock);
  847                 if ((sdp = swaplist_find(vp, 0)) == NULL) {
  848                         simple_unlock(&uvm.swap_data_lock);
  849                         error = ENXIO;
  850                         break;
  851                 }
  852 
  853                 /*
  854                  * If a device isn't in use or enabled, we
  855                  * can't stop swapping from it (again).
  856                  */
  857                 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
  858                         simple_unlock(&uvm.swap_data_lock);
  859                         error = EBUSY;
  860                         break;
  861                 }
  862 
  863                 /*
  864                  * do the real work.
  865                  */
  866                 error = swap_off(p, sdp);
  867                 break;
  868 
  869         default:
  870                 error = EINVAL;
  871         }
  872 
  873         /*
  874          * done!  release the ref gained by namei() and unlock.
  875          */
  876         vput(vp);
  877 
  878 out:
  879         rw_exit_write(&swap_syscall_lock);
  880 
  881         UVMHIST_LOG(pdhist, "<- done!  error=%ld", error, 0, 0, 0);
  882         return (error);
  883 }
  884 
  885 /*
  886  * swap_on: attempt to enable a swapdev for swapping.   note that the
  887  *      swapdev is already on the global list, but disabled (marked
  888  *      SWF_FAKE).
  889  *
  890  * => we avoid the start of the disk (to protect disk labels)
  891  * => we also avoid the miniroot, if we are swapping to root.
  892  * => caller should leave uvm.swap_data_lock unlocked, we may lock it
  893  *      if needed.
  894  */
  895 static int
  896 swap_on(p, sdp)
  897         struct proc *p;
  898         struct swapdev *sdp;
  899 {
  900         static int count = 0;   /* static */
  901         struct vnode *vp;
  902         int error, npages, nblocks, size;
  903         long addr;
  904         struct vattr va;
  905 #if defined(NFSCLIENT)
  906         extern int (**nfsv2_vnodeop_p)(void *);
  907 #endif /* defined(NFSCLIENT) */
  908         dev_t dev;
  909         UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);
  910 
  911         /*
  912          * we want to enable swapping on sdp.   the swd_vp contains
  913          * the vnode we want (locked and ref'd), and the swd_dev
   914          * contains the dev_t of the file, if it is a block device.
  915          */
  916 
  917         vp = sdp->swd_vp;
  918         dev = sdp->swd_dev;
  919 
  920         /*
  921          * open the swap file (mostly useful for block device files to
  922          * let device driver know what is up).
  923          *
  924          * we skip the open/close for root on swap because the root
  925          * has already been opened when root was mounted (mountroot).
  926          */
  927         if (vp != rootvp) {
  928                 if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
  929                         return (error);
  930         }
  931 
  932         /* XXX this only works for block devices */
  933         UVMHIST_LOG(pdhist, "  dev=%ld, major(dev)=%ld", dev, major(dev), 0,0);
  934 
  935         /*
  936          * we now need to determine the size of the swap area.   for
  937          * block specials we can call the d_psize function.
  938          * for normal files, we must stat [get attrs].
  939          *
   940          * we put the result in nblocks.
  941          * for normal files, we also want the filesystem block size
  942          * (which we get with statfs).
  943          */
  944         switch (vp->v_type) {
  945         case VBLK:
  946                 if (bdevsw[major(dev)].d_psize == 0 ||
  947                     (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
  948                         error = ENXIO;
  949                         goto bad;
  950                 }
  951                 break;
  952 
  953         case VREG:
  954                 if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
  955                         goto bad;
  956                 nblocks = (int)btodb(va.va_size);
  957                 if ((error =
  958                      VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
  959                         goto bad;
  960 
  961                 sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
  962                 /*
  963                  * limit the max # of outstanding I/O requests we issue
  964                  * at any one time.   take it easy on NFS servers.
  965                  */
  966 #if defined(NFSCLIENT)
  967                 if (vp->v_op == nfsv2_vnodeop_p)
  968                         sdp->swd_maxactive = 2; /* XXX */
  969                 else
  970 #endif /* defined(NFSCLIENT) */
  971                         sdp->swd_maxactive = 8; /* XXX */
  972                 break;
  973 
  974         default:
  975                 error = ENXIO;
  976                 goto bad;
  977         }
  978 
  979         /*
  980          * save nblocks in a safe place and convert to pages.
  981          */
  982 
  983         sdp->swd_nblks = nblocks;
  984         npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;
  985 
  986         /*
   987          * for block special files, we want to make sure that we leave
   988          * the disklabel and bootblocks alone, so we arrange to skip
  989          * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
  990          * note that because of this the "size" can be less than the
  991          * actual number of blocks on the device.
  992          */
  993         if (vp->v_type == VBLK) {
   994                 /* we use pages 1 to (npages - 1) [inclusive] */
  995                 size = npages - 1;
  996                 addr = 1;
  997         } else {
   998                 /* we use pages 0 to (npages - 1) [inclusive] */
  999                 size = npages;
 1000                 addr = 0;
 1001         }
 1002 
 1003         /*
 1004          * make sure we have enough blocks for a reasonable sized swap
 1005          * area.   we want at least one page.
 1006          */
 1007 
 1008         if (size < 1) {
  1009                 UVMHIST_LOG(pdhist, "  size < 1!!", 0, 0, 0, 0);
 1010                 error = EINVAL;
 1011                 goto bad;
 1012         }
 1013 
 1014         UVMHIST_LOG(pdhist, "  dev=%lx: size=%ld addr=0x%lx\n",
 1015             dev, size, addr, 0);
 1016 
 1017         /*
 1018          * now we need to allocate an extent to manage this swap device
 1019          */
 1020         snprintf(sdp->swd_exname, sizeof(sdp->swd_exname), "swap0x%04x",
 1021             count++);
 1022 
 1023         /* note that extent_create's 3rd arg is inclusive, thus "- 1" */
 1024         sdp->swd_ex = extent_create(sdp->swd_exname, 0, npages - 1, M_VMSWAP,
 1025                                     0, 0, EX_WAITOK);
 1026         /* allocate the `saved' region from the extent so it won't be used */
 1027         if (addr) {
 1028                 if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
 1029                         panic("disklabel region");
 1030         }
 1031 
 1032         /*
 1033          * if the vnode we are swapping to is the root vnode 
 1034          * (i.e. we are swapping to the miniroot) then we want
 1035          * to make sure we don't overwrite it.   do a statfs to 
 1036          * find its size and skip over it.
 1037          */
 1038         if (vp == rootvp) {
 1039                 struct mount *mp;
 1040                 struct statfs *sp;
 1041                 int rootblocks, rootpages;
 1042 
 1043                 mp = rootvnode->v_mount;
 1044                 sp = &mp->mnt_stat;
 1045                 rootblocks = sp->f_blocks * btodb(sp->f_bsize);
 1046                 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
 1047                 if (rootpages > size)
 1048                         panic("swap_on: miniroot larger than swap?");
 1049 
 1050                 if (extent_alloc_region(sdp->swd_ex, addr, 
 1051                                         rootpages, EX_WAITOK))
 1052                         panic("swap_on: unable to preserve miniroot");
 1053 
 1054                 size -= rootpages;
 1055                 printf("Preserved %d pages of miniroot ", rootpages);
 1056                 printf("leaving %d pages of swap\n", size);
 1057         }
 1058 
 1059         /*
 1060          * add a ref to vp to reflect usage as a swap device.
 1061          */
 1062         vref(vp);
 1063 
 1064 #ifdef UVM_SWAP_ENCRYPT
 1065         if (uvm_doswapencrypt)
 1066                 uvm_swap_initcrypt(sdp, npages);
 1067 #endif
 1068         /*
 1069          * now add the new swapdev to the drum and enable.
 1070          */
 1071         simple_lock(&uvm.swap_data_lock);
 1072         swapdrum_add(sdp, npages);
 1073         sdp->swd_npages = size;
 1074         sdp->swd_flags &= ~SWF_FAKE;    /* going live */
 1075         sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
 1076         uvmexp.swpages += size;
 1077         simple_unlock(&uvm.swap_data_lock);
 1078         return (0);
 1079 
 1080 bad:
 1081         /*
 1082          * failure: close device if necessary and return error.
 1083          */
 1084         if (vp != rootvp)
 1085                 (void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
 1086         return (error);
 1087 }
 1088 
 1089 /*
 1090  * swap_off: stop swapping on swapdev
 1091  *
 1092  * => swap data should be locked, we will unlock.
 1093  */
 1094 static int
 1095 swap_off(p, sdp)
 1096         struct proc *p;
 1097         struct swapdev *sdp;
 1098 {
  1099         int error = 0;
 1100         UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
 1101         UVMHIST_LOG(pdhist, "  dev=%lx", sdp->swd_dev,0,0,0);
 1102 
 1103         /* disable the swap area being removed */
 1104         sdp->swd_flags &= ~SWF_ENABLE;
 1105         simple_unlock(&uvm.swap_data_lock);
 1106 
 1107         /*
 1108          * the idea is to find all the pages that are paged out to this
 1109          * device, and page them all in.  in uvm, swap-backed pageable
 1110          * memory can take two forms: aobjs and anons.  call the
 1111          * swapoff hook for each subsystem to bring in pages.
 1112          */
 1113 
 1114         if (uao_swap_off(sdp->swd_drumoffset,
 1115                          sdp->swd_drumoffset + sdp->swd_drumsize) ||
 1116             amap_swap_off(sdp->swd_drumoffset,
 1117                           sdp->swd_drumoffset + sdp->swd_drumsize)) {
 1118                 
 1119                 error = ENOMEM;
 1120         } else if (sdp->swd_npginuse > sdp->swd_npgbad) {
 1121                 error = EBUSY;
 1122         }
 1123 
 1124         if (error) {
 1125                 simple_lock(&uvm.swap_data_lock);
 1126                 sdp->swd_flags |= SWF_ENABLE;
 1127                 simple_unlock(&uvm.swap_data_lock);
 1128                 return (error);
 1129         }
 1130 
 1131         /*
 1132          * done with the vnode and saved creds.
 1133          * drop our ref on the vnode before calling VOP_CLOSE()
 1134          * so that spec_close() can tell if this is the last close.
 1135          */
 1136         if (sdp->swd_vp->v_type == VREG) {
 1137                 crfree(sdp->swd_cred);
 1138         }
 1139         vrele(sdp->swd_vp);
 1140         if (sdp->swd_vp != rootvp) {
 1141                 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
 1142         }
 1143 
 1144         simple_lock(&uvm.swap_data_lock);
 1145         uvmexp.swpages -= sdp->swd_npages;
 1146 
 1147         if (swaplist_find(sdp->swd_vp, 1) == NULL)
 1148                 panic("swap_off: swapdev not in list");
 1149         swaplist_trim();
 1150 
 1151         /*
 1152          * free all resources!
 1153          */
 1154         extent_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize,
 1155                     EX_WAITOK);
 1156         extent_destroy(sdp->swd_ex);
 1157         free(sdp, M_VMSWAP);
 1158         simple_unlock(&uvm.swap_data_lock);
 1159         return (0);
 1160 }
 1161 
 1162 /*
 1163  * /dev/drum interface and i/o functions
 1164  */
 1165 
 1166 /*
 1167  * swread: the read function for the drum (just a call to physio)
 1168  */
 1169 /*ARGSUSED*/
 1170 int
 1171 swread(dev, uio, ioflag)
 1172         dev_t dev;
 1173         struct uio *uio;
 1174         int ioflag;
 1175 {
 1176         UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);
 1177 
 1178         UVMHIST_LOG(pdhist, "  dev=%lx offset=%lx",
 1179             dev, (u_long)uio->uio_offset, 0, 0);
 1180         return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
 1181 }
 1182 
 1183 /*
 1184  * swwrite: the write function for the drum (just a call to physio)
 1185  */
 1186 /*ARGSUSED*/
 1187 int
 1188 swwrite(dev, uio, ioflag)
 1189         dev_t dev;
 1190         struct uio *uio;
 1191         int ioflag;
 1192 {
 1193         UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);
 1194 
 1195         UVMHIST_LOG(pdhist, "  dev=%lx offset=%lx",
 1196             dev, (u_long)uio->uio_offset, 0, 0);
 1197         return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
 1198 }
 1199 
 1200 /*
 1201  * swstrategy: perform I/O on the drum
 1202  *
 1203  * => we must map the i/o request from the drum to the correct swapdev.
 1204  */
 1205 void
 1206 swstrategy(bp)
 1207         struct buf *bp;
 1208 {
 1209         struct swapdev *sdp;
 1210         int s, pageno, bn;
 1211         UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);
 1212 
 1213         /*
 1214          * convert block number to swapdev.   note that swapdev can't
 1215          * be yanked out from under us because we are holding resources
 1216          * in it (i.e. the blocks we are doing I/O on).
 1217          */
 1218         pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
 1219         simple_lock(&uvm.swap_data_lock);
 1220         sdp = swapdrum_getsdp(pageno);
 1221         simple_unlock(&uvm.swap_data_lock);
 1222         if (sdp == NULL) {
 1223                 bp->b_error = EINVAL;
 1224                 bp->b_flags |= B_ERROR;
 1225                 s = splbio();
 1226                 biodone(bp);
 1227                 splx(s);
 1228                 UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
 1229                 return;
 1230         }
 1231 
 1232         /*
 1233          * convert drum page number to block number on this swapdev.
 1234          */
 1235 
 1236         pageno -= sdp->swd_drumoffset;  /* page # on swapdev */
 1237         bn = btodb((u_int64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
 1238 
 1239         UVMHIST_LOG(pdhist, "  %s: mapoff=%lx bn=0x%lx bcount=%ld",
 1240                 ((bp->b_flags & B_READ) == 0) ? "write" : "read",
 1241                 sdp->swd_drumoffset, bn, bp->b_bcount);
 1242 
 1243         /*
 1244          * for block devices we finish up here.
 1245          * for regular files we have to do more work which we delegate
 1246          * to sw_reg_strategy().
 1247          */
 1248 
 1249         switch (sdp->swd_vp->v_type) {
 1250         default:
 1251                 panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);
 1252 
 1253         case VBLK:
 1254 
 1255                 /*
 1256                  * must convert "bp" from an I/O on /dev/drum to an I/O
 1257                  * on the swapdev (sdp).
 1258                  */
 1259                 s = splbio();
 1260                 buf_replacevnode(bp, sdp->swd_vp);
 1261 
 1262                 bp->b_blkno = bn;
 1263                 splx(s);
 1264                 VOP_STRATEGY(bp);
 1265                 return;
 1266 
 1267         case VREG:
 1268                 /*
 1269                  * delegate to sw_reg_strategy function.
 1270                  */
 1271                 sw_reg_strategy(sdp, bp, bn);
 1272                 return;
 1273         }
 1274         /* NOTREACHED */
 1275 }
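
/*
 * Concretely (assuming DEV_BSIZE = 512 and 4KB pages, and the
 * hypothetical swapdev from the swapdrum_getsdp() example above): a
 * request at drum block b_blkno = 8240 is drum page 8240 / 8 = 1030,
 * which is page 1030 - 1024 = 6 of that swapdev, and therefore disk
 * block 6 * 8 = 48 on the underlying vnode.
 */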
 1276 
 1277 /*
 1278  * sw_reg_strategy: handle swap i/o to regular files
 1279  */
 1280 static void
 1281 sw_reg_strategy(sdp, bp, bn)
 1282         struct swapdev  *sdp;
 1283         struct buf      *bp;
 1284         int             bn;
 1285 {
 1286         struct vnode    *vp;
 1287         struct vndxfer  *vnx;
 1288         daddr64_t       nbn;
 1289         caddr_t         addr;
 1290         off_t           byteoff;
 1291         int             s, off, nra, error, sz, resid;
 1292         UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);
 1293 
 1294         /*
 1295          * allocate a vndxfer head for this transfer and point it to
 1296          * our buffer.
 1297          */
 1298         getvndxfer(vnx);
 1299         vnx->vx_flags = VX_BUSY;
 1300         vnx->vx_error = 0;
 1301         vnx->vx_pending = 0;
 1302         vnx->vx_bp = bp;
 1303         vnx->vx_sdp = sdp;
 1304 
 1305         /*
 1306          * setup for main loop where we read filesystem blocks into
 1307          * our buffer.
 1308          */
 1309         error = 0;
 1310         bp->b_resid = bp->b_bcount;     /* nothing transferred yet! */
 1311         addr = bp->b_data;              /* current position in buffer */
 1312         byteoff = dbtob((u_int64_t)bn);
 1313 
 1314         for (resid = bp->b_resid; resid; resid -= sz) {
 1315                 struct vndbuf   *nbp;
 1316 
 1317                 /*
 1318                  * translate byteoffset into block number.  return values:
 1319                  *   vp = vnode of underlying device
 1320                  *  nbn = new block number (on underlying vnode dev)
 1321                  *  nra = num blocks we can read-ahead (excludes requested
 1322                  *      block)
 1323                  */
 1324                 nra = 0;
 1325                 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
 1326                                         &vp, &nbn, &nra);
 1327 
 1328                 if (error == 0 && nbn == (daddr64_t)-1) {
 1329                         /* 
 1330                          * this used to just set error, but that doesn't
 1331                          * do the right thing.  Instead, it causes random
 1332                          * memory errors.  The panic() should remain until
 1333                          * this condition doesn't destabilize the system.
 1334                          */
 1335 #if 1
 1336                         panic("sw_reg_strategy: swap to sparse file");
 1337 #else
 1338                         error = EIO;    /* failure */
 1339 #endif
 1340                 }
 1341 
 1342                 /*
 1343                  * punt if there was an error or a hole in the file.
 1344                  * we must wait for any i/o ops we have already started
 1345                  * to finish before returning.
 1346                  *
 1347                  * XXX we could deal with holes here but it would be
 1348                  * a hassle (in the write case).
 1349                  */
 1350                 if (error) {
 1351                         s = splbio();
 1352                         vnx->vx_error = error;  /* pass error up */
 1353                         goto out;
 1354                 }
 1355 
 1356                 /*
 1357                  * compute the size ("sz") of this transfer (in bytes).
 1358                  */
 1359                 off = byteoff % sdp->swd_bsize;
 1360                 sz = (1 + nra) * sdp->swd_bsize - off;
 1361                 if (sz > resid)
 1362                         sz = resid;
 1363 
 1364                 UVMHIST_LOG(pdhist, "sw_reg_strategy: "
 1365                             "vp %p/%p offset 0x%lx/0x%llx",
 1366                             sdp->swd_vp, vp, (u_long)byteoff, nbn);
 1367 
 1368                 /*
 1369                  * now get a buf structure.   note that the vb_buf is
 1370                  * at the front of the nbp structure so that you can
  1371                  * cast pointers between the two structures easily.
 1372                  */
 1373                 getvndbuf(nbp);
 1374                 nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
 1375                 nbp->vb_buf.b_bcount   = sz;
 1376                 nbp->vb_buf.b_bufsize  = sz;
 1377                 nbp->vb_buf.b_error    = 0;
 1378                 nbp->vb_buf.b_data     = addr;
 1379                 nbp->vb_buf.b_blkno    = nbn + btodb(off);
 1380                 nbp->vb_buf.b_proc     = bp->b_proc;
 1381                 nbp->vb_buf.b_iodone   = sw_reg_iodone;
 1382                 nbp->vb_buf.b_vp       = NULLVP;
 1383                 nbp->vb_buf.b_vnbufs.le_next = NOLIST;
 1384                 LIST_INIT(&nbp->vb_buf.b_dep);
 1385 
 1386                 /* 
 1387                  * set b_dirtyoff/end and b_validoff/end.   this is
 1388                  * required by the NFS client code (otherwise it will
 1389                  * just discard our I/O request).
 1390                  */
 1391                 if (bp->b_dirtyend == 0) {
 1392                         nbp->vb_buf.b_dirtyoff = 0;
 1393                         nbp->vb_buf.b_dirtyend = sz;
 1394                 } else {
 1395                         nbp->vb_buf.b_dirtyoff =
 1396                             max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
 1397                         nbp->vb_buf.b_dirtyend =
 1398                             min(sz,
 1399                                 max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
 1400                 }
 1401                 if (bp->b_validend == 0) {
 1402                         nbp->vb_buf.b_validoff = 0;
 1403                         nbp->vb_buf.b_validend = sz;
 1404                 } else {
 1405                         nbp->vb_buf.b_validoff =
 1406                             max(0, bp->b_validoff - (bp->b_bcount-resid));
 1407                         nbp->vb_buf.b_validend =
 1408                             min(sz,
 1409                                 max(0, bp->b_validend - (bp->b_bcount-resid)));
 1410                 }
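
                      /*
                       * worked example with hypothetical numbers: if
                       * bp->b_bcount == 24K and resid == 16K, then 8K of the
                       * parent has already been issued and this child covers
                       * parent bytes [8K, 8K + sz).  With sz == 8K,
                       * bp->b_dirtyoff == 10K and bp->b_dirtyend == 14K:
                       *
                       *	b_dirtyoff = max(0, 10K - 8K)          = 2K
                       *	b_dirtyend = min(8K, max(0, 14K - 8K)) = 6K
                       *
                       * i.e. the parent's dirty window clipped into this child.
                       */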
 1411 
 1412                 nbp->vb_xfer = vnx;     /* patch it back in to vnx */
 1413 
 1414                 /*
 1415                  * just sort by block number (disksort keys on b_cylinder)
 1416                  */
 1417                 nbp->vb_buf.b_cylinder = nbp->vb_buf.b_blkno;
 1418                 s = splbio();
 1419                 if (vnx->vx_error != 0) {
 1420                         putvndbuf(nbp);
 1421                         goto out;
 1422                 }
 1423                 vnx->vx_pending++;
 1424 
 1425                 /* assoc new buffer with underlying vnode */
 1426                 bgetvp(vp, &nbp->vb_buf);
 1427 
 1428                 /* sort it in and start I/O if we are not over our limit */
 1429                 disksort(&sdp->swd_tab, &nbp->vb_buf);
 1430                 sw_reg_start(sdp);
 1431                 splx(s);
 1432 
 1433                 /*
 1434                  * advance to the next I/O
 1435                  */
 1436                 byteoff += sz;
 1437                 addr += sz;
 1438         }
 1439 
 1440         s = splbio();
 1441 
 1442 out: /* Arrive here at splbio */
 1443         vnx->vx_flags &= ~VX_BUSY;
 1444         if (vnx->vx_pending == 0) {
 1445                 if (vnx->vx_error != 0) {
 1446                         bp->b_error = vnx->vx_error;
 1447                         bp->b_flags |= B_ERROR;
 1448                 }
 1449                 putvndxfer(vnx);
 1450                 biodone(bp);
 1451         }
 1452         splx(s);
 1453 }
 1454 
 1455 /*
 1456  * sw_reg_start: start an I/O request on the requested swapdev
 1457  *
 1458  * => reqs are sorted by disksort (above)
 1459  */
 1460 static void
 1461 sw_reg_start(sdp)
 1462         struct swapdev  *sdp;
 1463 {
 1464         struct buf      *bp;
 1465         UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);
 1466 
 1467         /* recursion control */
 1468         if ((sdp->swd_flags & SWF_BUSY) != 0)
 1469                 return;
 1470 
 1471         sdp->swd_flags |= SWF_BUSY;
 1472 
 1473         while (sdp->swd_tab.b_active < sdp->swd_maxactive) {
 1474                 bp = sdp->swd_tab.b_actf;
 1475                 if (bp == NULL)
 1476                         break;
 1477                 sdp->swd_tab.b_actf = bp->b_actf;
 1478                 sdp->swd_tab.b_active++;
 1479 
 1480                 UVMHIST_LOG(pdhist,
 1481                     "sw_reg_start:  bp %p vp %p blkno 0x%lx cnt 0x%lx",
 1482                     bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
 1483                 if ((bp->b_flags & B_READ) == 0)
 1484                         bp->b_vp->v_numoutput++;
 1485 
 1486                 VOP_STRATEGY(bp);
 1487         }
 1488         sdp->swd_flags &= ~SWF_BUSY;
 1489 }
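
      /*
       * The SWF_BUSY flag above makes the drain loop non-reentrant:
       * VOP_STRATEGY() may complete synchronously and come back through
       * sw_reg_iodone() -> sw_reg_start().  The same pattern in isolation,
       * as a sketch with purely hypothetical names (q, req, dequeue, start):
       *
       *	if (q->busy)
       *		return;
       *	q->busy = 1;
       *	while (q->active < q->maxactive && (req = dequeue(q)) != NULL) {
       *		q->active++;
       *		start(req);
       *	}
       *	q->busy = 0;
       */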
 1490 
 1491 /*
 1492  * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 1493  *
 1494  * => note that we can recover the vndbuf struct by casting the buf ptr
 1495  */
 1496 static void
 1497 sw_reg_iodone(bp)
 1498         struct buf *bp;
 1499 {
 1500         struct vndbuf *vbp = (struct vndbuf *) bp;
 1501         struct vndxfer *vnx = vbp->vb_xfer;
 1502         struct buf *pbp = vnx->vx_bp;           /* parent buffer */
 1503         struct swapdev  *sdp = vnx->vx_sdp;
 1504         int resid;
 1505         UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);
 1506 
 1507         UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=0x%lx addr=%p",
 1508             vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
 1509         UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
 1510             vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
 1511 
 1512         splassert(IPL_BIO);
 1513 
 1514         resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
 1515         pbp->b_resid -= resid;
 1516         vnx->vx_pending--;
 1517 
 1518         if (vbp->vb_buf.b_error) {
 1519                 UVMHIST_LOG(pdhist, "  got error=%ld !",
 1520                     vbp->vb_buf.b_error, 0, 0, 0);
 1521 
 1522                 /* pass error upward */
 1523                 vnx->vx_error = vbp->vb_buf.b_error;
 1524         }
 1525 
 1526         /*
 1527          * disassociate this buffer from the vnode (if any).
 1528          */
 1529         if (vbp->vb_buf.b_vp != NULL) {
 1530                 brelvp(&vbp->vb_buf);
 1531         }
 1532 
 1533         /*
 1534          * kill vbp structure
 1535          */
 1536         putvndbuf(vbp);
 1537 
 1538         /*
 1539          * wrap up this transaction if it has run to completion or, in
 1540          * case of an error, when all auxiliary buffers have returned.
 1541          */
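              /*
               * note: the VX_BUSY checks below keep us from finishing the
               * parent buf while sw_reg_strategy() is still splitting it
               * up; in that case the originator clears VX_BUSY and does
               * the wrap-up itself at its "out" label.
               */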
 1542         if (vnx->vx_error != 0) {
 1543                 /* pass error upward */
 1544                 pbp->b_flags |= B_ERROR;
 1545                 pbp->b_error = vnx->vx_error;
 1546                 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
 1547                         putvndxfer(vnx);
 1548                         biodone(pbp);
 1549                 }
 1550         } else if (pbp->b_resid == 0) {
 1551                 KASSERT(vnx->vx_pending == 0);
 1552                 if ((vnx->vx_flags & VX_BUSY) == 0) {
 1553                         UVMHIST_LOG(pdhist, "  iodone: pbp=%p error=%ld",
 1554                             pbp, vnx->vx_error, 0, 0);
 1555                         putvndxfer(vnx);
 1556                         biodone(pbp);
 1557                 }
 1558         }
 1559 
 1560         /*
 1561          * done!   start next swapdev I/O if one is pending
 1562          */
 1563         sdp->swd_tab.b_active--;
 1564         sw_reg_start(sdp);
 1565 }
 1566 
 1567 
 1568 /*
 1569  * uvm_swap_alloc: allocate space on swap
 1570  *
 1571  * => allocation is done "round robin" down the priority list; as we
 1572  *      allocate within a priority we "rotate" the circular queue.
 1573  * => space can be freed with uvm_swap_free
 1574  * => we return the page slot number in /dev/drum (0 == invalid slot)
 1575  * => we lock uvm.swap_data_lock
 1576  * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 1577  */
 1578 int
 1579 uvm_swap_alloc(nslots, lessok)
 1580         int *nslots;    /* IN/OUT */
 1581         boolean_t lessok;
 1582 {
 1583         struct swapdev *sdp;
 1584         struct swappri *spp;
 1585         u_long  result;
 1586         UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);
 1587 
 1588         /*
 1589          * no swap devices configured yet?   definite failure.
 1590          */
 1591         if (uvmexp.nswapdev < 1)
 1592                 return 0;
 1593         
 1594         /*
 1595          * lock data lock, convert slots into blocks, and enter loop
 1596          */
 1597         simple_lock(&uvm.swap_data_lock);
 1598 
 1599 ReTry:  /* XXXMRG */
 1600         for (spp = LIST_FIRST(&swap_priority); spp != NULL;
 1601              spp = LIST_NEXT(spp, spi_swappri)) {
 1602                 for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
 1603                      sdp != (void *)&spp->spi_swapdev;
 1604                      sdp = CIRCLEQ_NEXT(sdp,swd_next)) {
 1605                         /* if it's not enabled, then we can't swap from it */
 1606                         if ((sdp->swd_flags & SWF_ENABLE) == 0)
 1607                                 continue;
 1608                         if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
 1609                                 continue;
 1610                         if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN, 0,
 1611                                          EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
 1612                                          &result) != 0) {
 1613                                 continue;
 1614                         }
 1615 
 1616                         /*
 1617                          * successful allocation!  now rotate the circleq.
 1618                          */
 1619                         CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
 1620                         CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
 1621                         sdp->swd_npginuse += *nslots;
 1622                         uvmexp.swpginuse += *nslots;
 1623                         simple_unlock(&uvm.swap_data_lock);
 1624                         /* done!  return drum slot number */
 1625                         UVMHIST_LOG(pdhist,
 1626                             "success!  returning %ld slots starting at %ld",
 1627                             *nslots, result + sdp->swd_drumoffset, 0, 0);
 1628                         return(result + sdp->swd_drumoffset);
 1629                 }
 1630         }
 1631 
 1632         /* XXXMRG: BEGIN HACK */
 1633         if (*nslots > 1 && lessok) {
 1634                 *nslots = 1;
 1635                 goto ReTry;     /* XXXMRG: ugh!  extent should support this for us */
 1636         }
 1637         /* XXXMRG: END HACK */
 1638 
 1639         simple_unlock(&uvm.swap_data_lock);
 1640         return 0;               /* failed */
 1641 }
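
      /*
       * rotation example: with three enabled devices A, B and C at the
       * same priority, allocations that succeed on the head device visit
       * them as (A B C) -> (B C A) -> (C A B), spreading pageouts across
       * devices of equal priority.
       */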
 1642 
 1643 /*
 1644  * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 1645  *
 1646  * => we lock uvm.swap_data_lock
 1647  */
 1648 void
 1649 uvm_swap_markbad(startslot, nslots)
 1650         int startslot;
 1651         int nslots;
 1652 {
 1653         struct swapdev *sdp;
 1654         UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);
 1655 
 1656         simple_lock(&uvm.swap_data_lock);
 1657         sdp = swapdrum_getsdp(startslot);
 1658         if (sdp != NULL) {
 1659                 /*
 1660                  * we just keep track of how many pages have been marked bad
 1661                  * in this device, to make everything add up in swap_off().
 1662                  * we assume here that the range of slots will all be within
 1663                  * one swap device.
 1664                  */
 1665                 sdp->swd_npgbad += nslots;
 1666                 UVMHIST_LOG(pdhist, "now %ld bad", sdp->swd_npgbad, 0,0,0);
 1667         }
 1668         simple_unlock(&uvm.swap_data_lock);
 1669 }
 1670 
 1671 /*
 1672  * uvm_swap_free: free swap slots
 1673  *
 1674  * => this can be all or part of an allocation made by uvm_swap_alloc
 1675  * => we lock uvm.swap_data_lock
 1676  */
 1677 void
 1678 uvm_swap_free(startslot, nslots)
 1679         int startslot;
 1680         int nslots;
 1681 {
 1682         struct swapdev *sdp;
 1683         UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);
 1684 
 1685         UVMHIST_LOG(pdhist, "freeing %ld slots starting at %ld", nslots,
 1686             startslot, 0, 0);
 1687 
 1688         /*
 1689          * ignore attempts to free the "bad" slot.
 1690          */
 1691 
 1692         if (startslot == SWSLOT_BAD) {
 1693                 return;
 1694         }
 1695 
 1696         /*
 1697          * convert drum slot offset back to sdp, free the blocks 
 1698          * in the extent, and return.   must hold pri lock to do 
 1699          * lookup and access the extent.
 1700          */
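              /*
               * worked example with hypothetical numbers: if this device's
               * pages occupy drum slots [1024, 2048), swd_drumoffset is
               * 1024 and freeing drum slot 1030 releases block 6 of this
               * device's extent.
               */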
 1701 
 1702         simple_lock(&uvm.swap_data_lock);
 1703         sdp = swapdrum_getsdp(startslot);
 1704         KASSERT(uvmexp.nswapdev >= 1);
 1705         KASSERT(sdp != NULL);
 1706         KASSERT(sdp->swd_npginuse >= nslots);
 1707         if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
 1708                         EX_MALLOCOK|EX_NOWAIT) != 0) {
 1709                 printf("warning: resource shortage: %d pages of swap lost\n",
 1710                         nslots);
 1711         }
 1712 
 1713         sdp->swd_npginuse -= nslots;
 1714         uvmexp.swpginuse -= nslots;
 1715 #ifdef UVM_SWAP_ENCRYPT
 1716         {
 1717                 int i;
 1718                 if (swap_encrypt_initialized) {
 1719                         /* Dereference keys */
 1720                         for (i = 0; i < nslots; i++)
 1721                                 if (uvm_swap_needdecrypt(sdp, startslot + i))
 1722                                         SWAP_KEY_PUT(sdp, SWD_KEY(sdp, startslot + i));
 1723 
 1724                         /* Mark range as not decrypt */
 1725                         uvm_swap_markdecrypt(sdp, startslot, nslots, 0);
 1726                 }
 1727         }
 1728 #endif /* UVM_SWAP_ENCRYPT */
 1729         simple_unlock(&uvm.swap_data_lock);
 1730 }
 1731 
 1732 /*
 1733  * uvm_swap_put: put any number of pages into a contig place on swap
 1734  *
 1735  * => can be sync or async
 1736  * => XXXMRG: consider making it an inline or macro
 1737  */
 1738 int
 1739 uvm_swap_put(swslot, ppsp, npages, flags)
 1740         int swslot;
 1741         struct vm_page **ppsp;
 1742         int     npages;
 1743         int     flags;
 1744 {
 1745         int     result;
 1746 
 1747         result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
 1748             ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
 1749 
 1750         return (result);
 1751 }
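
      /*
       * the XXXMRG note above suggests an inline or macro; a minimal,
       * untested sketch of the macro form with the same semantics:
       *
       *	#define uvm_swap_put(swslot, ppsp, npages, flags)	\
       *		uvm_swap_io((ppsp), (swslot), (npages),		\
       *		    B_WRITE | (((flags) & PGO_SYNCIO) ? 0 : B_ASYNC))
       */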
 1752 
 1753 /*
 1754  * uvm_swap_get: get a single page from swap
 1755  *
 1756  * => usually a sync op (from fault)
 1757  * => XXXMRG: consider making it an inline or macro
 1758  */
 1759 int
 1760 uvm_swap_get(page, swslot, flags)
 1761         struct vm_page *page;
 1762         int swslot, flags;
 1763 {
 1764         int     result;
 1765 
 1766         uvmexp.nswget++;
 1767         KASSERT(flags & PGO_SYNCIO);
 1768         if (swslot == SWSLOT_BAD) {
 1769                 return VM_PAGER_ERROR;
 1770         }
 1771 
 1772         /*
 1773          * this page is (about to be) no longer only in swap.
 1774          */
 1775         simple_lock(&uvm.swap_data_lock);
 1776         uvmexp.swpgonly--;
 1777         simple_unlock(&uvm.swap_data_lock);
 1778 
 1779         result = uvm_swap_io(&page, swslot, 1, B_READ | 
 1780             ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
 1781 
 1782         if (result != VM_PAGER_OK && result != VM_PAGER_PEND) {
 1783                 /*
 1784                  * oops, the read failed so it really is still only in swap.
 1785                  */
 1786                 simple_lock(&uvm.swap_data_lock);
 1787                 uvmexp.swpgonly++;
 1788                 simple_unlock(&uvm.swap_data_lock);
 1789         }
 1790 
 1791         return (result);
 1792 }
 1793 
 1794 /*
 1795  * uvm_swap_io: do an i/o operation to swap
 1796  */
 1797 
 1798 static int
 1799 uvm_swap_io(pps, startslot, npages, flags)
 1800         struct vm_page **pps;
 1801         int startslot, npages, flags;
 1802 {
 1803         daddr64_t startblk;
 1804         struct  buf *bp;
 1805         vaddr_t kva;
 1806         int     result, s, mapinflags, pflag;
 1807         boolean_t write, async;
 1808 #ifdef UVM_SWAP_ENCRYPT
 1809         vaddr_t dstkva;
 1810         struct vm_page *tpps[MAXBSIZE >> PAGE_SHIFT];
 1811         struct swapdev *sdp;
 1812         int     encrypt = 0;
 1813 #endif
 1814         UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);
 1815 
 1816         UVMHIST_LOG(pdhist, "<- called, startslot=%ld, npages=%ld, flags=%ld",
 1817             startslot, npages, flags, 0);
 1818 
 1819         write = (flags & B_READ) == 0;
 1820         async = (flags & B_ASYNC) != 0;
 1821 
 1822         /*
 1823          * convert starting drum slot to block number
 1824          */
 1825         startblk = btodb((u_int64_t)startslot << PAGE_SHIFT);
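
              /*
               * worked example: assuming 4K pages (PAGE_SHIFT == 12) and
               * 512-byte disk blocks, drum slot 3 maps to byte offset
               * 3 << 12 == 12288, so startblk == btodb(12288) == 24.
               */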
 1826 
 1827         /*
 1828          * first, map the pages into the kernel (XXX: currently required
 1829          * by buffer system).
 1830          */
 1831         mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
 1832         if (!async)
 1833                 mapinflags |= UVMPAGER_MAPIN_WAITOK;
 1834         kva = uvm_pagermapin(pps, npages, mapinflags);
 1835         if (kva == 0)
 1836                 return (VM_PAGER_AGAIN);
 1837 
 1838 #ifdef UVM_SWAP_ENCRYPT
 1839         if (write) {
 1840                 /*
 1841                  * Check if we need to do swap encryption on old pages.
 1842                  * Later we will need a different scheme, where swap
 1843                  * encrypts all pages of a process that had at least one
 1844                  * page swap encrypted.  Then we might not need to copy
 1845                  * all pages in the cluster, and could avoid the memory
 1846                  * overhead of swapping.
 1847                  */
 1848                 if (uvm_doswapencrypt)
 1849                         encrypt = 1;
 1850         }
 1851 
 1852         if (swap_encrypt_initialized || encrypt) {
 1853                 /*
 1854                  * we need to know the swap device that we are swapping to/from
 1855                  * to see if the pages need to be marked for decryption or
 1856                  * actually need to be decrypted.
 1857                  * XXX - does this information stay the same over the whole 
 1858                  * execution of this function?
 1859                  */
 1860                 simple_lock(&uvm.swap_data_lock);
 1861                 sdp = swapdrum_getsdp(startslot);
 1862                 simple_unlock(&uvm.swap_data_lock);
 1863         }
 1864 
 1865         /* 
 1866          * encrypt to swap
 1867          */
 1868         if (write && encrypt) {
 1869                 int i, opages;
 1870                 caddr_t src, dst;
 1871                 struct swap_key *key;
 1872                 u_int64_t block;
 1873                 int swmapflags;
 1874 
 1875                 /* We always need write access. */
 1876                 swmapflags = UVMPAGER_MAPIN_READ;
 1877                 if (!async)
 1878                         swmapflags |= UVMPAGER_MAPIN_WAITOK;
 1879 
 1880                 if (!uvm_swap_allocpages(tpps, npages)) {
 1881                         uvm_pagermapout(kva, npages);
 1882                         return (VM_PAGER_AGAIN);
 1883                 }
 1884                 
 1885                 dstkva = uvm_pagermapin(tpps, npages, swmapflags);
 1886                 if (dstkva == 0) {
 1887                         uvm_pagermapout(kva, npages);
 1888                         uvm_swap_freepages(tpps, npages);
 1889                         return (VM_PAGER_AGAIN);
 1890                 }
 1891 
 1892                 src = (caddr_t) kva;
 1893                 dst = (caddr_t) dstkva;
 1894                 block = startblk;
 1895                 for (i = 0; i < npages; i++) {
 1896                         key = SWD_KEY(sdp, startslot + i);
 1897                         SWAP_KEY_GET(sdp, key); /* add reference */
 1898 
 1899                         /* mark for async writes */
 1900                         atomic_setbits_int(&tpps[i]->pg_flags, PQ_ENCRYPT);
 1901                         swap_encrypt(key, src, dst, block, 1 << PAGE_SHIFT);
 1902                         src += 1 << PAGE_SHIFT;
 1903                         dst += 1 << PAGE_SHIFT;
 1904                         block += btodb(1 << PAGE_SHIFT);
 1905                 }
 1906 
 1907                 uvm_pagermapout(kva, npages);
 1908 
 1909                 /* dispose of pages we don't use anymore */
 1910                 opages = npages;
 1911                 uvm_pager_dropcluster(NULL, NULL, pps, &opages, 
 1912                                       PGO_PDFREECLUST);
 1913 
 1914                 kva = dstkva;
 1915         }
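
              /*
               * net effect of the block above: the caller's plaintext
               * pages have been replaced by a freshly allocated shadow
               * cluster (tpps) holding the ciphertext, and the plaintext
               * pages went back to the page queues via
               * uvm_pager_dropcluster().  The shadow pages are freed once
               * the write is done: below for the sync case, or via the
               * async completion path guided by the PQ_ENCRYPT flag set
               * above.
               */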
 1916 #endif /* UVM_SWAP_ENCRYPT */
 1917 
 1918         /* 
 1919          * now allocate a buf for the i/o.
 1920          * [make sure we don't put the pagedaemon to sleep...]
 1921          */
 1922         s = splbio();
 1923         pflag = (async || curproc == uvm.pagedaemon_proc) ? 0 : PR_WAITOK;
 1924         bp = pool_get(&bufpool, pflag);
 1925         splx(s);
 1926 
 1927         /*
 1928          * if we failed to get a swapbuf, return "try again"
 1929          */
 1930         if (bp == NULL) {
 1931 #ifdef UVM_SWAP_ENCRYPT
 1932                 if (write && encrypt) {
 1933                         int i;
 1934 
 1935                         /* swap encrypt needs cleanup */
 1936                         for (i = 0; i < npages; i++)
 1937                                 SWAP_KEY_PUT(sdp, SWD_KEY(sdp, startslot + i));
 1938 
 1939                         uvm_pagermapout(kva, npages);
 1940                         uvm_swap_freepages(tpps, npages);
 1941                 }
 1942 #endif
 1943                 return (VM_PAGER_AGAIN);
 1944         }
 1945         
 1946 #ifdef UVM_SWAP_ENCRYPT
 1947         /* 
 1948          * prevent ASYNC reads.
 1949          * for reads, uvm_swap_io is only called from uvm_swap_get,
 1950          * which assumes that all gets are SYNCIO.  Just make sure here.
 1951          * XXXARTUBC - might not be true anymore.
 1952          */
 1953         if (!write) {
 1954                 flags &= ~B_ASYNC;
 1955                 async = 0;
 1956         }
 1957 #endif
 1958         /*
 1959          * fill in the bp.   we currently route our i/o through
 1960          * /dev/drum's vnode [swapdev_vp].
 1961          */
 1962         bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
 1963         bp->b_proc = &proc0;    /* XXX */
 1964         bp->b_vnbufs.le_next = NOLIST;
 1965         bp->b_data = (caddr_t)kva;
 1966         bp->b_blkno = startblk;
 1967         LIST_INIT(&bp->b_dep);
 1968         s = splbio();
 1969         bp->b_vp = NULL;
 1970         buf_replacevnode(bp, swapdev_vp);
 1971         splx(s);
 1972         bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
 1973 
 1974         /* 
 1975          * for pageouts we must set "dirtyoff" [NFS client code needs it],
 1976          * and we bump v_numoutput (counter of number of active outputs).
 1977          */
 1978         if (write) {
 1979                 bp->b_dirtyoff = 0;
 1980                 bp->b_dirtyend = npages << PAGE_SHIFT;
 1981 #ifdef UVM_SWAP_ENCRYPT
 1982                 /* mark the pages in the drum for decryption */
 1983                 if (swap_encrypt_initialized)
 1984                         uvm_swap_markdecrypt(sdp, startslot, npages, encrypt);
 1985 #endif
 1986                 s = splbio();
 1987                 swapdev_vp->v_numoutput++;
 1988                 splx(s);
 1989         }
 1990 
 1991         /*
 1992          * for async ops we must set up the iodone handler.
 1993          */
 1994         if (async) {
 1995                 bp->b_flags |= B_CALL | (curproc == uvm.pagedaemon_proc ?
 1996                                          B_PDAEMON : 0);
 1997                 bp->b_iodone = uvm_aio_biodone;
 1998                 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
 1999         }
 2000         UVMHIST_LOG(pdhist,
 2001             "about to start io: data = %p blkno = 0x%lx, bcount = %ld",
 2002             bp->b_data, bp->b_blkno, bp->b_bcount, 0);
 2003 
 2004         /*
 2005          * now we start the I/O, and if async, return.
 2006          */
 2007         VOP_STRATEGY(bp);
 2008         if (async)
 2009                 return (VM_PAGER_PEND);
 2010 
 2011         /*
 2012          * must be sync i/o.   wait for it to finish
 2013          */
 2014         (void) biowait(bp);
 2015         result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;
 2016 
 2017 #ifdef UVM_SWAP_ENCRYPT
 2018         /* 
 2019          * decrypt swap
 2020          */
 2021         if (swap_encrypt_initialized &&
 2022             (bp->b_flags & B_READ) && !(bp->b_flags & B_ERROR)) {
 2023                 int i;
 2024                 caddr_t data = bp->b_data;
 2025                 u_int64_t block = startblk;
 2026                 struct swap_key *key = NULL;
 2027 
 2028                 for (i = 0; i < npages; i++) {
 2029                         /* Check if we need to decrypt */
 2030                         if (uvm_swap_needdecrypt(sdp, startslot + i)) {
 2031                                 key = SWD_KEY(sdp, startslot + i);
 2032                                 swap_decrypt(key, data, data, block,
 2033                                              1 << PAGE_SHIFT);
 2034                         }
 2035                         data += 1 << PAGE_SHIFT;
 2036                         block += btodb(1 << PAGE_SHIFT);
 2037                 }
 2038         }
 2039 #endif
 2040         /*
 2041          * kill the pager mapping
 2042          */
 2043         uvm_pagermapout(kva, npages);
 2044 
 2045 #ifdef UVM_SWAP_ENCRYPT
 2046         /*
 2047          * the shadow pages used for encryption are no longer needed; free them
 2048          */
 2049         if ((bp->b_flags & B_READ) == 0 && encrypt)
 2050                 uvm_swap_freepages(tpps, npages);
 2051 #endif
 2052         /*
 2053          * now dispose of the buf
 2054          */
 2055         s = splbio();
 2056         if (bp->b_vp)
 2057                 brelvp(bp);
 2058 
 2059         if (write && bp->b_vp)
 2060                 vwakeup(bp->b_vp);
 2061         pool_put(&bufpool, bp);
 2062         splx(s);
 2063 
 2064         /*
 2065          * finally return.
 2066          */
 2067         UVMHIST_LOG(pdhist, "<- done (sync)  result=%ld", result, 0, 0, 0);
 2068         return (result);
 2069 }
 2070 
 2071 static void
 2072 swapmount()
 2073 {
 2074         struct swapdev *sdp;
 2075         struct swappri *spp;
 2076         struct vnode *vp;
 2077         dev_t swap_dev = swdevt[0].sw_dev;
 2078 
 2079         /*
 2080          * No locking here since we happen to know that we will just be called
 2081          * once before any other process has forked.
 2082          */
 2083 
 2084         if (swap_dev == NODEV) {
 2085                 printf("swapmount: no device\n");
 2086                 return;
 2087         }
 2088 
 2089         if (bdevvp(swap_dev, &vp)) {
 2090                 printf("swapmount: no device 2\n");
 2091                 return;
 2092         }
 2093 
 2094         sdp = malloc(sizeof(*sdp), M_VMSWAP, M_WAITOK);
 2095         spp = malloc(sizeof(*spp), M_VMSWAP, M_WAITOK);
 2096         memset(sdp, 0, sizeof(*sdp));
 2097 
 2098         sdp->swd_flags = SWF_FAKE;
 2099         sdp->swd_dev = swap_dev;
 2100         sdp->swd_vp = vp;
 2101         swaplist_insert(sdp, spp, 0);
 2102         sdp->swd_pathlen = strlen("swap_device") + 1;
 2103         sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
 2104         if (copystr("swap_device", sdp->swd_path, sdp->swd_pathlen, 0))
 2105                 panic("swapmount: copystr");
 2106 
 2107         if (swap_on(curproc, sdp)) {
 2108                 swaplist_find(vp, 1);
 2109                 swaplist_trim();
 2110                 vput(sdp->swd_vp);
 2111                 free(sdp->swd_path, M_VMSWAP);
 2112                 free(sdp, M_VMSWAP);
 2113                 return;
 2114         }
 2115 
 2116         VOP_UNLOCK(vp, 0, curproc);
 2117 }
