1 /* $OpenBSD: pmapae.c,v 1.13 2007/07/20 19:48:15 mk Exp $ */
2
3 /*
4 * Copyright (c) 2006 Michael Shalayeff
5 * All rights reserved.
6 *
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
10 *
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER IN
16 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
17 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 */
19 /*
20 *
21 * Copyright (c) 1997 Charles D. Cranor and Washington University.
22 * All rights reserved.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 * must display the following acknowledgement:
34 * This product includes software developed by Charles D. Cranor and
35 * Washington University.
36 * 4. The name of the author may not be used to endorse or promote products
37 * derived from this software without specific prior written permission.
38 *
39 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
40 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
41 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
42 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
43 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
44 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
45 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
46 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
47 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
48 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
49 *
50 * from OpenBSD: pmap.c,v 1.85 2005/11/18 17:05:04 brad Exp
51 */
52 /*
53 * pmap.c: i386 pmap module rewrite
54 * Chuck Cranor <chuck@ccrc.wustl.edu>
55 * 11-Aug-97
56 *
57 * history of this pmap module: in addition to my own input, i used
58 * the following references for this rewrite of the i386 pmap:
59 *
60 * [1] the NetBSD i386 pmap. this pmap appears to be based on the
61 * BSD hp300 pmap done by Mike Hibler at University of Utah.
62 * it was then ported to the i386 by William Jolitz of UUNET
63 * Technologies, Inc. Then Charles M. Hannum of the NetBSD
64 * project fixed some bugs and provided some speed ups.
65 *
66 * [2] the FreeBSD i386 pmap. this pmap seems to be the
67 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
68 * and David Greenman.
69 *
70 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated
71 * between several processors. the VAX version was done by
72 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386
73 * version was done by Lance Berc, Mike Kupfer, Bob Baron,
74 * David Golub, and Richard Draves. the alpha version was
75 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou
76 * (NetBSD/alpha).
77 */
78 /*
79 * PAE support
80 * Michael Shalayeff <mickey@lucifier.net>
81 *
82 * This module implements PAE mode for i386.
83 *
84 */
85
86 #include <sys/param.h>
87 #include <sys/systm.h>
88 #include <sys/proc.h>
89 #include <sys/malloc.h>
90 #include <sys/pool.h>
91 #include <sys/user.h>
92 #include <sys/kernel.h>
93 #include <sys/mutex.h>
94
95 #include <uvm/uvm.h>
96
97 #include <machine/atomic.h>
98 #include <machine/cpu.h>
99 #include <machine/specialreg.h>
100 #include <machine/gdt.h>
101
102 #include <dev/isa/isareg.h>
103 #include <sys/msgbuf.h>
104 #include <stand/boot/bootarg.h>
105
106 /*
107 * this file contains the code for the "pmap module." the module's
108 * job is to manage the hardware's virtual to physical address mappings.
109 * note that there are two levels of mapping in the VM system:
110 *
111 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
112 * to map ranges of virtual address space to objects/files. for
113 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
114 * to the file /bin/ls starting at offset zero." note that
115 * the upper layer mapping is not concerned with how individual
116 * vm_pages are mapped.
117 *
118 * [2] the lower layer of the VM system (the pmap) maintains the mappings
119 * from virtual addresses. it is concerned with which vm_page is
120 * mapped where. for example, when you run /bin/ls and start
121 * at page 0x1000 the fault routine may lookup the correct page
122 * of the /bin/ls file and then ask the pmap layer to establish
123 * a mapping for it.
124 *
125 * note that information in the lower layer of the VM system can be
126 * thrown away since it can easily be reconstructed from the info
127 * in the upper layer.
128 *
129 * data structures we use include:
130 *
131 * - struct pmap: describes the address space of one thread
132 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
133 * - struct pv_head: there is one pv_head per managed page of
134 * physical memory. the pv_head points to a list of pv_entry
135 * structures which describe all the <PMAP,VA> pairs that this
136 * page is mapped in. this is critical for page based operations
137 * such as pmap_page_protect() [change protection on _all_ mappings
138 * of a page]
139 * - pv_page/pv_page_info: pv_entry's are allocated out of pv_page's.
140 * if we run out of pv_entry's we allocate a new pv_page and free
141 * its pv_entrys.
142 */
143 /*
144 * i386 PAE hardware Page Tables structure:
145 *
146 * the i386 PAE Page Table is a three-level PT which maps 4GB of VA.
147 * the pagesize is 4K (4096 [0x1000] bytes) or 2MB.
148 *
149  * the first level table is called the "page directory index" (the PDPT)
150  * and consists of 4 page directory index entries (PDIEs), 64 bits each.
151 *
152 * the second level table is called a "page directory" and it contains
153 * 512 page directory entries (PDEs). each PDE is
154 * 8 bytes (a long long), so a PD fits in a single 4K page. this page is
155 * the page directory page (PDP). each PDE in a PDP maps 1GB of space
156 * (512 * 2MB = 1GB). a PDE contains the physical address of the
157 * second level table: the page table. or, if 2MB pages are being used,
158 * then the PDE contains the PA of the 2MB page being mapped.
159 *
160 * a page table consists of 512 page table entries (PTEs). each PTE is
161 * 8 bytes (a long long), so a page table also fits in a single 4K page.
162 * a 4K page being used as a page table is called a page table page (PTP).
163 * each PTE in a PTP maps one 4K page (512 * 4K = 2MB). a PTE contains
164 * the physical address of the page it maps and some flag bits (described
165 * below).
166 *
167 * the processor has a special register, "cr3", which points to the
168 * the PDP which is currently controlling the mappings of the virtual
169 * address space.
170 *
171 * the following picture shows the translation process for a 4K page:
172 *
173 * %cr3 register [PA of PDPT]
174 * |
175 * | bits <31-30> of VA
176  * |            index the PDIE (0-3)
177 * | |
178 * v v
179 * +-----------+
180 * | PDP Ptr |
181 * | 4 entries |
182 * +-----------+
183 * |
184 * PA of PDP
185 * |
186 * |
187 * | bits <29-21> of VA bits <20-12> of VA bits <11-0>
188  * |   index the PDP (0 - 511)   index the PTP      are the page offset
189 * | | | |
190 * | v | |
191 * +-->+---------+ | |
192 * | PD Page | PA of v |
193 * | |-----PTP----->+------------+ |
194 * | 512 PDE | | page table |--PTE--+ |
195 * | entries | | (aka PTP) | | |
196 * +---------+ | 512 PTE | | |
197 * | entries | | |
198 * +------------+ | |
199 * | |
200 * bits <35-12> bits <11-0>
201 * p h y s i c a l a d d r
202 *
203 * the i386 caches PTEs in a TLB. it is important to flush out old
204 * TLB mappings when making a change to a mappings. writing to the
205 * %cr3 will flush the entire TLB. newer processors also have an
206 * instruction that will invalidate the mapping of a single page (which
207 * is useful if you are changing a single mappings because it preserves
208 * all the cached TLB entries).
209 *
210  * as shown, bits 35-12 of the PTE contain the PA of the page being mapped.
211 * the rest of the PTE is defined as follows:
212 * bit# name use
213  * 63   NX      no-execute bit (1=no instruction fetches), optional
214 * 11 n/a available for OS use, hardware ignores it
215 * 10 n/a available for OS use, hardware ignores it
216 * 9 n/a available for OS use, hardware ignores it
217 * 8 G global bit (see discussion below)
218  * 7    PS      page size [for PDEs] (0=4k, 1=2M <if supported>)
219 * 6 D dirty (modified) page
220 * 5 A accessed (referenced) page
221 * 4 PCD cache disable
222  * 3    PWT     page write through (cache)
223 * 2 U/S user/supervisor bit (0=supervisor only, 1=both u&s)
224 * 1 R/W read/write bit (0=read only, 1=read-write)
225 * 0 P present (valid)
226 *
227 * notes:
228 * - on the i386 the R/W bit is ignored if processor is in supervisor
229 * state (bug!)
230 * - PS is only supported on newer processors
231 * - PTEs with the G bit are global in the sense that they are not
232 * flushed from the TLB when %cr3 is written (to flush, use the
233 * "flush single page" instruction). this is only supported on
234 * newer processors. this bit can be used to keep the kernel's
235 * TLB entries around while context switching. since the kernel
236 * is mapped into all processes at the same place it does not make
237 * sense to flush these entries when switching from one process'
238 * pmap to another.
239 */
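/*
 * an illustrative sketch only (the code below uses the pdei/ptei macros and
 * the recursive mapping rather than walking the tables by hand): the PAE
 * translation of a 4K-page VA decomposes as
 *
 *	pdpt_idx = (va >> 30) & 0x3;	bits <31-30>, selects 1 of 4 PDIEs
 *	pde_idx  = (va >> 21) & 0x1ff;	bits <29-21>, selects 1 of 512 PDEs
 *	pte_idx  = (va >> 12) & 0x1ff;	bits <20-12>, selects 1 of 512 PTEs
 *	offset   = va & 0xfff;		bits <11-0>, byte within the page
 *
 * e.g. va 0xd0401234 -> pdpt_idx 3, pde_idx 0x82, pte_idx 1, offset 0x234.
 */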
240 /*
241 * A pmap describes a process' 4GB virtual address space. This
242 * virtual address space can be broken up into 2048 2MB regions which
243 * are described by PDEs in the PDP. The PDEs are defined as follows:
244 *
245 * Ranges are inclusive -> exclusive, just like vm_map_entry start/end.
246 * The following assumes that KERNBASE is 0xd0000000.
247 *
248 * PDE#s VA range Usage
249 * 0->1660 0x0 -> 0xcf800000 user address space, note that the
250 * max user address is 0xcfbfe000
251 * the final two pages in the last 4MB
252 * used to be reserved for the UAREA
253 * but now are no longer used.
254 * 1660 0xcf800000-> recursive mapping of PDP (used for
255 * 0xd0000000 linear mapping of PTPs).
256 * 1664->2044 0xd0000000-> kernel address space (constant
257 * 0xff800000 across all pmaps/processes).
258 * 2044 0xff800000-> "alternate" recursive PDP mapping
259 * <end> (for other pmaps).
260 *
261 *
262  * Note: A recursive PDP mapping provides a way to map all the PTEs for
263  * a 4GB address space into a linear chunk of virtual memory. In other
264  * words, the PTE for page 0 is the first 8-byte entry mapped into the 8MB
265  * recursive area. The PTE for page 1 is the second entry. The very last
266  * entry in the 8MB range is the PTE that maps VA 0xfffff000 (the last page
267  * in a 4GB address space).
268 *
269  * All pmaps' PDs must have the same values in slots 1664->2043 so that
270 * the kernel is always mapped in every process. These values are loaded
271 * into the PD at pmap creation time.
272 *
273 * At any one time only one pmap can be active on a processor. This is
274 * the pmap whose PDP is pointed to by processor register %cr3. This pmap
275 * will have all its PTEs mapped into memory at the recursive mapping
276  * point (slots #1660-1663 as shown above). When the pmap code wants to find the
277 * PTE for a virtual address, all it has to do is the following:
278 *
279 * Address of PTE = (1660 * 2MB) + (VA / NBPG) * sizeof(pt_entry_t)
280 * = 0xcf800000 + (VA / 4096) * 8
281 *
282 * What happens if the pmap layer is asked to perform an operation
283 * on a pmap that is not the one which is currently active? In that
284 * case we take the PA of the PDP of non-active pmap and put it in
285  * slots 2044-2047 of the active pmap. This causes the non-active pmap's
286  * PTEs to get mapped in the final 8MB of the 4GB address space
287  * (i.e. starting at 0xff800000).
288 *
289 * The following figure shows the effects of the recursive PDP mapping:
290 *
291 * PDP (%cr3->PDPTP)
292 * +----+
293 * | 0| -> PTP#0 that maps VA 0x0 -> 0x200000
294 * | |
295 * | |
296 * |1660| -> points back to PDP (%cr3) mapping VA 0xcf800000 -> 0xd0000000
297 * |1661| (PDP is 4 pages)
298 * |1662|
299 * |1663|
300  * |1664| -> first kernel PTP (maps 0xd0000000 -> 0xd0200000)
301 * | |
302 * |2044| -> points to alternate pmap's PDP (maps 0xff800000 -> end)
303 * |2045|
304 * |2046|
305 * |2047|
306 * +----+
307 *
308  * Note that the VA mapped by PDE#1660 (0xcf800000) is defined as "PTE_BASE".
309  * Note that the VA mapped by PDE#2044 (0xff800000) is defined as "APTE_BASE".
310 *
311  * Starting at VA 0xcf800000 the currently active PDP (%cr3) acts as the
312  * page table (PTP#1660-1663), i.e. four consecutively mapped pages:
313 *
314 * PTP#1660-3 == PDP(%cr3) => maps VA 0xcf800000 -> 0xd0000000
315 * +----+
316 * | 0| -> maps the contents of PTP#0 at VA 0xcf800000->0xcf801000
317 * | |
318 * | |
319 * |1660| -> maps the contents of PTP#1660 (the PDP) at VA 0xcfe7c000
320 * |1661|
321 * |1662|
322 * |1663|
323 * |1664| -> maps the contents of first kernel PTP
324 * | |
325 * |2047|
326 * +----+
327 *
328 * Note that mapping of the PDP at PTP#1660's VA (0xcfe7c000) is
329 * defined as "PDP_BASE".... within that mapping there are two
330 * defines:
331 * "PDP_PDE" (0xcfe7f3e0) is the VA of the PDE in the PDP
332 * which points back to itself.
333  * "APDP_PDE" (0xcfe7ffe0) is the VA of the PDE in the PDP which
334 * establishes the recursive mapping of the alternate pmap.
335 * To set the alternate PDP, one just has to put the correct
336 * PA info in *APDP_PDE.
337 *
338 * Note that in the APTE_BASE space, the APDP appears at VA
339 * "APDP_BASE" (0xffffc000).
340 *
341  * unfortunately we cannot map the PDPT itself recursively through the page
342  * tables because in their infinite wisdom they have defined %cr3 as only 32 bits!
343 *
344 */
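/*
 * illustrative sketch: with the recursive slot at PDSLOT_PTE (1660) loaded,
 * the PTE for any VA of the currently active pmap can be read directly
 * through the linear mapping (this is what the vtopte() macro below does):
 *
 *	pt_entry_t *pte = PTE_BASE + atop(va);
 *	paddr_t pa = (*pte & PG_FRAME) | (va & PAGE_MASK);
 *
 * which is exactly the 0xcf800000 + (va / 4096) * 8 formula given above.
 */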
345 /*
346 * memory allocation
347 *
348 * - there are three data structures that we must dynamically allocate:
349 *
350 * [A] new process' page directory page (PDP)
351 * - plan 1: done at pmap_create() we use
352 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this
353 * allocation.
354 *
355 * if we are low in free physical memory then we sleep in
356 * uvm_km_alloc -- in this case this is ok since we are creating
357 * a new pmap and should not be holding any locks.
358 *
359 * if the kernel is totally out of virtual space
360 * (i.e. uvm_km_alloc returns NULL), then we panic.
361 *
362 * XXX: the fork code currently has no way to return an "out of
363 * memory, try again" error code since uvm_fork [fka vm_fork]
364 * is a void function.
365 *
366 * [B] new page tables pages (PTP)
367 * call pae_pagealloc()
368 * => success: zero page, add to pm_pdir
369 * => failure: we are out of free vm_pages, let pmap_enter()
370 * tell UVM about it.
371 *
372 * note: for kernel PTPs, we start with NKPTP of them. as we map
373 * kernel memory (at uvm_map time) we check to see if we've grown
374 * the kernel pmap. if so, we call the optional function
375 * pmap_growkernel() to grow the kernel PTPs in advance.
376 *
377 * [C] pv_entry structures
378 * - plan 1: try to allocate one off the free list
379 * => success: done!
380 * => failure: no more free pv_entrys on the list
381 * - plan 2: try to allocate a new pv_page to add a chunk of
382 * pv_entrys to the free list
383 * [a] obtain a free, unmapped, VA in kmem_map. either
384 * we have one saved from a previous call, or we allocate
385 * one now using a "vm_map_lock_try" in uvm_map
386 * => success: we have an unmapped VA, continue to [b]
387 * => failure: unable to lock kmem_map or out of VA in it.
388 * move on to plan 3.
389 * [b] allocate a page in kmem_object for the VA
390 * => success: map it in, free the pv_entry's, DONE!
391 * => failure: kmem_object locked, no free vm_pages, etc.
392 * save VA for later call to [a], go to plan 3.
393 * If we fail, we simply let pmap_enter() tell UVM about it.
394 */
395 /*
396 * locking
397 *
398 * we have the following locks that we must contend with:
399 *
400 * "simple" locks:
401 *
402 * - pmap lock (per pmap, part of uvm_object)
403 * this lock protects the fields in the pmap structure including
404 * the non-kernel PDEs in the PDP, and the PTEs. it also locks
405 * in the alternate PTE space (since that is determined by the
406 * entry in the PDP).
407 *
408 * - pvh_lock (per pv_head)
409 * this lock protects the pv_entry list which is chained off the
410 * pv_head structure for a specific managed PA. it is locked
411 * when traversing the list (e.g. adding/removing mappings,
412 * syncing R/M bits, etc.)
413 *
414 * - pvalloc_lock
415 * this lock protects the data structures which are used to manage
416 * the free list of pv_entry structures.
417 *
418 * - pmaps_lock
419 * this lock protects the list of active pmaps (headed by "pmaps").
420 * we lock it when adding or removing pmaps from this list.
421 *
422 */
423
424 /*
425 * locking data structures
426 */
427
428 #define PMAP_MAP_TO_HEAD_LOCK() /* null */
429 #define PMAP_MAP_TO_HEAD_UNLOCK() /* null */
430
431 #define PMAP_HEAD_TO_MAP_LOCK() /* null */
432 #define PMAP_HEAD_TO_MAP_UNLOCK() /* null */
433
434 #define PG_FRAME 0xffffff000ULL /* page frame mask */
435 #define PG_LGFRAME 0xfffe00000ULL /* large (2M) page frame mask */
436
437 /*
438 * Redefine the PDSHIFT, NBPD
439 */
440 #undef PDSHIFT
441 #define PD_MASK 0xffe00000 /* page directory address bits */
442 #define PDSHIFT 21 /* page directory address shift */
443 #define PT_MASK 0x001ff000 /* page table address bits */
444 #undef NBPD
445 #define NBPD (1U << PDSHIFT) /* # bytes mapped by PD (2MB) */
446
447 /*
448 *
449 */
450 #undef PDSLOT_PTE
451 #define PDSLOT_PTE (1660U) /* 1660: for recursive PDP map */
452 #undef PDSLOT_KERN
453 #define PDSLOT_KERN (1664U) /* 1664: start of kernel space */
454 #undef PDSLOT_APTE
455 #define PDSLOT_APTE (2044U) /* 2044: alternative recursive slot */
456
457 /*
458 * The following defines give the virtual addresses of various MMU
459 * data structures:
460 * PTE_BASE and APTE_BASE: the base VA of the linear PTE mappings
461  * PDP_BASE and APDP_BASE: the base VA of the recursive mapping of the PDP
462 * PDP_PDE and APDP_PDE: the VA of the PDE that points back to the PDP/APDP
463 */
464 #define PTE_BASE ((pt_entry_t *) (PDSLOT_PTE * NBPD) )
465 #define APTE_BASE ((pt_entry_t *) (PDSLOT_APTE * NBPD) )
466 #define PDP_BASE ((pd_entry_t *)(((char *)PTE_BASE) + (PDSLOT_PTE * NBPG)))
467 #define APDP_BASE ((pd_entry_t *)(((char *)APTE_BASE) + (PDSLOT_APTE * NBPG)))
468 #define PDP_PDE (PDP_BASE + PDSLOT_PTE)
469 #define APDP_PDE (PDP_BASE + PDSLOT_APTE)
470
471 #define PTES_PER_PTP (NBPG / sizeof(pt_entry_t)) /* # of PTEs in a PTP */
472
473 /*
474 * various address macros
475 *
476 * vtopte: return a pointer to the PTE mapping a VA
477 *
478 */
479 #define vtopte(VA) (PTE_BASE + atop((vaddr_t)VA))
480
481 /*
482 * pdei/ptei: generate index into PDP/PTP from a VA
483 */
484 #define pdei(VA) (((VA) & PD_MASK) >> PDSHIFT)
485 #define ptei(VA) (((VA) & PT_MASK) >> PGSHIFT)
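/*
 * illustrative example: for va == 0xd0401234,
 *	pdei(va) == (0xd0400000 >> 21) == 1666	(PDIE 3, PDE 0x82)
 *	ptei(va) == (0x00001000 >> 12) == 1
 */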
486
487 /*
488 * Mach derived conversion macros
489 */
490 #define i386_round_pdr(x) ((((unsigned)(x)) + ~PD_MASK) & PD_MASK)
491
492 /*
493 * PTP macros:
494 * A PTP's index is the PD index of the PDE that points to it.
495 * A PTP's offset is the byte-offset in the PTE space that this PTP is at.
496 * A PTP's VA is the first VA mapped by that PTP.
497 *
498  * Note that NBPG == number of bytes in a PTP (4096 bytes == 512 entries)
499  * NBPD == number of bytes a PTP can map (2MB)
500 */
501
502 #define ptp_i2o(I) ((I) * NBPG) /* index => offset */
503 #define ptp_o2i(O) ((O) / NBPG) /* offset => index */
504 #define ptp_i2v(I) ((I) * NBPD) /* index => VA */
505 #define ptp_v2i(V) ((V) / NBPD) /* VA => index (same as pdei) */
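/*
 * illustrative example: the PTP behind PDE#1666 lives at object offset
 * ptp_i2o(1666) == 0x682000 and maps VAs starting at
 * ptp_i2v(1666) == 1666 * NBPD == 0xd0400000.
 */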
506
507 /*
508 * Access PD and PT
509 */
510 #define PDE(pm,i) (((pd_entry_t *)(pm)->pm_pdir)[(i)])
511
512 /*
513 * here we define the data types for PDEs and PTEs
514 */
515 typedef u_int64_t pd_entry_t; /* PDE */
516 typedef u_int64_t pt_entry_t; /* PTE */
517
518 /*
519 * Number of PTE's per cache line. 8 byte pte, 32-byte cache line
520 * Used to avoid false sharing of cache lines.
521 */
522 #define NPTECL 4
523
524 /*
525 * other data structures
526 */
527
528 extern u_int32_t protection_codes[]; /* maps MI prot to i386 prot code */
529 extern boolean_t pmap_initialized; /* pmap_init done yet? */
530
531 /*
532  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside an
533 * I386_MAXPROCS*NPTECL array of PTE's, to avoid cache line thrashing
534 * due to false sharing.
535 */
536
537 #ifdef MULTIPROCESSOR
538 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
539 #define VASLEW(va,id) ((va)+(id)*NPTECL*NBPG)
540 #else
541 #define PTESLEW(pte, id) (pte)
542 #define VASLEW(va,id) (va)
543 #endif
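/*
 * e.g. (illustrative) with NPTECL == 4, cpu 2 uses PTESLEW(zero_pte, 2)
 * == zero_pte + 8 and VASLEW(pmap_zerop, 2) == pmap_zerop + 8 * NBPG,
 * so each cpu's group of special PTEs lands in its own 32-byte cache line
 * and its special VAs land in their own window 8 pages further up.
 */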
544
545 /*
546 * special VAs and the PTEs that map them
547 */
548
549 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte;
550 extern caddr_t pmap_csrcp, pmap_cdstp, pmap_zerop, pmap_ptpp;
551
552 extern int pmap_pg_g;
553 extern struct pmap_head pmaps;
554
555 /*
556  * a version of uvm_pagealloc() that prefers memory from above 4GB
557 */
558 #define pae_pagealloc(obj, off, anon, flags) \
559 uvm_pagealloc_strat((obj), (off), (anon), (flags), \
560 UVM_PGA_STRAT_FALLBACK, VM_FREELIST_ABOVE4G)
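/*
 * illustrative: PTPs are allocated with this, e.g. (as done in
 * pmap_alloc_ptp_pae() below):
 *
 *	ptp = pae_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
 *	    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
 *
 * the FALLBACK strategy tries the above-4GB free list first and falls
 * back to the normal page allocation strategy when that list is empty.
 */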
561
562 /*
563 * local prototypes
564 */
565
566 struct vm_page *pmap_alloc_ptp_pae(struct pmap *, int, boolean_t);
567 #define ALLOCPV_NEED 0 /* need PV now */
568 #define ALLOCPV_TRY 1 /* just try to allocate, don't steal */
569 #define ALLOCPV_NONEED 2 /* don't need PV, just growing cache */
570 struct vm_page *pmap_get_ptp_pae(struct pmap *, int, boolean_t);
571 pt_entry_t *pmap_map_ptes_pae(struct pmap *);
572 void pmap_remove_ptes_pae(struct pmap *, struct vm_page *,
573 vaddr_t, vaddr_t, vaddr_t, int32_t *);
574 boolean_t pmap_remove_pte_pae(struct pmap *, struct vm_page *,
575 pt_entry_t *, vaddr_t, int32_t *);
576 void pmap_unmap_ptes_pae(struct pmap *);
577 vaddr_t pmap_tmpmap_pa_pae(paddr_t);
578 pt_entry_t *pmap_tmpmap_pvepte_pae(struct pv_entry *);
579 void pmap_tmpunmap_pa_pae(void);
580 void pmap_tmpunmap_pvepte_pae(struct pv_entry *);
581
582 /*
583 * pmap_tmpmap_pa: map a page in for tmp usage
584 */
585
586 vaddr_t
587 pmap_tmpmap_pa_pae(paddr_t pa)
588 {
589 #ifdef MULTIPROCESSOR
590 int id = cpu_number();
591 #endif
592 pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
593 caddr_t ptpva = VASLEW(pmap_ptpp, id);
594 #if defined(DIAGNOSTIC)
595 if (*ptpte)
596 panic("pmap_tmpmap_pa: ptp_pte in use?");
597 #endif
598 *ptpte = PG_V | PG_RW | pa; /* always a new mapping */
599 return((vaddr_t)ptpva);
600 }
601
602 /*
603 * pmap_tmpunmap_pa: unmap a tmp use page (undoes pmap_tmpmap_pa)
604 */
605
606 void
607 pmap_tmpunmap_pa_pae()
608 {
609 #ifdef MULTIPROCESSOR
610 int id = cpu_number();
611 #endif
612 pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
613 caddr_t ptpva = VASLEW(pmap_ptpp, id);
614 #if defined(DIAGNOSTIC)
615 if (!pmap_valid_entry(*ptpte))
616 panic("pmap_tmpunmap_pa: our pte invalid?");
617 #endif
618 *ptpte = 0; /* zap! */
619 pmap_update_pg((vaddr_t)ptpva);
620 #ifdef MULTIPROCESSOR
621 /*
622 * No need for tlb shootdown here, since ptp_pte is per-CPU.
623 */
624 #endif
625 }
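/*
 * a minimal (illustrative) use of the tmp mapping pair, e.g. to touch a
 * physical page "pa" that has no other KVA mapping:
 *
 *	vaddr_t va = pmap_tmpmap_pa_pae(pa);
 *	bzero((void *)va, NBPG);
 *	pmap_tmpunmap_pa_pae();
 */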
626
627 /*
628 * pmap_tmpmap_pvepte: get a quick mapping of a PTE for a pv_entry
629 *
630 * => do NOT use this on kernel mappings [why? because pv_ptp may be NULL]
631 */
632
633 pt_entry_t *
634 pmap_tmpmap_pvepte_pae(struct pv_entry *pve)
635 {
636 #ifdef DIAGNOSTIC
637 if (pve->pv_pmap == pmap_kernel())
638 panic("pmap_tmpmap_pvepte: attempt to map kernel");
639 #endif
640
641 /* is it current pmap? use direct mapping... */
642 if (pmap_is_curpmap(pve->pv_pmap))
643 return(vtopte(pve->pv_va));
644
645 return(((pt_entry_t *)pmap_tmpmap_pa_pae(VM_PAGE_TO_PHYS(pve->pv_ptp)))
646 + ptei((unsigned)pve->pv_va));
647 }
648
649 /*
650 * pmap_tmpunmap_pvepte: release a mapping obtained with pmap_tmpmap_pvepte
651 */
652
653 void
654 pmap_tmpunmap_pvepte_pae(struct pv_entry *pve)
655 {
656 /* was it current pmap? if so, return */
657 if (pmap_is_curpmap(pve->pv_pmap))
658 return;
659
660 pmap_tmpunmap_pa_pae();
661 }
662
663 /*
664 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
665 *
666 * => we lock enough pmaps to keep things locked in
667 * => must be undone with pmap_unmap_ptes before returning
668 */
669
670 pt_entry_t *
671 pmap_map_ptes_pae(struct pmap *pmap)
672 {
673 pd_entry_t opde;
674
675 /* the kernel's pmap is always accessible */
676 if (pmap == pmap_kernel()) {
677 return(PTE_BASE);
678 }
679
680 /* if curpmap then we are always mapped */
681 if (pmap_is_curpmap(pmap)) {
682 simple_lock(&pmap->pm_obj.vmobjlock);
683 return(PTE_BASE);
684 }
685
686 /* need to lock both curpmap and pmap: use ordered locking */
687 if ((unsigned) pmap < (unsigned) curpcb->pcb_pmap) {
688 simple_lock(&pmap->pm_obj.vmobjlock);
689 simple_lock(&curpcb->pcb_pmap->pm_obj.vmobjlock);
690 } else {
691 simple_lock(&curpcb->pcb_pmap->pm_obj.vmobjlock);
692 simple_lock(&pmap->pm_obj.vmobjlock);
693 }
694
695 /* need to load a new alternate pt space into curpmap? */
696 opde = *APDP_PDE;
697 if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdidx[0]) {
698 APDP_PDE[0] = pmap->pm_pdidx[0] | PG_RW | PG_V;
699 APDP_PDE[1] = pmap->pm_pdidx[1] | PG_RW | PG_V;
700 APDP_PDE[2] = pmap->pm_pdidx[2] | PG_RW | PG_V;
701 APDP_PDE[3] = pmap->pm_pdidx[3] | PG_RW | PG_V;
702 if (pmap_valid_entry(opde))
703 pmap_apte_flush(curpcb->pcb_pmap);
704 }
705 return(APTE_BASE);
706 }
707
708 /*
709 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
710 */
711
712 void
713 pmap_unmap_ptes_pae(struct pmap *pmap)
714 {
715 if (pmap == pmap_kernel())
716 return;
717
718 if (pmap_is_curpmap(pmap)) {
719 simple_unlock(&pmap->pm_obj.vmobjlock);
720 } else {
721 #if defined(MULTIPROCESSOR)
722 APDP_PDE[0] = 0;
723 APDP_PDE[1] = 0;
724 APDP_PDE[2] = 0;
725 APDP_PDE[3] = 0;
726 pmap_apte_flush(curpcb->pcb_pmap);
727 #endif
728 simple_unlock(&pmap->pm_obj.vmobjlock);
729 simple_unlock(&curpcb->pcb_pmap->pm_obj.vmobjlock);
730 }
731 }
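/*
 * typical (illustrative) usage of the map/unmap pair, as in
 * pmap_extract_pae() below:
 *
 *	pt_entry_t *ptes = pmap_map_ptes_pae(pmap);
 *	pt_entry_t pte = ptes[atop(va)];
 *	pmap_unmap_ptes_pae(pmap);
 */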
732
733 u_int32_t
734 pmap_pte_set_pae(vaddr_t va, paddr_t pa, u_int32_t bits)
735 {
736 pt_entry_t pte, *ptep = vtopte(va);
737
738 pte = i386_atomic_testset_uq(ptep, pa | bits);
739 return (pte & ~PG_FRAME);
740 }
741
742 u_int32_t
743 pmap_pte_setbits_pae(vaddr_t va, u_int32_t set, u_int32_t clr)
744 {
745 pt_entry_t *ptep = vtopte(va);
746 pt_entry_t pte = *ptep;
747
748 i386_atomic_testset_uq(ptep, (pte | set) & ~(pt_entry_t)clr);
749 return (pte & ~PG_FRAME);
750
751 }
752
753 u_int32_t
754 pmap_pte_bits_pae(vaddr_t va)
755 {
756 pt_entry_t *ptep = vtopte(va);
757
758 return (*ptep & ~PG_FRAME);
759 }
760
761 paddr_t
762 pmap_pte_paddr_pae(vaddr_t va)
763 {
764 pt_entry_t *ptep = vtopte(va);
765
766 return (*ptep & PG_FRAME);
767 }
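/*
 * illustrative composition of the accessors above: a PTE splits into its
 * frame (bits 35-12) and its flag bits, so re-mapping a kernel VA uncached
 * (PG_N) could be sketched as
 *
 *	paddr_t pa = pmap_pte_paddr_pae(va);
 *	u_int32_t bits = pmap_pte_bits_pae(va);
 *	pmap_pte_set_pae(va, pa, bits | PG_N);
 */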
768
769 /*
770 * Switch over to PAE page tables
771 */
772 void
773 pmap_bootstrap_pae()
774 {
775 extern paddr_t avail_end, avail_end2;
776 extern int cpu_pae, nkpde;
777 struct pmap *kpm = pmap_kernel();
778 struct vm_page *ptp;
779 paddr_t ptaddr;
780 u_int32_t bits;
781 vaddr_t va, eva;
782 int i, pn, pe;
783
784 if (!cpu_pae || avail_end >= avail_end2 || !(cpu_feature & CPUID_PAE)){
785 avail_end2 = avail_end;
786 return;
787 }
788
789 va = (vaddr_t)kpm->pm_pdir;
790 kpm->pm_pdidx[0] = (va + 0*NBPG - KERNBASE) | PG_V;
791 kpm->pm_pdidx[1] = (va + 1*NBPG - KERNBASE) | PG_V;
792 kpm->pm_pdidx[2] = (va + 2*NBPG - KERNBASE) | PG_V;
793 kpm->pm_pdidx[3] = (va + 3*NBPG - KERNBASE) | PG_V;
794 /* map pde recursively into itself */
795 PDE(kpm, PDSLOT_PTE+0) = kpm->pm_pdidx[0] | PG_KW;
796 PDE(kpm, PDSLOT_PTE+1) = kpm->pm_pdidx[1] | PG_KW;
797 PDE(kpm, PDSLOT_PTE+2) = kpm->pm_pdidx[2] | PG_KW;
798 PDE(kpm, PDSLOT_PTE+3) = kpm->pm_pdidx[3] | PG_KW;
799
800 /* transfer all kernel mappings over into pae tables */
801 for (va = KERNBASE, eva = va + (nkpde << 22);
802 va < eva; va += PAGE_SIZE) {
803 if (!pmap_valid_entry(PDE(kpm, pdei(va)))) {
804 ptp = pae_pagealloc(&kpm->pm_obj, va, NULL,
805 UVM_PGA_ZERO);
806 ptaddr = VM_PAGE_TO_PHYS(ptp);
807 PDE(kpm, pdei(va)) = ptaddr | PG_KW | PG_V;
808 pmap_pte_set_86((vaddr_t)vtopte(va),
809 ptaddr, PG_KW | PG_V);
810
811 /* count PTP as resident */
812 kpm->pm_stats.resident_count++;
813 }
814 bits = pmap_pte_bits_86(va) | pmap_pg_g;
815 if (pmap_valid_entry(bits))
816 pmap_pte_set_pae(va, pmap_pte_paddr_86(va), bits);
817 }
818
819 if (!cpu_paenable(&kpm->pm_pdidx[0])) {
820 extern struct user *proc0paddr;
821
822 proc0paddr->u_pcb.pcb_cr3 = kpm->pm_pdirpa =
823 (vaddr_t)kpm - KERNBASE;
824 kpm->pm_pdirsize = 4 * NBPG;
825
826 csrc_pte = vtopte(pmap_csrcp);
827 cdst_pte = vtopte(pmap_cdstp);
828 zero_pte = vtopte(pmap_zerop);
829 ptp_pte = vtopte(pmap_ptpp);
830
831 nkpde *= 2;
832 nkptp_max = 2048 - PDSLOT_KERN - 4;
833 vm_max_address = (PDSLOT_PTE << PDSHIFT) +
834 (PDSLOT_PTE << PGSHIFT);
835
836 pmap_pte_set_p = pmap_pte_set_pae;
837 pmap_pte_setbits_p = pmap_pte_setbits_pae;
838 pmap_pte_bits_p = pmap_pte_bits_pae;
839 pmap_pte_paddr_p = pmap_pte_paddr_pae;
840 pmap_change_attrs_p = pmap_change_attrs_pae;
841 pmap_enter_p = pmap_enter_pae;
842 pmap_extract_p = pmap_extract_pae;
843 pmap_growkernel_p = pmap_growkernel_pae;
844 pmap_page_remove_p = pmap_page_remove_pae;
845 pmap_remove_p = pmap_remove_pae;
846 pmap_test_attrs_p = pmap_test_attrs_pae;
847 pmap_unwire_p = pmap_unwire_pae;
848 pmap_write_protect_p = pmap_write_protect_pae;
849 pmap_pinit_pd_p = pmap_pinit_pd_pae;
850 pmap_zero_phys_p = pmap_zero_phys_pae;
851 pmap_zero_page_uncached_p = pmap_zero_page_uncached_pae;
852 pmap_copy_page_p = pmap_copy_page_pae;
853 pmap_try_steal_pv_p = pmap_try_steal_pv_pae;
854
855 bzero((void *)kpm->pm_pdir + 8, (PDSLOT_PTE-1) * 8);
856 /* TODO also reclaim old PDPs */
857 for (i = 0; i < vm_nphysseg; i++)
858 if (vm_physmem[i].start > atop(0xfffff000)) {
859 vm_physmem[i].avail_end = vm_physmem[i].end;
860 /* free vm_pages (uvm had already zeroed 'em) */
861 for (pn = 0, pe = vm_physmem[i].end -
862 vm_physmem[i].start; pn < pe ; pn++) {
863 uvmexp.npages++;
864 /* add page to free pool */
865 uvm_pagefree(&vm_physmem[i].pgs[pn]);
866 }
867
868 }
869 uvm_page_rehash();
870 }
871 }
872
873 /*
874 * p v _ e n t r y f u n c t i o n s
875 */
876
877 /*
878 * pv_entry allocation functions:
879 * the main pv_entry allocation functions are:
880 * pmap_alloc_pv: allocate a pv_entry structure
881 * pmap_free_pv: free one pv_entry
882 * pmap_free_pvs: free a list of pv_entrys
883 *
884 * the rest are helper functions
885 */
886
887 /*
888 * pmap_try_steal_pv: try and steal a pv_entry from a pmap
889 *
890 * => return true if we did it!
891 */
892
893 boolean_t
894 pmap_try_steal_pv_pae(struct pv_head *pvh, struct pv_entry *cpv,
895 struct pv_entry *prevpv)
896 {
897 pt_entry_t *ptep, opte;
898 #ifdef MULTIPROCESSOR
899 int32_t cpumask = 0;
900 #endif
901
902 /*
903 * we never steal kernel mappings or mappings from pmaps we can't lock
904 */
905
906 if (cpv->pv_pmap == pmap_kernel() ||
907 !simple_lock_try(&cpv->pv_pmap->pm_obj.vmobjlock))
908 return(FALSE);
909
910 /*
911 * yes, we can try and steal it. first we need to remove the
912 * mapping from the pmap.
913 */
914
915 ptep = pmap_tmpmap_pvepte_pae(cpv);
916 if (*ptep & PG_W) {
917 ptep = NULL; /* wired page, avoid stealing this one */
918 } else {
919 opte = i386_atomic_testset_uq(ptep, 0); /* zap! */
920 #ifdef MULTIPROCESSOR
921 pmap_tlb_shootdown(cpv->pv_pmap, cpv->pv_va, opte, &cpumask);
922 pmap_tlb_shootnow(cpumask);
923 #else
924 /* Don't bother deferring in the single CPU case. */
925 if (pmap_is_curpmap(cpv->pv_pmap))
926 pmap_update_pg(cpv->pv_va);
927 #endif
928 pmap_tmpunmap_pvepte_pae(cpv);
929 }
930 if (ptep == NULL) {
931 simple_unlock(&cpv->pv_pmap->pm_obj.vmobjlock);
932 return(FALSE); /* wired page, abort! */
933 }
934 cpv->pv_pmap->pm_stats.resident_count--;
935 if (cpv->pv_ptp && cpv->pv_ptp->wire_count)
936 /* drop PTP's wired count */
937 cpv->pv_ptp->wire_count--;
938
939 /*
940 * XXX: if wire_count goes to one the PTP could be freed, however,
941 * we'd have to lock the page queues (etc.) to do that and it could
942 * cause deadlock headaches. besides, the pmap we just stole from
943 * may want the mapping back anyway, so leave the PTP around.
944 */
945
946 /*
947 * now we need to remove the entry from the pvlist
948 */
949
950 if (cpv == pvh->pvh_list)
951 pvh->pvh_list = cpv->pv_next;
952 else
953 prevpv->pv_next = cpv->pv_next;
954 return(TRUE);
955 }
956
957 /*
958 * p t p f u n c t i o n s
959 */
960
961 /*
962 * pmap_alloc_ptp: allocate a PTP for a PMAP
963 *
964 * => pmap should already be locked by caller
965 * => we use the ptp's wire_count to count the number of active mappings
966 * in the PTP (we start it at one to prevent any chance this PTP
967 * will ever leak onto the active/inactive queues)
968 * => we should not be holding any pv_head locks (in case we are forced
969 * to call pmap_steal_ptp())
970 * => we may need to lock pv_head's if we have to steal a PTP
971 * => just_try: true if we want a PTP, but not enough to steal one
972 * from another pmap (e.g. during optional functions like pmap_copy)
973 */
974
975 struct vm_page *
976 pmap_alloc_ptp_pae(struct pmap *pmap, int pde_index, boolean_t just_try)
977 {
978 struct vm_page *ptp;
979
980 ptp = pae_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
981 UVM_PGA_USERESERVE|UVM_PGA_ZERO);
982 if (ptp == NULL)
983 return(NULL);
984
985 /* got one! */
986 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY);
987 ptp->wire_count = 1; /* no mappings yet */
988 PDE(pmap, pde_index) =
989 (pd_entry_t)(VM_PAGE_TO_PHYS(ptp) | PG_u | PG_RW | PG_V);
990 pmap->pm_stats.resident_count++; /* count PTP as resident */
991 pmap->pm_ptphint = ptp;
992 return(ptp);
993 }
994
995 /*
996 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
997 *
998 * => pmap should NOT be pmap_kernel()
999 * => pmap should be locked
1000 */
1001
1002 struct vm_page *
1003 pmap_get_ptp_pae(struct pmap *pmap, int pde_index, boolean_t just_try)
1004 {
1005 struct vm_page *ptp;
1006
1007 if (pmap_valid_entry(PDE(pmap, pde_index))) {
1008
1009 /* valid... check hint (saves us a PA->PG lookup) */
1010 if (pmap->pm_ptphint &&
1011 (PDE(pmap, pde_index) & PG_FRAME) ==
1012 VM_PAGE_TO_PHYS(pmap->pm_ptphint))
1013 return(pmap->pm_ptphint);
1014
1015 ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index));
1016 #ifdef DIAGNOSTIC
1017 if (ptp == NULL)
1018 panic("pmap_get_ptp: unmanaged user PTP");
1019 #endif
1020 pmap->pm_ptphint = ptp;
1021 return(ptp);
1022 }
1023
1024 /* allocate a new PTP (updates ptphint) */
1025 return (pmap_alloc_ptp_pae(pmap, pde_index, just_try));
1026 }
1027
1028 /*
1029 * pmap_pinit_pd: given a freshly allocated pmap structure, give it a PD
1030 */
1031 void
1032 pmap_pinit_pd_pae(struct pmap *pmap)
1033 {
1034 extern int nkpde;
1035 vaddr_t va;
1036
1037 /* allocate PDP */
1038 pmap->pm_pdir = uvm_km_alloc(kernel_map, 4 * NBPG);
1039 if (pmap->pm_pdir == NULL)
1040 panic("pmap_pinit: kernel_map out of virtual space!");
1041 /* page index is in the pmap! */
1042 pmap_extract(pmap_kernel(), (vaddr_t)pmap, &pmap->pm_pdirpa);
1043 /* fill out the PDPT entries */
1044 va = (vaddr_t)pmap->pm_pdir;
1045 pmap_extract(pmap_kernel(), va + 0*NBPG, &pmap->pm_pdidx[0]);
1046 pmap_extract(pmap_kernel(), va + 1*NBPG, &pmap->pm_pdidx[1]);
1047 pmap_extract(pmap_kernel(), va + 2*NBPG, &pmap->pm_pdidx[2]);
1048 pmap_extract(pmap_kernel(), va + 3*NBPG, &pmap->pm_pdidx[3]);
1049 pmap->pm_pdidx[0] |= PG_V;
1050 pmap->pm_pdidx[1] |= PG_V;
1051 pmap->pm_pdidx[2] |= PG_V;
1052 pmap->pm_pdidx[3] |= PG_V;
1053 pmap->pm_pdirsize = 4 * NBPG;
1054
1055 /* init PDP */
1056 /* zero init area */
1057 bzero((void *)pmap->pm_pdir, PDSLOT_PTE * sizeof(pd_entry_t));
1058 /* put in recursive PDE to map the PTEs */
1059 PDE(pmap, PDSLOT_PTE+0) = pmap->pm_pdidx[0] | PG_KW;
1060 PDE(pmap, PDSLOT_PTE+1) = pmap->pm_pdidx[1] | PG_KW;
1061 PDE(pmap, PDSLOT_PTE+2) = pmap->pm_pdidx[2] | PG_KW;
1062 PDE(pmap, PDSLOT_PTE+3) = pmap->pm_pdidx[3] | PG_KW;
1063
1064 /*
1065 * we need to lock pmaps_lock to prevent nkpde from changing on
1066 * us. note that there is no need to splvm to protect us from
1067 * malloc since malloc allocates out of a submap and we should have
1068 * already allocated kernel PTPs to cover the range...
1069 */
1070 simple_lock(&pmaps_lock);
1071 /* put in kernel VM PDEs */
1072 bcopy(&PDP_BASE[PDSLOT_KERN], &PDE(pmap, PDSLOT_KERN),
1073 nkpde * sizeof(pd_entry_t));
1074 /* zero the rest */
1075 bzero(&PDE(pmap, PDSLOT_KERN + nkpde), pmap->pm_pdirsize -
1076 ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t)));
1077 LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
1078 simple_unlock(&pmaps_lock);
1079 }
1080
1081 /*
1082 * some misc. functions
1083 */
1084
1085 /*
1086 * pmap_extract: extract a PA for the given VA
1087 */
1088
1089 boolean_t
1090 pmap_extract_pae(struct pmap *pmap, vaddr_t va, paddr_t *pap)
1091 {
1092 paddr_t retval;
1093 pt_entry_t *ptes;
1094
1095 if (PDE(pmap, pdei(va))) {
1096 ptes = pmap_map_ptes_pae(pmap);
1097 retval = (paddr_t)(ptes[atop(va)] & PG_FRAME);
1098 pmap_unmap_ptes_pae(pmap);
1099 if (pap != NULL)
1100 *pap = retval | (va & ~PG_FRAME);
1101 return (TRUE);
1102 }
1103 return (FALSE);
1104 }
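/*
 * illustrative caller:
 *
 *	paddr_t pa;
 *	if (pmap_extract_pae(pmap_kernel(), va, &pa) == FALSE)
 *		panic("va 0x%lx has no mapping", va);
 */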
1105
1106 extern void (*pagezero)(void *, size_t);
1107
1108 /*
1109 * pmap_zero_phys: same as pmap_zero_page, but for use before vm_pages are
1110 * initialized.
1111 */
1112 void
1113 pmap_zero_phys_pae(paddr_t pa)
1114 {
1115 #ifdef MULTIPROCESSOR
1116 int id = cpu_number();
1117 #endif
1118 pt_entry_t *zpte = PTESLEW(zero_pte, id);
1119 caddr_t zerova = VASLEW(pmap_zerop, id);
1120
1121 #ifdef DIAGNOSTIC
1122 if (*zpte)
1123 panic("pmap_zero_phys: lock botch");
1124 #endif
1125 *zpte = (pa & PG_FRAME) | PG_V | PG_RW; /* map in */
1126 pmap_update_pg((vaddr_t)zerova); /* flush TLB */
1127 pagezero(zerova, PAGE_SIZE); /* zero */
1128 *zpte = 0; /* zap! */
1129 }
1130
1131 /*
1132 * pmap_zero_page_uncached: the same, except uncached.
1133 */
1134
1135 boolean_t
1136 pmap_zero_page_uncached_pae(paddr_t pa)
1137 {
1138 #ifdef MULTIPROCESSOR
1139 int id = cpu_number();
1140 #endif
1141 pt_entry_t *zpte = PTESLEW(zero_pte, id);
1142 caddr_t zerova = VASLEW(pmap_zerop, id);
1143
1144 #ifdef DIAGNOSTIC
1145 if (*zpte)
1146 panic("pmap_zero_page_uncached: lock botch");
1147 #endif
1148
1149 	*zpte = (pa & PG_FRAME) | PG_V | PG_RW | PG_N;	/* map in */
1150 pmap_update_pg((vaddr_t)zerova); /* flush TLB */
1151 pagezero(zerova, PAGE_SIZE); /* zero */
1152 *zpte = 0; /* zap! */
1153
1154 return (TRUE);
1155 }
1156
1157 /*
1158 * pmap_copy_page: copy a page
1159 */
1160
1161 void
1162 pmap_copy_page_pae(struct vm_page *srcpg, struct vm_page *dstpg)
1163 {
1164 paddr_t srcpa = VM_PAGE_TO_PHYS(srcpg);
1165 paddr_t dstpa = VM_PAGE_TO_PHYS(dstpg);
1166 #ifdef MULTIPROCESSOR
1167 int id = cpu_number();
1168 #endif
1169 pt_entry_t *spte = PTESLEW(csrc_pte,id);
1170 pt_entry_t *dpte = PTESLEW(cdst_pte,id);
1171 caddr_t csrcva = VASLEW(pmap_csrcp, id);
1172 caddr_t cdstva = VASLEW(pmap_cdstp, id);
1173
1174 #ifdef DIAGNOSTIC
1175 if (*spte || *dpte)
1176 panic("pmap_copy_page: lock botch");
1177 #endif
1178
1179 *spte = (srcpa & PG_FRAME) | PG_V | PG_RW;
1180 *dpte = (dstpa & PG_FRAME) | PG_V | PG_RW;
1181 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
1182 bcopy(csrcva, cdstva, PAGE_SIZE);
1183 *spte = *dpte = 0; /* zap! */
1184 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
1185 #ifdef MULTIPROCESSOR
1186 /* Using per-cpu VA; no shootdown required here. */
1187 #endif
1188 }
1189
1190 /*
1191 * p m a p r e m o v e f u n c t i o n s
1192 *
1193 * functions that remove mappings
1194 */
1195
1196 /*
1197 * pmap_remove_ptes: remove PTEs from a PTP
1198 *
1199 * => must have proper locking on pmap_master_lock
1200 * => caller must hold pmap's lock
1201 * => PTP must be mapped into KVA
1202 * => PTP should be null if pmap == pmap_kernel()
1203 */
1204
1205 void
1206 pmap_remove_ptes_pae(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
1207 vaddr_t startva, vaddr_t endva, int32_t *cpumaskp)
1208 {
1209 struct pv_entry *pv_tofree = NULL; /* list of pv_entrys to free */
1210 struct pv_entry *pve;
1211 pt_entry_t *pte = (pt_entry_t *) ptpva;
1212 pt_entry_t opte;
1213 int bank, off;
1214
1215 /*
1216 * note that ptpva points to the PTE that maps startva. this may
1217 * or may not be the first PTE in the PTP.
1218 *
1219 * we loop through the PTP while there are still PTEs to look at
1220 * and the wire_count is greater than 1 (because we use the wire_count
1221 * to keep track of the number of real PTEs in the PTP).
1222 */
1223
1224 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
1225 ; pte++, startva += NBPG) {
1226 if (!pmap_valid_entry(*pte))
1227 continue; /* VA not mapped */
1228
1229 opte = i386_atomic_testset_uq(pte, 0); /* zap! */
1230
1231 if (opte & PG_W)
1232 pmap->pm_stats.wired_count--;
1233 pmap->pm_stats.resident_count--;
1234
1235 if (opte & PG_U)
1236 pmap_tlb_shootdown(pmap, startva, opte, cpumaskp);
1237
1238 if (ptp) {
1239 ptp->wire_count--; /* dropping a PTE */
1240 /* Make sure that the PDE is flushed */
1241 if ((ptp->wire_count <= 1) && !(opte & PG_U))
1242 pmap_tlb_shootdown(pmap, startva, opte,
1243 cpumaskp);
1244 }
1245
1246 /*
1247 * if we are not on a pv_head list we are done.
1248 */
1249
1250 if ((opte & PG_PVLIST) == 0) {
1251 #ifdef DIAGNOSTIC
1252 if (vm_physseg_find(atop(opte & PG_FRAME), &off)
1253 != -1)
1254 panic("pmap_remove_ptes: managed page without "
1255 "PG_PVLIST for 0x%lx", startva);
1256 #endif
1257 continue;
1258 }
1259
1260 bank = vm_physseg_find(atop(opte & PG_FRAME), &off);
1261 #ifdef DIAGNOSTIC
1262 if (bank == -1)
1263 panic("pmap_remove_ptes: unmanaged page marked "
1264 "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
1265 startva, (u_long)(opte & PG_FRAME));
1266 #endif
1267
1268 /* sync R/M bits */
1269 simple_lock(&vm_physmem[bank].pmseg.pvhead[off].pvh_lock);
1270 vm_physmem[bank].pmseg.attrs[off] |= (opte & (PG_U|PG_M));
1271 pve = pmap_remove_pv(&vm_physmem[bank].pmseg.pvhead[off], pmap,
1272 startva);
1273 simple_unlock(&vm_physmem[bank].pmseg.pvhead[off].pvh_lock);
1274
1275 if (pve) {
1276 pve->pv_next = pv_tofree;
1277 pv_tofree = pve;
1278 }
1279
1280 /* end of "for" loop: time for next pte */
1281 }
1282 if (pv_tofree)
1283 pmap_free_pvs(pmap, pv_tofree);
1284 }
1285
1286
1287 /*
1288 * pmap_remove_pte: remove a single PTE from a PTP
1289 *
1290 * => must have proper locking on pmap_master_lock
1291 * => caller must hold pmap's lock
1292 * => PTP must be mapped into KVA
1293 * => PTP should be null if pmap == pmap_kernel()
1294 * => returns true if we removed a mapping
1295 */
1296
1297 boolean_t
1298 pmap_remove_pte_pae(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
1299 vaddr_t va, int32_t *cpumaskp)
1300 {
1301 pt_entry_t opte;
1302 int bank, off;
1303 struct pv_entry *pve;
1304
1305 if (!pmap_valid_entry(*pte))
1306 return(FALSE); /* VA not mapped */
1307
1308 opte = *pte; /* save the old PTE */
1309 *pte = 0; /* zap! */
1310
1311 pmap_exec_account(pmap, va, opte, 0);
1312
1313 if (opte & PG_W)
1314 pmap->pm_stats.wired_count--;
1315 pmap->pm_stats.resident_count--;
1316
1317 if (opte & PG_U)
1318 pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
1319
1320 if (ptp) {
1321 ptp->wire_count--; /* dropping a PTE */
1322 /* Make sure that the PDE is flushed */
1323 if ((ptp->wire_count <= 1) && !(opte & PG_U))
1324 pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
1325
1326 }
1327
1328 /*
1329 * if we are not on a pv_head list we are done.
1330 */
1331
1332 if ((opte & PG_PVLIST) == 0) {
1333 #ifdef DIAGNOSTIC
1334 if (vm_physseg_find(atop(opte & PG_FRAME), &off) != -1)
1335 panic("pmap_remove_pte: managed page without "
1336 "PG_PVLIST for 0x%lx", va);
1337 #endif
1338 return(TRUE);
1339 }
1340
1341 bank = vm_physseg_find(atop(opte & PG_FRAME), &off);
1342 #ifdef DIAGNOSTIC
1343 if (bank == -1)
1344 panic("pmap_remove_pte: unmanaged page marked "
1345 "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va,
1346 (u_long)(opte & PG_FRAME));
1347 #endif
1348
1349 /* sync R/M bits */
1350 simple_lock(&vm_physmem[bank].pmseg.pvhead[off].pvh_lock);
1351 vm_physmem[bank].pmseg.attrs[off] |= (opte & (PG_U|PG_M));
1352 pve = pmap_remove_pv(&vm_physmem[bank].pmseg.pvhead[off], pmap, va);
1353 simple_unlock(&vm_physmem[bank].pmseg.pvhead[off].pvh_lock);
1354
1355 if (pve)
1356 pmap_free_pv(pmap, pve);
1357 return(TRUE);
1358 }
1359
1360 /*
1361 * pmap_remove: top level mapping removal function
1362 *
1363 * => caller should not be holding any pmap locks
1364 */
1365
1366 void
1367 pmap_remove_pae(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
1368 {
1369 pt_entry_t *ptes, opte;
1370 boolean_t result;
1371 paddr_t ptppa;
1372 vaddr_t blkendva;
1373 struct vm_page *ptp;
1374 int32_t cpumask = 0;
1375 TAILQ_HEAD(, vm_page) empty_ptps;
1376
1377 /*
1378 * we lock in the pmap => pv_head direction
1379 */
1380
1381 TAILQ_INIT(&empty_ptps);
1382
1383 PMAP_MAP_TO_HEAD_LOCK();
1384 ptes = pmap_map_ptes_pae(pmap); /* locks pmap */
1385 /*
1386 * removing one page? take shortcut function.
1387 */
1388
1389 if (sva + PAGE_SIZE == eva) {
1390
1391 if (pmap_valid_entry(PDE(pmap, pdei(sva)))) {
1392
1393 /* PA of the PTP */
1394 ptppa = PDE(pmap, pdei(sva)) & PG_FRAME;
1395
1396 /* get PTP if non-kernel mapping */
1397
1398 if (pmap == pmap_kernel()) {
1399 /* we never free kernel PTPs */
1400 ptp = NULL;
1401 } else {
1402 if (pmap->pm_ptphint &&
1403 VM_PAGE_TO_PHYS(pmap->pm_ptphint) ==
1404 ptppa) {
1405 ptp = pmap->pm_ptphint;
1406 } else {
1407 ptp = PHYS_TO_VM_PAGE(ptppa);
1408 #ifdef DIAGNOSTIC
1409 if (ptp == NULL)
1410 panic("pmap_remove: unmanaged "
1411 "PTP detected");
1412 #endif
1413 }
1414 }
1415
1416 /* do it! */
1417 result = pmap_remove_pte_pae(pmap, ptp,
1418 &ptes[atop(sva)], sva, &cpumask);
1419
1420 /*
1421 * if mapping removed and the PTP is no longer
1422 * being used, free it!
1423 */
1424
1425 if (result && ptp && ptp->wire_count <= 1) {
1426 opte = i386_atomic_testset_uq(&PDE(pmap,
1427 pdei(sva)), 0); /* zap! */
1428 #ifdef MULTIPROCESSOR
1429 /*
1430 * XXXthorpej Redundant shootdown can happen
1431 * here if we're using APTE space.
1432 */
1433 #endif
1434 pmap_tlb_shootdown(curpcb->pcb_pmap,
1435 ((vaddr_t)ptes) + ptp->offset, opte,
1436 &cpumask);
1437 #ifdef MULTIPROCESSOR
1438 /*
1439 * Always shoot down the pmap's self-mapping
1440 * of the PTP.
1441 * XXXthorpej Redundant shootdown can happen
1442 * here if pmap == curpcb->pcb_pmap (not APTE
1443 * space).
1444 */
1445 pmap_tlb_shootdown(pmap,
1446 ((vaddr_t)PTE_BASE) + ptp->offset, opte,
1447 &cpumask);
1448 #endif
1449 pmap->pm_stats.resident_count--;
1450 if (pmap->pm_ptphint == ptp)
1451 pmap->pm_ptphint =
1452 TAILQ_FIRST(&pmap->pm_obj.memq);
1453 ptp->wire_count = 0;
1454 /* Postpone free to after shootdown. */
1455 uvm_pagerealloc(ptp, NULL, 0);
1456 TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq);
1457 }
1458 }
1459 pmap_tlb_shootnow(cpumask);
1460 pmap_unmap_ptes_pae(pmap); /* unlock pmap */
1461 PMAP_MAP_TO_HEAD_UNLOCK();
1462 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1463 TAILQ_REMOVE(&empty_ptps, ptp, listq);
1464 uvm_pagefree(ptp);
1465 }
1466 return;
1467 }
1468
1469 for (/* null */ ; sva < eva ; sva = blkendva) {
1470
1471 /* determine range of block */
1472 blkendva = i386_round_pdr(sva+1);
1473 if (blkendva > eva)
1474 blkendva = eva;
1475
1476 /*
1477 * XXXCDC: our PTE mappings should never be removed
1478 * with pmap_remove! if we allow this (and why would
1479 * we?) then we end up freeing the pmap's page
1480 * directory page (PDP) before we are finished using
1481  * it when we hit it in the recursive mapping. this
1482 * is BAD.
1483 *
1484 * long term solution is to move the PTEs out of user
1485 * address space. and into kernel address space (up
1486 * with APTE). then we can set VM_MAXUSER_ADDRESS to
1487 * be VM_MAX_ADDRESS.
1488 */
1489
1490 if (pdei(sva) == PDSLOT_PTE)
1491 /* XXXCDC: ugly hack to avoid freeing PDP here */
1492 continue;
1493
1494 if (!pmap_valid_entry(PDE(pmap, pdei(sva))))
1495 /* valid block? */
1496 continue;
1497
1498 /* PA of the PTP */
1499 ptppa = PDE(pmap, pdei(sva)) & PG_FRAME;
1500
1501 /* get PTP if non-kernel mapping */
1502 if (pmap == pmap_kernel()) {
1503 /* we never free kernel PTPs */
1504 ptp = NULL;
1505 } else {
1506 if (pmap->pm_ptphint &&
1507 VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) {
1508 ptp = pmap->pm_ptphint;
1509 } else {
1510 ptp = PHYS_TO_VM_PAGE(ptppa);
1511 #ifdef DIAGNOSTIC
1512 if (ptp == NULL)
1513 panic("pmap_remove: unmanaged PTP "
1514 "detected");
1515 #endif
1516 }
1517 }
1518 pmap_remove_ptes_pae(pmap, ptp, (vaddr_t)&ptes[atop(sva)],
1519 sva, blkendva, &cpumask);
1520
1521 /* if PTP is no longer being used, free it! */
1522 if (ptp && ptp->wire_count <= 1) {
1523 opte = i386_atomic_testset_uq(&PDE(pmap, pdei(sva)),0);
1524 #if defined(MULTIPROCESSOR)
1525 /*
1526 * XXXthorpej Redundant shootdown can happen here
1527 * if we're using APTE space.
1528 */
1529 #endif
1530 pmap_tlb_shootdown(curpcb->pcb_pmap,
1531 ((vaddr_t)ptes) + ptp->offset, opte, &cpumask);
1532 #if defined(MULTIPROCESSOR)
1533 /*
1534 * Always shoot down the pmap's self-mapping
1535 * of the PTP.
1536 * XXXthorpej Redundant shootdown can happen here
1537 * if pmap == curpcb->pcb_pmap (not APTE space).
1538 */
1539 pmap_tlb_shootdown(pmap,
1540 ((vaddr_t)PTE_BASE) + ptp->offset, opte, &cpumask);
1541 #endif
1542 pmap->pm_stats.resident_count--;
1543 if (pmap->pm_ptphint == ptp) /* update hint? */
1544 pmap->pm_ptphint =
1545 TAILQ_FIRST(&pmap->pm_obj.memq);
1546 ptp->wire_count = 0;
1547 /* Postpone free to after shootdown. */
1548 uvm_pagerealloc(ptp, NULL, 0);
1549 TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq);
1550 }
1551 }
1552
1553 pmap_tlb_shootnow(cpumask);
1554 pmap_unmap_ptes_pae(pmap);
1555 PMAP_MAP_TO_HEAD_UNLOCK();
1556 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1557 TAILQ_REMOVE(&empty_ptps, ptp, listq);
1558 uvm_pagefree(ptp);
1559 }
1560 }
1561
1562 /*
1563 * pmap_page_remove: remove a managed vm_page from all pmaps that map it
1564 *
1565 * => we set pv_head => pmap locking
1566 * => R/M bits are sync'd back to attrs
1567 */
1568
1569 void
1570 pmap_page_remove_pae(struct vm_page *pg)
1571 {
1572 int bank, off;
1573 struct pv_head *pvh;
1574 struct pv_entry *pve;
1575 pt_entry_t *ptes, opte;
1576 int32_t cpumask = 0;
1577 TAILQ_HEAD(, vm_page) empty_ptps;
1578 struct vm_page *ptp;
1579
1580 /* XXX: vm_page should either contain pv_head or have a pointer to it */
1581 bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
1582 if (bank == -1) {
1583 printf("pmap_page_remove: unmanaged page?\n");
1584 return;
1585 }
1586
1587 pvh = &vm_physmem[bank].pmseg.pvhead[off];
1588 if (pvh->pvh_list == NULL) {
1589 return;
1590 }
1591
1592 TAILQ_INIT(&empty_ptps);
1593
1594 /* set pv_head => pmap locking */
1595 PMAP_HEAD_TO_MAP_LOCK();
1596
1597 /* XXX: needed if we hold head->map lock? */
1598 simple_lock(&pvh->pvh_lock);
1599
1600 for (pve = pvh->pvh_list ; pve != NULL ; pve = pve->pv_next) {
1601 ptes = pmap_map_ptes_pae(pve->pv_pmap); /* locks pmap */
1602
1603 #ifdef DIAGNOSTIC
1604 if (pve->pv_va >= uvm.pager_sva && pve->pv_va < uvm.pager_eva)
1605 printf("pmap_page_remove: found pager VA on pv_list\n");
1606 if (pve->pv_ptp && (PDE(pve->pv_pmap,
1607 pdei(pve->pv_va)) & PG_FRAME) !=
1608 VM_PAGE_TO_PHYS(pve->pv_ptp)) {
1609 printf("pmap_page_remove: pg=%p: va=%lx, pv_ptp=%p\n",
1610 pg, pve->pv_va, pve->pv_ptp);
1611 printf("pmap_page_remove: PTP's phys addr: "
1612 "actual=%llx, recorded=%llx\n",
1613 (PDE(pve->pv_pmap, pdei(pve->pv_va)) &
1614 PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp));
1615 panic("pmap_page_remove: mapped managed page has "
1616 "invalid pv_ptp field");
1617 }
1618 #endif
1619
1620 opte = ptes[atop(pve->pv_va)];
1621 ptes[atop(pve->pv_va)] = 0; /* zap! */
1622
1623 if (opte & PG_W)
1624 pve->pv_pmap->pm_stats.wired_count--;
1625 pve->pv_pmap->pm_stats.resident_count--;
1626
1627 /* Shootdown only if referenced */
1628 if (opte & PG_U)
1629 pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte,
1630 &cpumask);
1631
1632 /* sync R/M bits */
1633 vm_physmem[bank].pmseg.attrs[off] |= (opte & (PG_U|PG_M));
1634
1635 /* update the PTP reference count. free if last reference. */
1636 if (pve->pv_ptp) {
1637 pve->pv_ptp->wire_count--;
1638 if (pve->pv_ptp->wire_count <= 1) {
1639 /*
1640 * Do we have to shootdown the page just to
1641 * get the pte out of the TLB ?
1642 */
1643 if(!(opte & PG_U))
1644 pmap_tlb_shootdown(pve->pv_pmap,
1645 pve->pv_va, opte, &cpumask);
1646
1647 opte = i386_atomic_testset_uq(&PDE(pve->pv_pmap,
1648 pdei(pve->pv_va)), 0);
1649 pmap_tlb_shootdown(curpcb->pcb_pmap,
1650 ((vaddr_t)ptes) + pve->pv_ptp->offset,
1651 opte, &cpumask);
1652 #if defined(MULTIPROCESSOR)
1653 /*
1654 * Always shoot down the other pmap's
1655 * self-mapping of the PTP.
1656 */
1657 pmap_tlb_shootdown(pve->pv_pmap,
1658 ((vaddr_t)PTE_BASE) + pve->pv_ptp->offset,
1659 opte, &cpumask);
1660 #endif
1661 pve->pv_pmap->pm_stats.resident_count--;
1662 /* update hint? */
1663 if (pve->pv_pmap->pm_ptphint == pve->pv_ptp)
1664 pve->pv_pmap->pm_ptphint =
1665 TAILQ_FIRST(&pve->pv_pmap->pm_obj.memq);
1666 pve->pv_ptp->wire_count = 0;
1667 /* Postpone free to after shootdown. */
1668 uvm_pagerealloc(pve->pv_ptp, NULL, 0);
1669 TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp,
1670 listq);
1671 }
1672 }
1673 pmap_unmap_ptes_pae(pve->pv_pmap); /* unlocks pmap */
1674 }
1675 pmap_free_pvs(NULL, pvh->pvh_list);
1676 pvh->pvh_list = NULL;
1677 simple_unlock(&pvh->pvh_lock);
1678 PMAP_HEAD_TO_MAP_UNLOCK();
1679 pmap_tlb_shootnow(cpumask);
1680 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
1681 TAILQ_REMOVE(&empty_ptps, ptp, listq);
1682 uvm_pagefree(ptp);
1683 }
1684 }
1685
1686 /*
1687 * p m a p a t t r i b u t e f u n c t i o n s
1688 * functions that test/change managed page's attributes
1689 * since a page can be mapped multiple times we must check each PTE that
1690 * maps it by going down the pv lists.
1691 */
1692
1693 /*
1694 * pmap_test_attrs: test a page's attributes
1695 *
1696 * => we set pv_head => pmap locking
1697 */
1698
1699 boolean_t
1700 pmap_test_attrs_pae(struct vm_page *pg, int testbits)
1701 {
1702 int bank, off;
1703 char *myattrs;
1704 struct pv_head *pvh;
1705 struct pv_entry *pve;
1706 pt_entry_t *ptes, pte;
1707
1708 /* XXX: vm_page should either contain pv_head or have a pointer to it */
1709 bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
1710 if (bank == -1) {
1711 printf("pmap_test_attrs: unmanaged page?\n");
1712 return(FALSE);
1713 }
1714
1715 /*
1716 * before locking: see if attributes are already set and if so,
1717 * return!
1718 */
1719
1720 myattrs = &vm_physmem[bank].pmseg.attrs[off];
1721 if (*myattrs & testbits)
1722 return(TRUE);
1723
1724 /* test to see if there is a list before bothering to lock */
1725 pvh = &vm_physmem[bank].pmseg.pvhead[off];
1726 if (pvh->pvh_list == NULL) {
1727 return(FALSE);
1728 }
1729
1730 /* nope, gonna have to do it the hard way */
1731 PMAP_HEAD_TO_MAP_LOCK();
1732 /* XXX: needed if we hold head->map lock? */
1733 simple_lock(&pvh->pvh_lock);
1734
1735 for (pve = pvh->pvh_list; pve != NULL && (*myattrs & testbits) == 0;
1736 pve = pve->pv_next) {
1737 ptes = pmap_map_ptes_pae(pve->pv_pmap);
1738 pte = ptes[atop(pve->pv_va)];
1739 pmap_unmap_ptes_pae(pve->pv_pmap);
1740 *myattrs |= pte;
1741 }
1742
1743 /*
1744 * note that we will exit the for loop with a non-null pve if
1745 * we have found the bits we are testing for.
1746 */
1747
1748 simple_unlock(&pvh->pvh_lock);
1749 PMAP_HEAD_TO_MAP_UNLOCK();
1750 return((*myattrs & testbits) != 0);
1751 }
1752
1753 /*
1754 * pmap_change_attrs: change a page's attributes
1755 *
1756 * => we set pv_head => pmap locking
1757 * => we return TRUE if we cleared one of the bits we were asked to
1758 */
1759
1760 boolean_t
1761 pmap_change_attrs_pae(struct vm_page *pg, int setbits, int clearbits)
1762 {
1763 u_int32_t result;
1764 int bank, off;
1765 struct pv_head *pvh;
1766 struct pv_entry *pve;
1767 pt_entry_t *ptes, npte, opte;
1768 char *myattrs;
1769 int32_t cpumask = 0;
1770
1771 /* XXX: vm_page should either contain pv_head or have a pointer to it */
1772 bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
1773 if (bank == -1) {
1774 printf("pmap_change_attrs: unmanaged page?\n");
1775 return(FALSE);
1776 }
1777
1778 PMAP_HEAD_TO_MAP_LOCK();
1779 pvh = &vm_physmem[bank].pmseg.pvhead[off];
1780 /* XXX: needed if we hold head->map lock? */
1781 simple_lock(&pvh->pvh_lock);
1782
1783 myattrs = &vm_physmem[bank].pmseg.attrs[off];
1784 result = *myattrs & clearbits;
1785 *myattrs = (*myattrs | setbits) & ~clearbits;
1786
1787 for (pve = pvh->pvh_list; pve != NULL; pve = pve->pv_next) {
1788 #ifdef DIAGNOSTIC
1789 if (!pmap_valid_entry(PDE(pve->pv_pmap, pdei(pve->pv_va))))
1790 panic("pmap_change_attrs: mapping without PTP "
1791 "detected");
1792 #endif
1793
1794 ptes = pmap_map_ptes_pae(pve->pv_pmap); /* locks pmap */
1795 npte = ptes[atop(pve->pv_va)];
1796 result |= (npte & clearbits);
1797 npte = (npte | setbits) & ~(pt_entry_t)clearbits;
1798 if (ptes[atop(pve->pv_va)] != npte) {
1799 opte = i386_atomic_testset_uq(&ptes[atop(pve->pv_va)],
1800 npte);
1801 pmap_tlb_shootdown(pve->pv_pmap,
1802 pve->pv_va, opte, &cpumask);
1803 }
1804 pmap_unmap_ptes_pae(pve->pv_pmap); /* unlocks pmap */
1805 }
1806
1807 simple_unlock(&pvh->pvh_lock);
1808 PMAP_HEAD_TO_MAP_UNLOCK();
1809 pmap_tlb_shootnow(cpumask);
1810
1811 return(result != 0);
1812 }
1813
1814 /*
1815 * p m a p p r o t e c t i o n f u n c t i o n s
1816 */
1817
1818 /*
1819 * pmap_page_protect: change the protection of all recorded mappings
1820 * of a managed page
1821 *
1822 * => NOTE: this is an inline function in pmap.h
1823 */
1824
1825 /* see pmap.h */
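/*
 * a rough sketch of that inline (the header is authoritative):
 *
 *	if ((prot & VM_PROT_WRITE) == 0) {
 *		if (prot & (VM_PROT_READ|VM_PROT_EXECUTE))
 *			(void)pmap_change_attrs(pg, PG_RO, PG_RW);
 *		else
 *			pmap_page_remove(pg);
 *	}
 */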
1826
1827 /*
1828 * pmap_protect: set the protection of the pages in a pmap
1829 *
1830 * => NOTE: this is an inline function in pmap.h
1831 */
1832
1833 /* see pmap.h */
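/*
 * likewise a rough sketch (see pmap.h for the real definition):
 *
 *	if ((prot & VM_PROT_WRITE) == 0) {
 *		if (prot & (VM_PROT_READ|VM_PROT_EXECUTE))
 *			pmap_write_protect(pmap, sva, eva, prot);
 *		else
 *			pmap_remove(pmap, sva, eva);
 *	}
 */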
1834
1835 /*
1836 * pmap_write_protect: write-protect pages in a pmap
1837 */
1838 void
1839 pmap_write_protect_pae(struct pmap *pmap, vaddr_t sva, vaddr_t eva,
1840 vm_prot_t prot)
1841 {
1842 pt_entry_t *ptes, *spte, *epte, opte, npte;
1843 vaddr_t blockend;
1844 u_int32_t md_prot;
1845 int32_t cpumask = 0;
1846
1847 ptes = pmap_map_ptes_pae(pmap); /* locks pmap */
1848
1849 /* should be ok, but just in case ... */
1850 sva &= PG_FRAME;
1851 eva &= PG_FRAME;
1852
1853 for (/* null */ ; sva < eva ; sva = blockend) {
1854
1855 blockend = (sva & PD_MASK) + NBPD;
1856 if (blockend > eva)
1857 blockend = eva;
1858
1859 /*
1860 * XXXCDC: our PTE mappings should never be write-protected!
1861 *
1862 * the long term solution is to move the PTEs out of user
1863 * address space and into kernel address space (up
1864 * with APTE); then we can set VM_MAXUSER_ADDRESS to
1865 * be VM_MAX_ADDRESS.
1866 */
1867
1868 /* XXXCDC: ugly hack to avoid freeing PDP here */
1869 if (pdei(sva) == PDSLOT_PTE)
1870 continue;
1871
1872 /* empty block? */
1873 if (!pmap_valid_entry(PDE(pmap, pdei(sva))))
1874 continue;
1875
1876 md_prot = protection_codes[prot];
1877 if (sva < VM_MAXUSER_ADDRESS)
1878 md_prot |= PG_u;
1879 else if (sva < VM_MAX_ADDRESS)
1880 /* XXX: write-prot our PTES? never! */
1881 md_prot |= (PG_u | PG_RW);
1882
1883 spte = &ptes[atop(sva)];
1884 epte = &ptes[atop(blockend)];
1885
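/*
 * the PTEs for [sva, blockend) are contiguous in the linear
 * PTE window, so they can be walked with a simple pointer.
 */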
1886 for (/* null */; spte < epte ; spte++, sva += PAGE_SIZE) {
1887
1888 if (!pmap_valid_entry(*spte)) /* no mapping? */
1889 continue;
1890
1891 npte = (*spte & ~(pt_entry_t)PG_PROT) | md_prot;
1892
1893 if (npte != *spte) {
1894 pmap_exec_account(pmap, sva, *spte, npte);
1895 opte = *spte;
1896 *spte = npte;
1897 pmap_tlb_shootdown(pmap, sva, opte, &cpumask);
1898 }
1899 }
1900 }
1901
1902 pmap_tlb_shootnow(cpumask);
1903 pmap_unmap_ptes_pae(pmap); /* unlocks pmap */
1904 }
1905
1906 /*
1907 * end of protection functions
1908 */
1909
1910 /*
1911 * pmap_unwire: clear the wired bit in the PTE
1912 *
1913 * => mapping should already be in map
1914 */
1915
1916 void
1917 pmap_unwire_pae(struct pmap *pmap, vaddr_t va)
1918 {
1919 pt_entry_t *ptes;
1920
1921 if (pmap_valid_entry(PDE(pmap, pdei(va)))) {
1922 ptes = pmap_map_ptes_pae(pmap); /* locks pmap */
1923
1924 #ifdef DIAGNOSTIC
1925 if (!pmap_valid_entry(ptes[atop(va)]))
1926 panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
1927 #endif
1928 if ((ptes[atop(va)] & PG_W) != 0) {
1929 ptes[atop(va)] &= ~PG_W;
1930 pmap->pm_stats.wired_count--;
1931 }
1932 #ifdef DIAGNOSTIC
1933 else {
1934 printf("pmap_unwire: wiring for pmap %p va 0x%lx "
1935 "didn't change!\n", pmap, va);
1936 }
1937 #endif
1938 pmap_unmap_ptes_pae(pmap); /* unlocks map */
1939 }
1940 #ifdef DIAGNOSTIC
1941 else {
1942 panic("pmap_unwire: invalid PDE");
1943 }
1944 #endif
1945 }
1946
1947 /*
1948 * pmap_copy: copy mappings from one pmap to another
1949 *
1950 * => optional function
1951 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
1952 */
1953
1954 /*
1955 * defined as macro in pmap.h
1956 */
1957
1958 /*
1959 * pmap_enter: enter a mapping into a pmap
1960 *
1961 * => must be done "now" ... no lazy-evaluation
1962 * => we set pmap => pv_head locking
1963 */
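/*
 * (returns 0 on success; with PMAP_CANFAIL the allocation failures
 *  below return ENOMEM instead of panicking.)
 */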
1964
1965 int
1966 pmap_enter_pae(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
1967 int flags)
1968 {
1969 pt_entry_t *ptes, opte, npte;
1970 struct vm_page *ptp;
1971 struct pv_head *pvh;
1972 struct pv_entry *pve;
1973 int bank, off, error;
1974 boolean_t wired = (flags & PMAP_WIRED) != 0;
1975
1976 #ifdef DIAGNOSTIC
1977 /* sanity check: totally out of range? */
1978 if (va >= VM_MAX_KERNEL_ADDRESS)
1979 panic("pmap_enter: too big");
1980
1981 if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
1982 panic("pmap_enter: trying to map over PDP/APDP!");
1983
1984 /* sanity check: kernel PTPs should already have been pre-allocated */
1985 if (va >= VM_MIN_KERNEL_ADDRESS &&
1986 !pmap_valid_entry(PDE(pmap, pdei(va))))
1987 panic("pmap_enter: missing kernel PTP!");
1988 #endif
1989
1990 /* get lock */
1991 PMAP_MAP_TO_HEAD_LOCK();
1992
1993 /*
1994 * map in ptes and get a pointer to our PTP (unless we are the kernel)
1995 */
1996
1997 ptes = pmap_map_ptes_pae(pmap); /* locks pmap */
1998 if (pmap == pmap_kernel()) {
1999 ptp = NULL;
2000 } else {
2001 ptp = pmap_get_ptp_pae(pmap, pdei(va), FALSE);
2002 if (ptp == NULL) {
2003 if (flags & PMAP_CANFAIL) {
2004 error = ENOMEM;
2005 goto out;
2006 }
2007 panic("pmap_enter: get ptp failed");
2008 }
2009 }
2010 opte = ptes[atop(va)]; /* old PTE */
2011
2012 /*
2013 * is there currently a valid mapping at our VA?
2014 */
2015
2016 if (pmap_valid_entry(opte)) {
2017
2018 /*
2019 * first, update pm_stats. resident count will not
2020 * change since we are replacing/changing a valid
2021 * mapping. wired count might change...
2022 */
2023
2024 if (wired && (opte & PG_W) == 0)
2025 pmap->pm_stats.wired_count++;
2026 else if (!wired && (opte & PG_W) != 0)
2027 pmap->pm_stats.wired_count--;
2028
2029 /*
2030 * is the currently mapped PA the same as the one we
2031 * want to map?
2032 */
2033
2034 if ((opte & PG_FRAME) == pa) {
2035
2036 /* if this is on the PVLIST, sync R/M bit */
2037 if (opte & PG_PVLIST) {
2038 bank = vm_physseg_find(atop(pa), &off);
2039 #ifdef DIAGNOSTIC
2040 if (bank == -1)
2041 panic("pmap_enter: same pa PG_PVLIST "
2042 "mapping with unmanaged page "
2043 "pa = 0x%lx (0x%lx)", pa,
2044 atop(pa));
2045 #endif
2046 pvh = &vm_physmem[bank].pmseg.pvhead[off];
2047 simple_lock(&pvh->pvh_lock);
2048 vm_physmem[bank].pmseg.attrs[off] |= opte;
2049 simple_unlock(&pvh->pvh_lock);
2050 } else {
2051 pvh = NULL; /* ensure !PG_PVLIST */
2052 }
2053 goto enter_now;
2054 }
2055
2056 /*
2057 * changing PAs: we must remove the old one first
2058 */
2059
2060 /*
2061 * if current mapping is on a pvlist,
2062 * remove it (sync R/M bits)
2063 */
2064
2065 if (opte & PG_PVLIST) {
2066 bank = vm_physseg_find(atop(opte & PG_FRAME), &off);
2067 #ifdef DIAGNOSTIC
2068 if (bank == -1)
2069 panic("pmap_enter: PG_PVLIST mapping with "
2070 "unmanaged page "
2071 "pa = 0x%lx (0x%lx)", pa, atop(pa));
2072 #endif
2073 pvh = &vm_physmem[bank].pmseg.pvhead[off];
2074 simple_lock(&pvh->pvh_lock);
2075 pve = pmap_remove_pv(pvh, pmap, va);
2076 vm_physmem[bank].pmseg.attrs[off] |= opte;
2077 simple_unlock(&pvh->pvh_lock);
2078 } else {
2079 pve = NULL;
2080 }
2081 } else { /* opte not valid */
2082 pve = NULL;
2083 pmap->pm_stats.resident_count++;
2084 if (wired)
2085 pmap->pm_stats.wired_count++;
2086 if (ptp)
2087 ptp->wire_count++; /* count # of valid entries */
2088 }
2089
2090 /*
2091 * at this point pm_stats has been updated. pve is either NULL
2092 * or points to a now-free pv_entry structure (the latter case is
2093 * if we called pmap_remove_pv above).
2094 *
2095 * if this entry is to be on a pvlist, enter it now.
2096 */
2097
2098 bank = vm_physseg_find(atop(pa), &off);
2099 if (pmap_initialized && bank != -1) {
2100 pvh = &vm_physmem[bank].pmseg.pvhead[off];
2101 if (pve == NULL) {
2102 pve = pmap_alloc_pv(pmap, ALLOCPV_NEED);
2103 if (pve == NULL) {
2104 if (flags & PMAP_CANFAIL) {
2105 error = ENOMEM;
2106 goto out;
2107 }
2108 panic("pmap_enter: no pv entries available");
2109 }
2110 }
2111 /* lock pvh when adding */
2112 pmap_enter_pv(pvh, pve, pmap, va, ptp);
2113 } else {
2114
2115 /* new mapping is not PG_PVLIST. free pve if we've got one */
2116 pvh = NULL; /* ensure !PG_PVLIST */
2117 if (pve)
2118 pmap_free_pv(pmap, pve);
2119 }
2120
2121 enter_now:
2122 /*
2123 * at this point pvh is !NULL if we want the PG_PVLIST bit set
2124 */
2125
2126 npte = pa | protection_codes[prot] | PG_V;
2127 pmap_exec_account(pmap, va, opte, npte);
2128 if (pvh)
2129 npte |= PG_PVLIST;
2130 if (wired)
2131 npte |= PG_W;
2132 if (va < VM_MAXUSER_ADDRESS)
2133 npte |= PG_u;
2134 else if (va < VM_MAX_ADDRESS)
2135 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */
2136 if (pmap == pmap_kernel())
2137 npte |= pmap_pg_g;
2138
2139 ptes[atop(va)] = npte; /* zap! */
2140
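/*
 * only flush if the new PTE differs from the old one, ignoring
 * PG_U/PG_M which the MMU may have set behind our back.
 */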
2141 if ((opte & ~(pt_entry_t)(PG_M|PG_U)) != npte) {
2142 #ifdef MULTIPROCESSOR
2143 int32_t cpumask = 0;
2144
2145 pmap_tlb_shootdown(pmap, va, opte, &cpumask);
2146 pmap_tlb_shootnow(cpumask);
2147 #else
2148 /* Don't bother deferring in the single CPU case. */
2149 if (pmap_is_curpmap(pmap))
2150 pmap_update_pg(va);
2151 #endif
2152 }
2153
2154 error = 0;
2155
2156 out:
2157 pmap_unmap_ptes_pae(pmap);
2158 PMAP_MAP_TO_HEAD_UNLOCK();
2159 return error;
2160 }
2161
2162 /*
2163 * pmap_growkernel: increase usage of KVM space
2164 *
2165 * => we allocate new PTPs for the kernel and install them in all
2166 * the pmaps on the system.
2167 */
2168
2169 vaddr_t
2170 pmap_growkernel_pae(vaddr_t maxkvaddr)
2171 {
2172 extern int nkpde;
2173 struct pmap *kpm = pmap_kernel(), *pm;
2174 int needed_kpde; /* needed number of kernel PTPs */
2175 int s;
2176 paddr_t ptaddr;
2177
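/*
 * each kernel PTP maps NBPD bytes of KVA; round the request up
 * to a whole number of PTPs.
 */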
2178 needed_kpde = (int)(maxkvaddr - VM_MIN_KERNEL_ADDRESS + (NBPD-1))
2179 / NBPD;
2180 if (needed_kpde <= nkpde)
2181 goto out; /* we are OK */
2182
2183 /*
2184 * whoops! we need to add kernel PTPs
2185 */
2186
2187 s = splhigh(); /* to be safe */
2188 simple_lock(&kpm->pm_obj.vmobjlock);
2189
2190 for (/*null*/ ; nkpde < needed_kpde ; nkpde++) {
2191
2192 if (uvm.page_init_done == FALSE) {
2193
2194 /*
2195 * we're growing the kernel pmap early (from
2196 * uvm_pageboot_alloc()). this case must be
2197 * handled a little differently.
2198 */
2199
2200 if (uvm_page_physget(&ptaddr) == FALSE)
2201 panic("pmap_growkernel: out of memory");
2202 pmap_zero_phys(ptaddr);
2203
2204 PDE(kpm, PDSLOT_KERN + nkpde) = ptaddr | PG_RW | PG_V;
2205
2206 /* count PTP as resident */
2207 kpm->pm_stats.resident_count++;
2208 continue;
2209 }
2210
2211 /*
2212 * THIS *MUST* BE CODED SO AS TO WORK IN THE
2213 * pmap_initialized == FALSE CASE! WE MAY BE
2214 * INVOKED WHILE pmap_init() IS RUNNING!
2215 */
2216
2217 while (!pmap_alloc_ptp_pae(kpm, PDSLOT_KERN + nkpde, FALSE))
2218 uvm_wait("pmap_growkernel");
2219
2220 /* PG_u not for kernel */
2221 PDE(kpm, PDSLOT_KERN + nkpde) &= ~PG_u;
2222
2223 /* distribute new kernel PTP to all active pmaps */
2224 simple_lock(&pmaps_lock);
2225 LIST_FOREACH(pm, &pmaps, pm_list) {
2226 PDE(pm, PDSLOT_KERN + nkpde) =
2227 PDE(kpm, PDSLOT_KERN + nkpde);
2228 }
2229 simple_unlock(&pmaps_lock);
2230 }
2231
2232 simple_unlock(&kpm->pm_obj.vmobjlock);
2233 splx(s);
2234
2235 out:
2236 return (VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD));
2237 }
2238
2239 #ifdef DEBUG
2240 void pmap_dump_pae(struct pmap *, vaddr_t, vaddr_t);
2241
2242 /*
2243 * pmap_dump: dump all the mappings from a pmap
2244 *
2245 * => caller should not be holding any pmap locks
2246 */
2247
2248 void
2249 pmap_dump_pae(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
2250 {
2251 pt_entry_t *ptes, *pte;
2252 vaddr_t blkendva;
2253
2254 /*
2255 * if end is out of range truncate.
2256 * if (end == start) update to max.
2257 */
2258
2259 if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
2260 eva = VM_MAXUSER_ADDRESS;
2261
2262 /*
2263 * we lock in the pmap => pv_head direction
2264 */
2265
2266 PMAP_MAP_TO_HEAD_LOCK();
2267 ptes = pmap_map_ptes_pae(pmap); /* locks pmap */
2268
2269 /*
2270 * dumping a range of pages: we dump in PTP sized blocks (2MB in PAE)
2271 */
2272
2273 for (/* null */ ; sva < eva ; sva = blkendva) {
2274
2275 /* determine range of block */
2276 blkendva = i386_round_pdr(sva+1);
2277 if (blkendva > eva)
2278 blkendva = eva;
2279
2280 /* valid block? */
2281 if (!pmap_valid_entry(PDE(pmap, pdei(sva))))
2282 continue;
2283
2284 pte = &ptes[atop(sva)];
2285 for (/* null */; sva < blkendva ; sva += NBPG, pte++) {
2286 if (!pmap_valid_entry(*pte))
2287 continue;
2288 printf("va %#lx -> pa %#llx (pte=%#llx)\n",
2289 sva, *pte & PG_FRAME, *pte);
2290 }
2291 }
2292 pmap_unmap_ptes_pae(pmap);
2293 PMAP_MAP_TO_HEAD_UNLOCK();
2294 }
2295 #endif