diff --git a/sys/amd64/conf/X1 b/sys/amd64/conf/X1
index 88e58e4..0d86db4 100644
--- a/sys/amd64/conf/X1
+++ b/sys/amd64/conf/X1
@@ -3,6 +3,9 @@
 options 	PREEMPTION
 options 	SMP
 options 	HWPMC_HOOKS
 device		acpi
+options 	ACPI_DMAR
+options 	MAXCPU=128
+options 	MAXMEMDOM=128
 ident		X1
 nooptions	INCLUDE_CONFIG_FILE
diff --git a/sys/amd64/include/param.h b/sys/amd64/include/param.h
index ad6c733..bd4f5bd 100644
--- a/sys/amd64/include/param.h
+++ b/sys/amd64/include/param.h
@@ -118,6 +118,7 @@
 #define	PML4MASK	(NBPML4-1)
 
 #define	MAXPAGESIZES	3	/* maximum number of supported page sizes */
+#define	MAXPAGESIZE	(1 * 1024 * 1024 * 1024)
 
 #define	IOPAGES	2		/* pages of i/o permission bitmap */
 
 /*
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 19feecd..d928972 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -324,6 +324,7 @@ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
 #ifdef INVARIANTS
 static int bq_len[BUFFER_QUEUES];
 #endif
+static int bqgen;
 
 /*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
@@ -476,6 +477,7 @@ bufspacewakeup(void)
 		    on & ~VFS_BIO_NEED_BUFSPACE))
 			break;
 	}
+	atomic_add_int(&bqgen, 1);
 	if (need_wakeup)
 		wakeup(__DEVOLATILE(void *, &needsbuffer));
 	rw_runlock(&nblock);
@@ -559,6 +561,7 @@ bufcountadd(struct buf *bp)
 		if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask))
 			break;
 	}
+	atomic_add_int(&bqgen, 1);
 	if (need_wakeup)
 		wakeup(__DEVOLATILE(void *, &needsbuffer));
 	rw_runlock(&nblock);
@@ -2088,14 +2091,12 @@ allocbufkva(struct buf *bp, int maxsize, int gbflags)
  */
 static void
 getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
-    int defrag)
+    int defrag, int bqgen_old)
 {
 	struct thread *td;
 	char *waitmsg;
 	int cnt, error, flags, norunbuf, wait;
 
-	mtx_assert(&bqclean, MA_OWNED);
-
 	if (defrag) {
 		flags = VFS_BIO_NEED_BUFSPACE;
 		waitmsg = "nbufkv";
@@ -2107,7 +2108,6 @@ getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
 		flags = VFS_BIO_NEED_ANY;
 	}
 	atomic_set_int(&needsbuffer, flags);
-	mtx_unlock(&bqclean);
 
 	bd_speedup();	/* heeeelp */
 	if ((gbflags & GB_NOWAIT_BD) != 0)
@@ -2146,6 +2146,14 @@ getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
 		if ((needsbuffer & flags) == 0)
 			break;
 	}
+	/*
+	 * We did not find a buffer on the queues, but the queues
+	 * could have been modified behind us without us noticing.
+	 * Avoid sleeping, to ensure that we are not blocked forever
+	 * without a thread that can wake us up.
+	 */
+	if (bqgen != bqgen_old)
+		break;
 	error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
 	    (PRIBIO + 4) | slpflag, waitmsg, slptimeo);
 	if (error != 0)
@@ -2257,7 +2265,6 @@ restart:
 	 * where we cannot backup.
 	 */
 	nbp = NULL;
-	mtx_lock(&bqclean);
 	if (!defrag && unmapped) {
 		nqindex = QUEUE_EMPTY;
 		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
@@ -2338,35 +2345,35 @@ restart:
 				break;
 			}
 		}
-		/*
-		 * If we are defragging then we need a buffer with
-		 * b_kvasize != 0.  XXX this situation should no longer
-		 * occur, if defrag is non-zero the buffer's b_kvasize
-		 * should also be non-zero at this point.  XXX
-		 */
-		if (defrag && bp->b_kvasize == 0) {
-			printf("Warning: defrag empty buffer %p\n", bp);
-			continue;
-		}
 
 		/*
 		 * Start freeing the bp.  This is somewhat involved.  nbp
 		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
 		 */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
-			continue;
+			goto restart;
 
 		/*
 		 * BKGRDINPROG can only be set with the buf and bufobj
 		 * locks both held.  We tolerate a race to clear it here.
 		 */
-		if (bp->b_vflags & BV_BKGRDINPROG) {
+		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
+		    bp->b_qindex != qindex) {
 			BUF_UNLOCK(bp);
-			continue;
+			goto restart;
 		}
-		KASSERT(bp->b_qindex == qindex,
-		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
 
+		/*
+		 * If we are defragging then we need a buffer with
+		 * b_kvasize != 0.  XXX this situation should no longer
+		 * occur, if defrag is non-zero the buffer's b_kvasize
+		 * should also be non-zero at this point.  XXX
+		 */
+		if (defrag && bp->b_kvasize == 0) {
+			printf("Warning: defrag empty buffer %p\n", bp);
+			continue;
+		}
+
+		mtx_lock(&bqclean);
 		bremfreel(bp);
 		mtx_unlock(&bqclean);
 
 		/*
@@ -2442,7 +2449,7 @@ getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
     int gbflags)
 {
 	struct buf *bp;
-	int defrag, metadata;
+	int defrag, metadata, bqgen_old;
 
 	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
 	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
@@ -2464,8 +2471,10 @@ getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
 	atomic_add_int(&getnewbufcalls, 1);
 	atomic_subtract_int(&getnewbufrestarts, 1);
 restart:
+	bqgen_old = bqgen;
 	bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
 	    GB_KVAALLOC)) == GB_UNMAPPED, metadata);
+	mtx_assert(&bqclean, MA_NOTOWNED);
 	if (bp != NULL)
 		defrag = 0;
@@ -2476,12 +2485,13 @@ restart:
 	 * Generally we are sleeping due to insufficient buffer space.
 	 */
 	if (bp == NULL) {
-		mtx_assert(&bqclean, MA_OWNED);
-		getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
+		if (bqgen != bqgen_old) {
+			goto restart;
+		}
+		getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag,
+		    bqgen_old);
 		mtx_assert(&bqclean, MA_NOTOWNED);
 	} else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
-		mtx_assert(&bqclean, MA_NOTOWNED);
-
 		bfreekva(bp);
 		bp->b_flags |= B_UNMAPPED;
 		bp->b_kvabase = bp->b_data = unmapped_buf;
@@ -2490,8 +2500,6 @@ restart:
 		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
 		atomic_add_int(&bufreusecnt, 1);
 	} else {
-		mtx_assert(&bqclean, MA_NOTOWNED);
-
 		/*
 		 * We finally have a valid bp.  We aren't quite out of the
 		 * woods, we still have to reserve kva space.  In order
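
The vfs_bio.c changes above replace the "scan with bqclean held" pattern with a lockless scan guarded by a generation counter: getnewbuf() samples bqgen before calling getnewbuf_scan(), and getnewbuf_bufd_help() breaks out instead of sleeping when bqgen moved, because the wakeup it would wait for may already have fired. A minimal userspace sketch of the same idea, with purely illustrative names (queue_gen, queue_changed, wait_for_change are not kernel APIs):

/*
 * Sketch of the bqgen-style generation counter: producers bump the counter
 * on every queue change, consumers refuse to sleep if it moved between the
 * lockless scan and taking the sleep lock.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint queue_gen;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;

/* Producer side: mirrors bufspacewakeup()/bufcountadd() bumping bqgen. */
void
queue_changed(void)
{
	atomic_fetch_add(&queue_gen, 1);
	pthread_mutex_lock(&lock);
	pthread_cond_broadcast(&cv);
	pthread_mutex_unlock(&lock);
}

/* Consumer side: mirrors getnewbuf() sampling bqgen before the scan. */
bool
wait_for_item(bool (*scan)(void))
{
	unsigned gen_old;

	for (;;) {
		gen_old = atomic_load(&queue_gen);
		if (scan())			/* lockless, like getnewbuf_scan() */
			return (true);
		pthread_mutex_lock(&lock);
		/* Do not sleep if the queues changed behind the scan. */
		if (atomic_load(&queue_gen) != gen_old) {
			pthread_mutex_unlock(&lock);
			continue;		/* retry the scan instead */
		}
		pthread_cond_wait(&cv, &lock);
		pthread_mutex_unlock(&lock);
	}
}

The kernel version additionally kicks bd_speedup() and bounds the wait with rw_sleep() timeouts; the sketch only shows the "check the generation before sleeping" step that prevents an unbounded sleep with no waker.
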
diff --git a/sys/sys/mman.h b/sys/sys/mman.h
index f0e01b6..ead0a64 100644
--- a/sys/sys/mman.h
+++ b/sys/sys/mman.h
@@ -95,6 +95,7 @@
 #ifdef __LP64__
 #define	MAP_32BIT	0x00080000 /* map in the low 2GB of address space */
 #endif
+#define	MAP_SHARED_PHYS	0x00100000
 
 /*
  * Request specific alignment (n == log2 of the desired alignment).
diff --git a/sys/vm/phys_pager.c b/sys/vm/phys_pager.c
index 9e98006..d7061a7 100644
--- a/sys/vm/phys_pager.c
+++ b/sys/vm/phys_pager.c
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 
 /* list of phys pager objects */
 static struct pagerlst phys_pager_object_list;
@@ -64,7 +65,8 @@ phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t foff, struct ucred *cred)
 {
 	vm_object_t object, object1;
-	vm_pindex_t pindex;
+	vm_pindex_t oosize, pindex, reserv;
+	boolean_t new_obj;
 
 	/*
 	 * Offset should be page aligned.
@@ -73,6 +75,7 @@ phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
 		return (NULL);
 
 	pindex = OFF_TO_IDX(foff + PAGE_MASK + size);
+	new_obj = FALSE;
 
 	if (handle != NULL) {
 		mtx_lock(&phys_pager_mtx);
@@ -90,29 +93,58 @@ phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
 			mtx_lock(&phys_pager_mtx);
 			object = vm_pager_object_lookup(&phys_pager_object_list,
 			    handle);
-			if (object != NULL) {
+			if (object == NULL) {
 				/*
-				 * We raced with other thread while
-				 * allocating object.
+				 * Otherwise, we raced with another
+				 * thread while allocating the object.
 				 */
-				if (pindex > object->size)
-					object->size = pindex;
-			} else {
 				object = object1;
 				object1 = NULL;
 				object->handle = handle;
-				TAILQ_INSERT_TAIL(&phys_pager_object_list, object,
-				    pager_object_list);
+				TAILQ_INSERT_TAIL(&phys_pager_object_list,
+				    object, pager_object_list);
+				new_obj = TRUE;
 			}
-		} else {
-			if (pindex > object->size)
-				object->size = pindex;
 		}
 		mtx_unlock(&phys_pager_mtx);
 		vm_object_deallocate(object1);
 	} else {
 		object = vm_object_allocate(OBJT_PHYS, pindex);
+		new_obj = TRUE;
+	}
+	VM_OBJECT_WLOCK(object);
+	oosize = new_obj ? 0 : object->size;
+	if (pindex > object->size)
+		object->size = pindex;
+	if (object->cred != NULL || cred != NULL) {
+		if (object->cred == NULL) {
+			KASSERT(object->charge == 0,
+			    ("PHYS object with non-zero charge"));
+		} else {
+			/* XXXKIB assert cred == object->cred ? */
+			cred = object->cred;
+			KASSERT(object->charge == 0 ||
+			    (object->flags & OBJ_WIRECOUNT) != 0,
+			    ("charged phys_obj but no OBJ_WIRECOUNT"));
+		}
+		reserv = pindex - oosize;
+		if (reserv != 0) {
+			if (atomic_fetchadd_int(&vm_cnt.v_wire_count, reserv) +
+			    reserv > vm_page_max_wired ||
+			    !swap_reserve_by_cred(ptoa(reserv), cred)) {
+				atomic_subtract_int(&vm_cnt.v_wire_count,
+				    reserv);
+				object->size = oosize;
+				VM_OBJECT_WUNLOCK(object);
+				vm_object_deallocate(object);
+				return (NULL);
+			}
+			vm_object_set_flag(object, OBJ_WIRECOUNT);
+			object->cred = cred;
+			object->charge += ptoa(reserv);
+		}
 	}
+	VM_OBJECT_WUNLOCK(object);
 	return (object);
 }
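
The accounting added to phys_pager_alloc() charges only the pages by which the object grows: reserv = pindex - oosize pages are added to vm_cnt.v_wire_count and ptoa(reserv) bytes of swap are reserved against the credential, and both are rolled back if either limit check fails. A standalone sketch of the arithmetic, with an assumed 4 KiB page size and made-up numbers:

/* Illustration only; PAGE_SIZE, oosize and pindex are assumptions. */
#include <stdio.h>

#define	PAGE_SIZE	4096UL
#define	ptoa(x)		((x) * PAGE_SIZE)

int
main(void)
{
	unsigned long oosize = 4;		/* old object size, in pages */
	unsigned long pindex = 260;		/* size required by the new mapping */
	unsigned long reserv = pindex - oosize;	/* only the growth is charged */

	/* 256 pages are wired and 1 MiB of swap is reserved against the cred. */
	printf("wire %lu pages, reserve %lu bytes of swap\n", reserv,
	    ptoa(reserv));
	return (0);
}
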
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index b8d67bd..7bdeae2 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -2856,7 +2856,7 @@
 static void
 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_object_t object;
-	vm_pindex_t offidxstart, offidxend, count, size1;
+	vm_pindex_t offidxstart, offidxend, count;
 	vm_ooffset_t size;
 
 	vm_map_entry_unlink(map, entry);
@@ -2895,15 +2895,8 @@ vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
 			swap_pager_freespace(object, offidxstart, count);
 		if (offidxend >= object->size &&
 		    offidxstart < object->size) {
-			size1 = object->size;
-			object->size = offidxstart;
-			if (object->cred != NULL) {
-				size1 -= object->size;
-				KASSERT(object->charge >= ptoa(size1),
-				    ("vm_map_entry_delete: object->charge < 0"));
-				swap_release_by_cred(ptoa(size1), object->cred);
-				object->charge -= ptoa(size1);
-			}
+			vm_object_discharge(object, ptoa(object->size -
+			    offidxstart));
 		}
 	}
 	VM_OBJECT_WUNLOCK(object);
diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c
index 9506efe..bf9c08c 100644
--- a/sys/vm/vm_mmap.c
+++ b/sys/vm/vm_mmap.c
@@ -95,6 +95,15 @@ int old_mlock = 0;
 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
     "Do not apply RLIMIT_MEMLOCK on mlockall");
+static int shared_anon_use_phys;
+SYSCTL_INT(_vm, OID_AUTO, shm_anon_use_phys, CTLFLAG_RW,
+    &shared_anon_use_phys, 0,
+    "Enable/Disable locking of shared anonymous memory pages in core");
+static int shared_anon_phys_preload;
+SYSCTL_INT(_vm, OID_AUTO, shm_anon_phys_preload, CTLFLAG_RW,
+    &shared_anon_phys_preload, 0,
+    "");
+
 #ifdef MAP_32BIT
 #define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
 #endif
@@ -247,6 +256,7 @@ sys_mmap(td, uap)
 	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_RENAME |
 	    MAP_NORESERVE | MAP_HASSEMAPHORE | MAP_STACK | MAP_NOSYNC |
 	    MAP_ANON | MAP_EXCL | MAP_NOCORE | MAP_PREFAULT_READ |
+	    MAP_SHARED_PHYS |
 #ifdef MAP_32BIT
 	    MAP_32BIT |
 #endif
@@ -259,6 +269,9 @@ sys_mmap(td, uap)
 	if (prot != PROT_NONE && (prot & ~(PROT_READ | PROT_WRITE |
 	    PROT_EXEC)) != 0)
 		return (EINVAL);
+	if ((flags & MAP_SHARED_PHYS) != 0 &&
+	    (flags & (MAP_SHARED | MAP_ANON)) != (MAP_SHARED | MAP_ANON))
+		return (EINVAL);
 
 	/*
 	 * Align the file position to a page boundary,
@@ -333,7 +346,11 @@ sys_mmap(td, uap)
 		 * Mapping blank space is trivial.
 		 */
 		handle = NULL;
-		handle_type = OBJT_DEFAULT;
+		if ((flags & (MAP_SHARED | MAP_SHARED_PHYS)) ==
+		    (MAP_SHARED | MAP_SHARED_PHYS) && shared_anon_use_phys)
+			handle_type = OBJT_PHYS;
+		else
+			handle_type = OBJT_DEFAULT;
 		maxprot = VM_PROT_ALL;
 		cap_maxprot = VM_PROT_ALL;
 	} else {
@@ -1528,16 +1545,18 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
     objtype_t handle_type, void *handle, vm_ooffset_t foff)
 {
-	boolean_t fitit;
-	vm_object_t object = NULL;
-	struct thread *td = curthread;
+	vm_object_t object;
+	vm_page_t m;
+	struct thread *td;
+	vm_pindex_t pi, psize;
 	int docow, error, findspace, rv;
-	boolean_t writecounted;
+	boolean_t fitit, writecounted;
 
 	if (size == 0)
 		return (0);
 
 	size = round_page(size);
+	td = curthread;
 
 	if (map == &td->td_proc->p_vmspace->vm_map) {
 		PROC_LOCK(td->td_proc);
@@ -1589,6 +1608,7 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 		fitit = FALSE;
 	}
 	writecounted = FALSE;
+	object = NULL;
 
 	/*
 	 * Lookup/allocate object.
@@ -1606,6 +1626,7 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
 		    handle, foff, &object);
 		break;
+	case OBJT_PHYS:
 	case OBJT_DEFAULT:
 		if (handle == NULL) {
 			error = 0;
@@ -1619,7 +1640,30 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 	if (error)
 		return (error);
 	if (flags & MAP_ANON) {
-		object = NULL;
+		if (handle_type == OBJT_PHYS) {
+			object = vm_pager_allocate(OBJT_PHYS, NULL, size,
+			    prot, 0, td->td_ucred);
+			if (object == NULL)
+				return (ENOMEM);
+			if ((flags & MAP_PREFAULT_READ) != 0 &&
+			    shared_anon_phys_preload) {
+				psize = OFF_TO_IDX(size);
+				VM_OBJECT_WLOCK(object);
+				for (pi = 0; pi < psize; pi++) {
+					m = vm_page_grab(object, pi,
+					    VM_ALLOC_NOBUSY | VM_ALLOC_ZERO);
+					m->valid = VM_PAGE_BITS_ALL;
+					if (should_yield()) {
+						VM_OBJECT_WUNLOCK(object);
+						kern_yield(PRI_USER);
+						VM_OBJECT_WLOCK(object);
+					}
+				}
+				VM_OBJECT_WUNLOCK(object);
+			}
+		} else {
+			object = NULL;
+		}
 		docow = 0;
 		/*
 		 * Unnamed anonymous regions always start at 0.
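
With the vm.shm_anon_use_phys sysctl enabled, a MAP_SHARED | MAP_ANON mapping that also passes the new MAP_SHARED_PHYS flag is backed by an OBJT_PHYS object, so its pages are wired-accounted and stay resident. A hypothetical userland use; the fallback #define mirrors the value added to sys/mman.h, and on an unpatched kernel the unknown flag is simply rejected with EINVAL:

#include <sys/mman.h>
#include <err.h>
#include <stddef.h>
#include <string.h>

#ifndef MAP_SHARED_PHYS			/* present only with this patch applied */
#define	MAP_SHARED_PHYS	0x00100000
#endif

int
main(void)
{
	size_t len = 16 * 1024 * 1024;
	char *p;

	/* MAP_SHARED_PHYS is only accepted together with MAP_SHARED | MAP_ANON. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_SHARED | MAP_ANON | MAP_SHARED_PHYS, -1, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	memset(p, 0, len);		/* backed by an OBJT_PHYS object */
	munmap(p, len);
	return (0);
}

Passing MAP_PREFAULT_READ in addition pre-populates the pages at mmap() time when vm.shm_anon_phys_preload is also set, as implemented in the vm_mmap() hunk above.
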
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index c3af9d7..6e38492 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -659,6 +659,28 @@ doterm:
 }
 
 /*
+ * Release the allocation and wired charges.
+ */
+void
+vm_object_discharge(vm_object_t object, vm_ooffset_t discharge)
+{
+
+	if ((object->flags & OBJ_WIRECOUNT) != 0) {
+		KASSERT(object->cred != NULL, ("wirecount and no cred"));
+		atomic_subtract_int(&vm_cnt.v_wire_count,
+		    atop(discharge));
+	}
+	if (object->cred != NULL) {
+		KASSERT(object->type == OBJT_DEFAULT ||
+		    object->type == OBJT_SWAP || object->type == OBJT_PHYS,
+		    ("%s: non-swap obj %p has cred", __func__, object));
+		KASSERT(object->charge >= discharge, ("XXX"));
+		swap_release_by_cred(discharge, object->cred);
+		object->charge -= discharge;
+	}
+}
+
+/*
  * vm_object_destroy removes the object from the global object list
 * and frees the space for the object.
 */
@@ -673,15 +695,9 @@ vm_object_destroy(vm_object_t object)
 	TAILQ_REMOVE(&vm_object_list, object, object_list);
 	mtx_unlock(&vm_object_list_mtx);
 
-	/*
-	 * Release the allocation charge.
-	 */
+	vm_object_discharge(object, object->charge);
+	KASSERT(object->charge == 0, ("vm_object_discharge failure"));
 	if (object->cred != NULL) {
-		KASSERT(object->type == OBJT_DEFAULT ||
-		    object->type == OBJT_SWAP,
-		    ("%s: non-swap obj %p has cred", __func__, object));
-		swap_release_by_cred(object->charge, object->cred);
-		object->charge = 0;
 		crfree(object->cred);
 		object->cred = NULL;
 	}
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index ab3c7d3..f3c1a38 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -185,6 +185,7 @@ struct vm_object {
 #define	OBJ_DEAD	0x0008		/* dead objects (during rundown) */
 #define	OBJ_NOSPLIT	0x0010		/* dont split this object */
 #define	OBJ_PIPWNT	0x0040		/* paging in progress wanted */
+#define	OBJ_WIRECOUNT	0x0080		/* accounted pages as wired */
 #define	OBJ_MIGHTBEDIRTY 0x0100		/* object might be dirty, only for vnode */
 #define	OBJ_TMPFS_NODE	0x0200		/* object belongs to tmpfs VREG node */
 #define	OBJ_COLORED	0x1000		/* pg_color is defined */
@@ -276,6 +277,7 @@ boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t,
 void vm_object_collapse (vm_object_t);
 void vm_object_deallocate (vm_object_t);
 void vm_object_destroy (vm_object_t);
+void vm_object_discharge(vm_object_t object, vm_ooffset_t discharge);
 void vm_object_terminate (vm_object_t);
 void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 28dd645..74172f4 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -334,6 +334,11 @@ vm_page_startup(vm_offset_t vaddr)
 	mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
 	for (i = 0; i < PA_LOCK_COUNT; i++)
 		mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
+
+	TUNABLE_INT_FETCH("vm.domain_split_factor", &vm_domain_split_factor);
+	vm_ndomains *= vm_domain_split_factor;
+	if (vm_ndomains > MAXMEMDOM)
+		panic("vm_ndomains %d > MAXMEMDOM %d", vm_ndomains, MAXMEMDOM);
 	for (i = 0; i < vm_ndomains; i++)
 		vm_page_domain_init(&vm_dom[i]);
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index f12b76c..a45648d 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -147,7 +147,7 @@ struct vm_page {
 	uint16_t hold_count;		/* page hold count (P) */
 	uint16_t flags;			/* page PG_* flags (P) */
 	uint8_t aflags;			/* access is atomic */
-	uint8_t oflags;			/* page VPO_* flags (O) */
+	uint8_t oflags;			/* page VPO_* flags (OM) */
 	uint8_t	queue;			/* page queue index (P,Q) */
 	int8_t psind;			/* pagesizes[] index (O) */
 	int8_t segind;
@@ -163,8 +163,9 @@ struct vm_page {
 /*
 * Page flags stored in oflags:
 *
- * Access to these page flags is synchronized by the lock on the object
- * containing the page (O).
+ * Access to these page flags is synchronized by the exclusive lock on
+ * the object containing the page, or by a combination of the shared
+ * object lock and the page lock (OM).
 *
 * Note: VPO_UNMANAGED (used by OBJT_DEVICE, OBJT_PHYS and OBJT_SG)
 * indicates that the page is not under PV management but
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index ca9d7f9..e690605 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -1717,6 +1717,8 @@ vm_pageout(void)
 	swap_pager_swap_init();
 #if MAXMEMDOM > 1
 	for (i = 1; i < vm_ndomains; i++) {
+		if (vm_dom[i].vmd_page_count == 0)
+			continue;
 		error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
 		    curproc, NULL, 0, 0, "dom%d", i);
 		if (error != 0) {
diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c
index be3d5be..b560a67 100644
--- a/sys/vm/vm_phys.c
+++ b/sys/vm/vm_phys.c
@@ -73,6 +73,7 @@ _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
 struct mem_affinity *mem_affinity;
 int vm_ndomains = 1;
+int vm_domain_split_factor = 1;
 
 struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
 int vm_phys_nsegs;
@@ -122,7 +123,8 @@ static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
     int order);
 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
     int domain);
-static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
+static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
+    boolean_t split);
 static int vm_phys_paddr_to_segind(vm_paddr_t pa);
 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
     int order);
@@ -327,16 +329,113 @@ _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
 	seg->free_queues = &vm_phys_free_queues[domain][flind];
 }
 
+static vm_paddr_t max_phys;
+
 static void
-vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
+vm_phys_init_maxphys(void)
 {
 	int i;
 
-	if (mem_affinity == NULL) {
+	if (max_phys != 0)
+		return;
+	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
+		if (phys_avail[i + 1] > max_phys)
+			max_phys = phys_avail[i + 1];
+	}
+}
+
+/*
+ * When the vm.domain_split_factor loader tunable is set to a value
+ * other than one, we split each physical segment reported by the
+ * platform into smaller chunks, up to vm_domain_split_factor
+ * sub-segments, each of which gets a dedicated vm_domain assigned.
+ *
+ * The split distributes the physical pages among domains, and each
+ * domain carries its own page queues and page queue locks.  On
+ * machines with large memory, this reduces contention on the page
+ * queue locks and increases parallelism in the threaded page daemon.
+ *
+ * vm_phys_create_seg_noaff_split() is used when the system firmware
+ * does not provide memory affinity information, in which case the
+ * distribution of physical memory among domains is unrestricted.
+ * The split is done in chunks whose size is obtained by dividing the
+ * maximum physical address by the split factor.
+ *
+ * vm_phys_create_seg_aff_split() works with memory affinity data and
+ * ensures that segments from different NUMA domains are never put
+ * into the same vm domain.
+ *
+ * The two functions use slightly different algorithms that are
+ * easier to handle when kept apart.
+ */
+static void
+vm_phys_create_seg_noaff_split(vm_paddr_t start, vm_paddr_t end, int flind,
+    boolean_t split)
+{
+	vm_paddr_t split_end, split_start, split_sz;
+	int domain;
+
+	if (!split || vm_domain_split_factor == 1) {
 		_vm_phys_create_seg(start, end, flind, 0);
 		return;
 	}
+	split_sz = roundup2(max_phys / vm_domain_split_factor, MAXPAGESIZE);
+printf("vm_phys_create_seg_noaff %jx %jx %jx %jx\n", (uintmax_t)start, (uintmax_t)end, (uintmax_t)split_sz, (uintmax_t)MAXPAGESIZE);
+	for (split_start = start; split_start < end; split_start = split_end) {
+		domain = split_start / split_sz;
+		split_end = split_start + split_sz;
+		split_end = roundup2(split_end, MAXPAGESIZE);
+		if (end < split_end)
+			split_end = end;
+printf("create_seg_noaff %jx %jx flind %d dom %d\n", (uintmax_t)split_start, (uintmax_t)split_end, flind, domain);
+		_vm_phys_create_seg(split_start, split_end, flind, domain);
+	}
+}
+
+static void
+vm_phys_create_seg_aff_split(vm_paddr_t start, vm_paddr_t end, int flind,
+    int hwdomain, boolean_t split)
+{
+	vm_paddr_t split_end, split_start, split_sz;
+	int domain = 0;
+
+	if (!split) {
+		_vm_phys_create_seg(start, end, flind, hwdomain);
+		return;
+	}
+
+	domain = hwdomain * vm_domain_split_factor;
+	split_sz = roundup2((end - start) / vm_domain_split_factor,
+	    MAXPAGESIZE);
+printf("vm_phys_create_seg_aff %jx %jx %jx %jx\n", (uintmax_t)start, (uintmax_t)end, (uintmax_t)split_sz, (uintmax_t)MAXPAGESIZE);
+
+	for (split_start = start; split_start < end; split_start = split_end) {
+		/*
+		 * When splitting, try to align the end of the domain on
+		 * the boundary of the largest superpage.  The roundup is
+		 * needed because start may not be aligned.
+		 */
+		split_end = split_start + split_sz;
+		split_end = roundup2(split_end, MAXPAGESIZE);
+		if (end < split_end)
+			split_end = end;
+printf("create_seg_aff %jx %jx flind %d dom %d\n", (uintmax_t)split_start, (uintmax_t)split_end, flind, domain);
+		_vm_phys_create_seg(split_start, split_end, flind, domain++);
+	}
+}
+
+static void
+vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
+    boolean_t split)
+{
+	int i;
+
+	if (mem_affinity == NULL) {
+		vm_phys_create_seg_noaff_split(start, end, flind, split);
+		return;
+	}
+
 	for (i = 0;; i++) {
 		if (mem_affinity[i].end == 0)
 			panic("Reached end of affinity info");
@@ -346,12 +445,12 @@ vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
 			panic("No affinity info for start %jx",
 			    (uintmax_t)start);
 		if (mem_affinity[i].end >= end) {
-			_vm_phys_create_seg(start, end, flind,
-			    mem_affinity[i].domain);
+			vm_phys_create_seg_aff_split(start, end, flind,
+			    mem_affinity[i].domain, split);
 			break;
 		}
-		_vm_phys_create_seg(start, mem_affinity[i].end, flind,
-		    mem_affinity[i].domain);
+		vm_phys_create_seg_aff_split(start, mem_affinity[i].end, flind,
+		    mem_affinity[i].domain, split);
 		start = mem_affinity[i].end;
 	}
 }
@@ -365,17 +464,19 @@ vm_phys_init(void)
 	struct vm_freelist *fl;
 	int dom, flind, i, oind, pind;
 
+	vm_phys_init_maxphys();
 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
#ifdef VM_FREELIST_ISADMA
 		if (phys_avail[i] < 16777216) {
 			if (phys_avail[i + 1] > 16777216) {
 				vm_phys_create_seg(phys_avail[i], 16777216,
-				    VM_FREELIST_ISADMA);
+				    VM_FREELIST_ISADMA, FALSE);
 				vm_phys_create_seg(16777216, phys_avail[i + 1],
-				    VM_FREELIST_DEFAULT);
+				    VM_FREELIST_DEFAULT, FALSE);
 			} else {
 				vm_phys_create_seg(phys_avail[i],
-				    phys_avail[i + 1], VM_FREELIST_ISADMA);
+				    phys_avail[i + 1], VM_FREELIST_ISADMA,
+				    FALSE);
 			}
 			if (VM_FREELIST_ISADMA >= vm_nfreelists)
 				vm_nfreelists = VM_FREELIST_ISADMA + 1;
@@ -385,19 +486,22 @@
+486,22 @@ vm_phys_init(void) if (phys_avail[i + 1] > VM_HIGHMEM_ADDRESS) { if (phys_avail[i] < VM_HIGHMEM_ADDRESS) { vm_phys_create_seg(phys_avail[i], - VM_HIGHMEM_ADDRESS, VM_FREELIST_DEFAULT); + VM_HIGHMEM_ADDRESS, VM_FREELIST_DEFAULT, + TRUE); vm_phys_create_seg(VM_HIGHMEM_ADDRESS, - phys_avail[i + 1], VM_FREELIST_HIGHMEM); + phys_avail[i + 1], VM_FREELIST_HIGHMEM, + TRUE); } else { vm_phys_create_seg(phys_avail[i], - phys_avail[i + 1], VM_FREELIST_HIGHMEM); + phys_avail[i + 1], VM_FREELIST_HIGHMEM, + TRUE); } if (VM_FREELIST_HIGHMEM >= vm_nfreelists) vm_nfreelists = VM_FREELIST_HIGHMEM + 1; } else #endif vm_phys_create_seg(phys_avail[i], phys_avail[i + 1], - VM_FREELIST_DEFAULT); + VM_FREELIST_DEFAULT, TRUE); } for (dom = 0; dom < vm_ndomains; dom++) { for (flind = 0; flind < vm_nfreelists; flind++) { diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h index 6d94e07..ce83dd7 100644 --- a/sys/vm/vm_phys.h +++ b/sys/vm/vm_phys.h @@ -62,6 +62,7 @@ struct vm_phys_seg { extern struct mem_affinity *mem_affinity; extern int vm_ndomains; +extern int vm_domain_split_factor; extern struct vm_phys_seg vm_phys_segs[]; extern int vm_phys_nsegs;