diff --git a/sys/amd64/conf/X1 b/sys/amd64/conf/X1
index 88e58e4..0d86db4 100644
--- a/sys/amd64/conf/X1
+++ b/sys/amd64/conf/X1
@@ -3,6 +3,9 @@
 options	PREEMPTION
 options	SMP
 options	HWPMC_HOOKS
 device	acpi
+options	ACPI_DMAR
+options	MAXCPU=128
+options	MAXMEMDOM=128
 ident	X1
 nooptions	INCLUDE_CONFIG_FILE
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index bda9722..14c5134 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -87,7 +87,7 @@
  * largest physical address that is accessible by ISA DMA is split
  * into two PHYSSEG entries.
  */
-#define	VM_PHYSSEG_MAX		31
+#define	VM_PHYSSEG_MAX		63
 
 /*
  * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 8093dd0..4db377d 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -323,6 +323,7 @@ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
 #ifdef INVARIANTS
 static int bq_len[BUFFER_QUEUES];
 #endif
+static int bqgen;
 
 /*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
@@ -475,6 +476,7 @@ bufspacewakeup(void)
 		    on & ~VFS_BIO_NEED_BUFSPACE))
 			break;
 	}
+	atomic_add_int(&bqgen, 1);
 	if (need_wakeup)
 		wakeup(__DEVOLATILE(void *, &needsbuffer));
 	rw_runlock(&nblock);
@@ -558,6 +560,7 @@ bufcountadd(struct buf *bp)
 		if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask))
 			break;
 	}
+	atomic_add_int(&bqgen, 1);
 	if (need_wakeup)
 		wakeup(__DEVOLATILE(void *, &needsbuffer));
 	rw_runlock(&nblock);
@@ -2084,14 +2087,12 @@ allocbufkva(struct buf *bp, int maxsize, int gbflags)
  */
 static void
 getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
-    int defrag)
+    int defrag, int bqgen_old)
 {
 	struct thread *td;
 	char *waitmsg;
 	int cnt, error, flags, norunbuf, wait;
 
-	mtx_assert(&bqclean, MA_OWNED);
-
 	if (defrag) {
 		flags = VFS_BIO_NEED_BUFSPACE;
 		waitmsg = "nbufkv";
@@ -2103,7 +2104,6 @@ getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
 		flags = VFS_BIO_NEED_ANY;
 	}
 	atomic_set_int(&needsbuffer, flags);
-	mtx_unlock(&bqclean);
 
 	bd_speedup();	/* heeeelp */
 	if ((gbflags & GB_NOWAIT_BD) != 0)
@@ -2142,6 +2142,14 @@ getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
 			if ((needsbuffer & flags) == 0)
 				break;
 		}
+		/*
+		 * We did not find a buffer on the queues, but the queues
+		 * were modified behind us and we might not have noticed.
+		 * Avoid sleeping to ensure that we are not blocked
+		 * forever without a thread that can wake us up.
+		 */
+		if (bqgen != bqgen_old)
+			break;
 		error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
 		    (PRIBIO + 4) | slpflag, waitmsg, slptimeo);
 		if (error != 0)
@@ -2253,7 +2261,6 @@ restart:
 	 * where we cannot backup.
	 */
 	nbp = NULL;
-	mtx_lock(&bqclean);
 	if (!defrag && unmapped) {
 		nqindex = QUEUE_EMPTY;
 		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
@@ -2334,35 +2341,35 @@ restart:
 				break;
 			}
 		}
-		/*
-		 * If we are defragging then we need a buffer with
-		 * b_kvasize != 0.  XXX this situation should no longer
-		 * occur, if defrag is non-zero the buffer's b_kvasize
-		 * should also be non-zero at this point.  XXX
-		 */
-		if (defrag && bp->b_kvasize == 0) {
-			printf("Warning: defrag empty buffer %p\n", bp);
-			continue;
-		}
 
 		/*
 		 * Start freeing the bp.  This is somewhat involved.  nbp
 		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
 		 */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
-			continue;
+			goto restart;
 		/*
 		 * BKGRDINPROG can only be set with the buf and bufobj
 		 * locks both held.  We tolerate a race to clear it here.
 		 */
-		if (bp->b_vflags & BV_BKGRDINPROG) {
+		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
+		    bp->b_qindex != qindex) {
 			BUF_UNLOCK(bp);
-			continue;
+			goto restart;
 		}
 
-		KASSERT(bp->b_qindex == qindex,
-		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
+		/*
+		 * If we are defragging then we need a buffer with
+		 * b_kvasize != 0.  XXX this situation should no longer
+		 * occur, if defrag is non-zero the buffer's b_kvasize
+		 * should also be non-zero at this point.  XXX
+		 */
+		if (defrag && bp->b_kvasize == 0) {
+			printf("Warning: defrag empty buffer %p\n", bp);
+			continue;
+		}
+		mtx_lock(&bqclean);
 
 		bremfreel(bp);
 		mtx_unlock(&bqclean);
 		/*
@@ -2438,7 +2445,7 @@ getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
     int gbflags)
 {
 	struct buf *bp;
-	int defrag, metadata;
+	int defrag, metadata, bqgen_old;
 
 	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
 	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
@@ -2460,8 +2467,10 @@ getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
 	atomic_add_int(&getnewbufcalls, 1);
 	atomic_subtract_int(&getnewbufrestarts, 1);
 restart:
+	bqgen_old = bqgen;
 	bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
 	    GB_KVAALLOC)) == GB_UNMAPPED, metadata);
+	mtx_assert(&bqclean, MA_NOTOWNED);
 	if (bp != NULL)
 		defrag = 0;
 
@@ -2472,12 +2481,13 @@ restart:
 	 * Generally we are sleeping due to insufficient buffer space.
 	 */
 	if (bp == NULL) {
-		mtx_assert(&bqclean, MA_OWNED);
-		getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
+		if (bqgen != bqgen_old) {
+			goto restart;
+		}
+		getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag,
+		    bqgen_old);
 		mtx_assert(&bqclean, MA_NOTOWNED);
 	} else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
-		mtx_assert(&bqclean, MA_NOTOWNED);
-
 		bfreekva(bp);
 		bp->b_flags |= B_UNMAPPED;
 		bp->b_kvabase = bp->b_data = unmapped_buf;
@@ -2486,8 +2496,6 @@ restart:
 		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
 		atomic_add_int(&bufreusecnt, 1);
 	} else {
-		mtx_assert(&bqclean, MA_NOTOWNED);
-
 		/*
 		 * We finally have a valid bp.  We aren't quite out of the
 		 * woods, we still have to reserve kva space.  In order
diff --git a/sys/sys/mman.h b/sys/sys/mman.h
index e89bee3..6020de7 100644
--- a/sys/sys/mman.h
+++ b/sys/sys/mman.h
@@ -95,6 +95,7 @@
 #ifdef __LP64__
 #define	MAP_32BIT	 0x00080000 /* map in the low 2GB of address space */
 #endif
+#define	MAP_SHARED_PHYS	 0x00100000
 
 /*
  * Request specific alignment (n == log2 of the desired alignment).
diff --git a/sys/vm/phys_pager.c b/sys/vm/phys_pager.c
index 9e98006..d7061a7 100644
--- a/sys/vm/phys_pager.c
+++ b/sys/vm/phys_pager.c
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 
 /* list of phys pager objects */
 static struct pagerlst phys_pager_object_list;
@@ -64,7 +65,8 @@ phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t foff, struct ucred *cred)
 {
 	vm_object_t object, object1;
-	vm_pindex_t pindex;
+	vm_pindex_t oosize, pindex, reserv;
+	boolean_t new_obj;
 
 	/*
 	 * Offset should be page aligned.
@@ -73,6 +75,7 @@ phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
 		return (NULL);
 
 	pindex = OFF_TO_IDX(foff + PAGE_MASK + size);
+	new_obj = FALSE;
 
 	if (handle != NULL) {
 		mtx_lock(&phys_pager_mtx);
@@ -90,29 +93,58 @@ phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
 			mtx_lock(&phys_pager_mtx);
 			object = vm_pager_object_lookup(&phys_pager_object_list,
 			    handle);
-			if (object != NULL) {
+			if (object == NULL) {
 				/*
-				 * We raced with other thread while
-				 * allocating object.
+				 * Otherwise, we raced with another
+				 * thread while allocating the object.
 				 */
-				if (pindex > object->size)
-					object->size = pindex;
-			} else {
 				object = object1;
 				object1 = NULL;
 				object->handle = handle;
-				TAILQ_INSERT_TAIL(&phys_pager_object_list, object,
-				    pager_object_list);
+				TAILQ_INSERT_TAIL(&phys_pager_object_list,
+				    object, pager_object_list);
+				new_obj = TRUE;
 			}
-		} else {
-			if (pindex > object->size)
-				object->size = pindex;
 		}
 		mtx_unlock(&phys_pager_mtx);
 		vm_object_deallocate(object1);
 	} else {
 		object = vm_object_allocate(OBJT_PHYS, pindex);
+		new_obj = TRUE;
+	}
+	VM_OBJECT_WLOCK(object);
+	oosize = new_obj ? 0 : object->size;
+	if (pindex > object->size)
+		object->size = pindex;
+	if (object->cred != NULL || cred != NULL) {
+		if (object->cred == NULL) {
+			KASSERT(object->charge == 0,
+			    ("PHYS object with non-zero charge"));
+		} else {
+			/* XXXKIB assert cred == object->cred ? */
+			cred = object->cred;
+			KASSERT(object->charge == 0 ||
+			    (object->flags & OBJ_WIRECOUNT) != 0,
+			    ("charged phys_obj but no OBJ_WIRECOUNT"));
+		}
+		reserv = pindex - oosize;
+		if (reserv != 0) {
+			if (atomic_fetchadd_int(&vm_cnt.v_wire_count, reserv) +
+			    reserv > vm_page_max_wired ||
+			    !swap_reserve_by_cred(ptoa(reserv), cred)) {
+				atomic_subtract_int(&vm_cnt.v_wire_count,
+				    reserv);
+				object->size = oosize;
+				VM_OBJECT_WUNLOCK(object);
+				vm_object_deallocate(object);
+				return (NULL);
+			}
+			vm_object_set_flag(object, OBJ_WIRECOUNT);
+			object->cred = cred;
+			object->charge += ptoa(reserv);
+		}
 	}
+	VM_OBJECT_WUNLOCK(object);
 	return (object);
 }
 
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index cedc59c..91f1195 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -290,6 +290,34 @@ RetryFault:;
 		goto RetryFault;
 	}
 
+	if ((fault_flags & (VM_FAULT_CHANGE_WIRING | VM_FAULT_DIRTY)) == 0 &&
+	    fs.first_object->type == OBJT_PHYS) {
+		VM_OBJECT_RLOCK(fs.first_object);
+		fs.first_m = vm_page_lookup(fs.first_object, fs.first_pindex);
+		if (fs.first_m != NULL) {
+			if (m_hold != NULL) {
+				*m_hold = fs.first_m;
+				vm_page_lock(fs.first_m);
+				vm_page_hold(fs.first_m);
+				vm_page_unlock(fs.first_m);
+			}
+			VM_OBJECT_RUNLOCK(fs.first_object);
+			/*
+			 * The object cannot go away after unlock
+			 * since it has a non-vnode type and the map
+			 * references it.
+			 */
+
+			pmap_enter(fs.map->pmap, vaddr, fault_type, fs.first_m,
+			    prot, wired);
+			vm_map_lookup_done(fs.map, fs.entry);
+			curthread->td_ru.ru_minflt++;
+			return (KERN_SUCCESS);
+		} else {
+			VM_OBJECT_RUNLOCK(fs.first_object);
+		}
+	}
+
 	/*
 	 * Make a reference to this object to prevent its disposal while we
 	 * are messing with it.  Once we have the reference, the map is free
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index 68ae0d2..80f2236 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -2847,7 +2847,7 @@ static void
 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_object_t object;
-	vm_pindex_t offidxstart, offidxend, count, size1;
+	vm_pindex_t offidxstart, offidxend, count;
 	vm_ooffset_t size;
 
 	vm_map_entry_unlink(map, entry);
@@ -2886,15 +2886,8 @@ vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
 			swap_pager_freespace(object, offidxstart, count);
 		if (offidxend >= object->size &&
 		    offidxstart < object->size) {
-			size1 = object->size;
-			object->size = offidxstart;
-			if (object->cred != NULL) {
-				size1 -= object->size;
-				KASSERT(object->charge >= ptoa(size1),
-				    ("vm_map_entry_delete: object->charge < 0"));
-				swap_release_by_cred(ptoa(size1), object->cred);
-				object->charge -= ptoa(size1);
-			}
+			vm_object_discharge(object, ptoa(object->size -
+			    offidxstart));
 		}
 	}
 	VM_OBJECT_WUNLOCK(object);
diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c
index 1ae7189..71bed93 100644
--- a/sys/vm/vm_mmap.c
+++ b/sys/vm/vm_mmap.c
@@ -95,6 +95,15 @@ int old_mlock = 0;
 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
     "Do not apply RLIMIT_MEMLOCK on mlockall");
 
+static int shared_anon_use_phys;
+SYSCTL_INT(_vm, OID_AUTO, shm_anon_use_phys, CTLFLAG_RW,
+    &shared_anon_use_phys, 0,
+    "Enable/Disable locking of shared anonymous memory pages in core");
+static int shared_anon_phys_preload;
+SYSCTL_INT(_vm, OID_AUTO, shm_anon_phys_preload, CTLFLAG_RW,
+    &shared_anon_phys_preload, 0,
+    "");
+
 #ifdef MAP_32BIT
 #define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
 #endif
@@ -246,6 +255,9 @@ sys_mmap(td, uap)
 	}
 	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
 		return (EINVAL);
+	if ((flags & MAP_SHARED_PHYS) != 0 &&
+	    (flags & (MAP_SHARED | MAP_ANON)) != (MAP_SHARED | MAP_ANON))
+		return (EINVAL);
 
 	/*
 	 * Align the file position to a page boundary,
@@ -320,7 +332,11 @@ sys_mmap(td, uap)
 		 * Mapping blank space is trivial.
 		 */
 		handle = NULL;
-		handle_type = OBJT_DEFAULT;
+		if ((flags & (MAP_SHARED | MAP_SHARED_PHYS)) ==
+		    (MAP_SHARED | MAP_SHARED_PHYS) && shared_anon_use_phys)
+			handle_type = OBJT_PHYS;
+		else
+			handle_type = OBJT_DEFAULT;
 		maxprot = VM_PROT_ALL;
 		cap_maxprot = VM_PROT_ALL;
 	} else {
@@ -1508,16 +1524,18 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 	objtype_t handle_type, void *handle,
 	vm_ooffset_t foff)
 {
-	boolean_t fitit;
-	vm_object_t object = NULL;
-	struct thread *td = curthread;
+	vm_object_t object;
+	vm_page_t m;
+	struct thread *td;
+	vm_pindex_t pi, psize;
 	int docow, error, findspace, rv;
-	boolean_t writecounted;
+	boolean_t fitit, writecounted;
 
 	if (size == 0)
 		return (0);
 
 	size = round_page(size);
+	td = curthread;
 
 	if (map == &td->td_proc->p_vmspace->vm_map) {
 		PROC_LOCK(td->td_proc);
@@ -1569,6 +1587,7 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 		fitit = FALSE;
 	}
 	writecounted = FALSE;
+	object = NULL;
 
 	/*
 	 * Lookup/allocate object.
@@ -1586,6 +1605,7 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
 		    handle, foff, &object);
 		break;
+	case OBJT_PHYS:
 	case OBJT_DEFAULT:
 		if (handle == NULL) {
 			error = 0;
@@ -1599,7 +1619,30 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 	if (error)
 		return (error);
 	if (flags & MAP_ANON) {
-		object = NULL;
+		if (handle_type == OBJT_PHYS) {
+			object = vm_pager_allocate(OBJT_PHYS, NULL, size,
+			    prot, 0, td->td_ucred);
+			if (object == NULL)
+				return (ENOMEM);
+			if ((flags & MAP_PREFAULT_READ) != 0 &&
+			    shared_anon_phys_preload) {
+				psize = OFF_TO_IDX(size);
+				VM_OBJECT_WLOCK(object);
+				for (pi = 0; pi < psize; pi++) {
+					m = vm_page_grab(object, pi,
+					    VM_ALLOC_NOBUSY | VM_ALLOC_ZERO);
+					m->valid = VM_PAGE_BITS_ALL;
+					if (should_yield()) {
+						VM_OBJECT_WUNLOCK(object);
+						kern_yield(PRI_USER);
+						VM_OBJECT_WLOCK(object);
+					}
+				}
+				VM_OBJECT_WUNLOCK(object);
+			}
+		} else {
+			object = NULL;
+		}
 		docow = 0;
 		/*
 		 * Unnamed anonymous regions always start at 0.
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index c3af9d7..6e38492 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -659,6 +659,28 @@ doterm:
 }
 
 /*
+ * Release the allocation and wired charges.
+ */
+void
+vm_object_discharge(vm_object_t object, vm_ooffset_t discharge)
+{
+
+	if ((object->flags & OBJ_WIRECOUNT) != 0) {
+		KASSERT(object->cred != NULL, ("wirecount and no cred"));
+		atomic_subtract_int(&vm_cnt.v_wire_count,
+		    atop(discharge));
+	}
+	if (object->cred != NULL) {
+		KASSERT(object->type == OBJT_DEFAULT ||
+		    object->type == OBJT_SWAP || object->type == OBJT_PHYS,
+		    ("%s: non-swap obj %p has cred", __func__, object));
+		KASSERT(object->charge >= discharge, ("XXX"));
+		swap_release_by_cred(discharge, object->cred);
+		object->charge -= discharge;
+	}
+}
+
+/*
  * vm_object_destroy removes the object from the global object list
  * and frees the space for the object.
  */
@@ -673,15 +695,9 @@ vm_object_destroy(vm_object_t object)
 	TAILQ_REMOVE(&vm_object_list, object, object_list);
 	mtx_unlock(&vm_object_list_mtx);
 
-	/*
-	 * Release the allocation charge.
-	 */
+	vm_object_discharge(object, object->charge);
+	KASSERT(object->charge == 0, ("vm_object_discharge failure"));
 	if (object->cred != NULL) {
-		KASSERT(object->type == OBJT_DEFAULT ||
-		    object->type == OBJT_SWAP,
-		    ("%s: non-swap obj %p has cred", __func__, object));
-		swap_release_by_cred(object->charge, object->cred);
-		object->charge = 0;
 		crfree(object->cred);
 		object->cred = NULL;
 	}
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
index 1f6c7c2..bdaf284 100644
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -185,6 +185,7 @@ struct vm_object {
 #define	OBJ_DEAD	0x0008		/* dead objects (during rundown) */
 #define	OBJ_NOSPLIT	0x0010		/* dont split this object */
 #define	OBJ_PIPWNT	0x0040		/* paging in progress wanted */
+#define	OBJ_WIRECOUNT	0x0080		/* accounted pages as wired */
 #define	OBJ_MIGHTBEDIRTY 0x0100		/* object might be dirty, only for vnode */
 #define	OBJ_TMPFS_NODE	0x0200		/* object belongs to tmpfs VREG node */
 #define	OBJ_COLORED	0x1000		/* pg_color is defined */
@@ -272,6 +273,7 @@ boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t,
 void vm_object_collapse (vm_object_t);
 void vm_object_deallocate (vm_object_t);
 void vm_object_destroy (vm_object_t);
+void vm_object_discharge(vm_object_t object, vm_ooffset_t discharge);
 void vm_object_terminate (vm_object_t);
 void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 4e30a3f..2569fbc 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -334,6 +334,9 @@ vm_page_startup(vm_offset_t vaddr)
 	mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
 	for (i = 0; i < PA_LOCK_COUNT; i++)
 		mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
+
+	TUNABLE_INT_FETCH("vm.ndomain_split_factor", &vm_ndom_split_factor);
+	vm_ndomains *= vm_ndom_split_factor;
 	for (i = 0; i < vm_ndomains; i++)
 		vm_page_domain_init(&vm_dom[i]);
 
diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c
index 1401b62..c2b80c1 100644
--- a/sys/vm/vm_phys.c
+++ b/sys/vm/vm_phys.c
@@ -73,6 +73,7 @@ _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
 
 struct mem_affinity *mem_affinity;
 int vm_ndomains = 1;
+int vm_ndom_split_factor = 1;
 
 struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
 int vm_phys_nsegs;
@@ -328,6 +329,30 @@ _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
 }
 
 static void
+vm_phys_create_seg_split(vm_paddr_t start, vm_paddr_t end, int flind,
+    int hwdomain)
+{
+	vm_paddr_t split_end, split_start, split_sz;
+	int domain;
+
+	domain = hwdomain * vm_ndom_split_factor;
+	split_sz = roundup2((end - start) / vm_ndom_split_factor, PAGE_SIZE);
+	/* Do not split into segments smaller than 1MB. */
+	if (split_sz < 1024 * 1024) {
+		_vm_phys_create_seg(start, end, flind, domain);
+		return;
+	}
+
+	for (split_start = start; split_start < end;) {
+		split_end = split_start + split_sz;
+		if (end < split_end)
+			split_end = end;
+		_vm_phys_create_seg(split_start, split_end, flind, domain++);
+		split_start = split_end;
+	}
+}
+
+static void
 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
 {
 	int i;
@@ -346,11 +371,11 @@ vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
 			panic("No affinity info for start %jx",
 			    (uintmax_t)start);
 		if (mem_affinity[i].end >= end) {
-			_vm_phys_create_seg(start, end, flind,
+			vm_phys_create_seg_split(start, end, flind,
 			    mem_affinity[i].domain);
 			break;
 		}
-		_vm_phys_create_seg(start, mem_affinity[i].end, flind,
+		vm_phys_create_seg_split(start, mem_affinity[i].end, flind,
 		    mem_affinity[i].domain);
 		start = mem_affinity[i].end;
 	}
diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h
index 6d94e07..8123beb 100644
--- a/sys/vm/vm_phys.h
+++ b/sys/vm/vm_phys.h
@@ -62,6 +62,7 @@ struct vm_phys_seg {
 
 extern struct mem_affinity *mem_affinity;
 extern int vm_ndomains;
+extern int vm_ndom_split_factor;
 
 extern struct vm_phys_seg vm_phys_segs[];
 extern int vm_phys_nsegs;
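
For reference, below is a minimal userland sketch of how the new mapping mode would be exercised. The MAP_SHARED_PHYS value (0x00100000) and the requirement that it be used together with MAP_SHARED | MAP_ANON are taken from the sys/mman.h and vm_mmap.c hunks above; the OBJT_PHYS path is only selected when the vm.shm_anon_use_phys sysctl is set to 1, and MAP_PREFAULT_READ only pre-populates the object when vm.shm_anon_phys_preload is also enabled. The test program itself is illustrative and not part of the patch.

/*
 * Sketch: map a shared anonymous region backed by the phys pager
 * (assumes the patched <sys/mman.h> is installed and
 * "sysctl vm.shm_anon_use_phys=1" has been run; otherwise the
 * kernel falls back to the default OBJT_DEFAULT backing).
 */
#include <sys/mman.h>
#include <sys/wait.h>

#include <err.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#ifndef MAP_SHARED_PHYS
#define	MAP_SHARED_PHYS	0x00100000	/* value from the sys/mman.h hunk */
#endif

int
main(void)
{
	char *p;
	size_t len;
	pid_t pid;

	len = 4 * 1024 * 1024;

	/*
	 * MAP_SHARED_PHYS is only accepted together with
	 * MAP_SHARED | MAP_ANON (see the check added to sys_mmap()).
	 * MAP_PREFAULT_READ asks for the object to be pre-populated
	 * when vm.shm_anon_phys_preload is enabled.
	 */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_SHARED | MAP_ANON | MAP_SHARED_PHYS | MAP_PREFAULT_READ,
	    -1, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");

	pid = fork();
	if (pid == -1)
		err(1, "fork");
	if (pid == 0) {
		/* Child: the region is shared with the parent. */
		strcpy(p, "written by child");
		_exit(0);
	}
	if (waitpid(pid, NULL, 0) == -1)
		err(1, "waitpid");
	printf("parent sees: %s\n", p);

	if (munmap(p, len) == -1)
		err(1, "munmap");
	return (0);
}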