The mempolicy-related system calls are mainly set_mempolicy/get_mempolicy and mbind; they configure the task/process policy and the vma policy respectively.
The set_mempolicy() system call modifies the NUMA mem policy of the current process; child processes created afterwards inherit that policy. The API is:
#include <numaif.h>

long set_mempolicy(int mode, const unsigned long *nodemask, unsigned long maxnode);
Using this system call requires linking against the NUMA library: -lnuma.
Parameters:
- mode mainly includes MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE, MPOL_LOCAL and so on.
- The optional mode flags include MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES; MPOL_F_NUMA_BALANCING was added in kernel 5.12.
- These modes and flags are described in detail in 《linux内核那些事之mempolicy(1)》; a minimal usage sketch of set_mempolicy() follows below.
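The sketch below is my own illustration, not taken from the kernel sources; it assumes node 0 exists and uses MPOL_BIND to restrict all future allocations of the calling process (and of children forked afterwards) to that node:

/*
 * Minimal usage sketch (assumption: node 0 exists).
 * Build with: gcc set_policy.c -lnuma
 */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	unsigned long nodemask = 1UL << 0;	/* allow node 0 only */

	/* maxnode is the number of bits the kernel should read from nodemask */
	if (set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask) * 8) != 0) {
		perror("set_mempolicy");
		return 1;
	}

	/* Pages touched from now on come from node 0; children forked after
	 * this point inherit the policy. */
	char *buf = malloc(16 * 4096);
	memset(buf, 0, 16 * 4096);
	return 0;
}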
kernel_set_mempolicy is the kernel-side handler for the set_mempolicy system call:
/* Set the process memory policy */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
				 unsigned long maxnode)
{
	int err;
	nodemask_t nodes;
	unsigned short flags;

	flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if ((unsigned int)mode >= MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_set_mempolicy(mode, flags, &nodes);
}
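As the first two statements show, the optional flags travel in the upper bits of the mode argument and are stripped off with MPOL_MODE_FLAGS before validation. From user space they are simply ORed into mode; a hedged sketch of my own (nodes 0-1 assumed to exist):

/* Sketch: MPOL_F_STATIC_NODES is ORed into mode by the caller and separated
 * out again by kernel_set_mempolicy() via MPOL_MODE_FLAGS (nodes 0-1 assumed). */
#include <numaif.h>
#include <stdio.h>

int main(void)
{
	unsigned long nodes = 0x3;	/* interleave across nodes 0 and 1 */

	if (set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES,
			  &nodes, sizeof(nodes) * 8) != 0)
		perror("set_mempolicy");
	return 0;
}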
get_mempolicy is used to retrieve the mempolicy of the current process or of a specific memory address. The API is:
#include <numaif.h>

long get_mempolicy(int *mode, unsigned long *nodemask, unsigned long maxnode, void *addr, unsigned long flags);
Parameters:
- mode: output pointer; the retrieved policy mode is stored here.
- nodemask/maxnode: output node mask and the number of bits it can hold.
- addr: the address whose policy is queried when MPOL_F_ADDR is set; must be NULL otherwise.
- flags: 0, or a combination of MPOL_F_NODE, MPOL_F_ADDR and MPOL_F_MEMS_ALLOWED.
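A short usage sketch (my own illustration, following the man-page semantics of the flags) that queries the task policy and then the vma policy of an mmap()ed address:

/* Sketch: query the calling thread's task policy, then the vma policy of a
 * freshly mapped address (assumes the nodemask fits into one unsigned long). */
#include <numaif.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	int mode;
	unsigned long nodemask = 0;

	/* task/process policy: addr must be NULL when MPOL_F_ADDR is not set */
	if (get_mempolicy(&mode, &nodemask, sizeof(nodemask) * 8, NULL, 0) == 0)
		printf("task policy: mode=%d nodemask=0x%lx\n", mode, nodemask);

	/* vma policy of a specific address */
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (get_mempolicy(&mode, NULL, 0, p, MPOL_F_ADDR) == 0)
		printf("vma policy at %p: mode=%d\n", p, mode);
	return 0;
}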
kernel_get_mempolicy is the corresponding kernel handler for get_mempolicy:
/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr,
				unsigned long flags)
{
	int err;
	int uninitialized_var(pval);
	nodemask_t nodes;

	addr = untagged_addr(addr);

	if (nmask != NULL && maxnode < nr_node_ids)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}
It calls do_get_mempolicy to obtain the mempolicy result and then copies the mode and node mask back to user space.
do_get_mempolicy is the core handler of get_mempolicy:
/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL. We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, lookup_node()
			 * will drop the mmap_lock, so after calling
			 * lookup_node() only "pol" remains valid, "vma"
			 * is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = next_node_in(current->il_prev, pol->v.nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}
Depending on the flags passed in, the function retrieves the NUMA node information for the different cases; the individual branches are not analyzed in detail here.
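The two special flag paths can be exercised from user space with a small sketch (my own illustration; the actual values depend on the machine's cpuset and NUMA topology):

/* Sketch of the two special flag paths in do_get_mempolicy(): values depend on
 * the cpuset and NUMA topology of the machine this runs on. */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	unsigned long allowed = 0;
	int node;

	/* MPOL_F_MEMS_ALLOWED: nodemask is filled from cpuset_current_mems_allowed */
	if (get_mempolicy(NULL, &allowed, sizeof(allowed) * 8, NULL,
			  MPOL_F_MEMS_ALLOWED) == 0)
		printf("mems_allowed = 0x%lx\n", allowed);

	/* MPOL_F_NODE | MPOL_F_ADDR: returns the node holding the page at addr */
	char *p = malloc(4096);
	memset(p, 0, 4096);			/* make sure the page is allocated */
	if (get_mempolicy(&node, NULL, 0, p, MPOL_F_NODE | MPOL_F_ADDR) == 0)
		printf("page at %p resides on node %d\n", (void *)p, node);
	return 0;
}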
The mbind system call sets the mempolicy at the vma level, so a NUMA node policy can be applied to just one memory region:
#include <numaif.h>

long mbind(void *addr, unsigned long len, int mode, const unsigned long *nodemask, unsigned long maxnode, unsigned int flags);
Parameters: addr and len describe the address range the policy applies to; mode, nodemask and maxnode have the same meaning as for set_mempolicy; flags controls how pages already allocated in the range are handled:
When flags contains MPOL_MF_STRICT and mode is not MPOL_DEFAULT, the NUMA policy must be strictly honored: if physical pages already allocated in the range do not conform to the configured node policy, EIO is returned.
When flags contains MPOL_MF_MOVE, the kernel attempts to migrate already-allocated pages that do not conform to the configured nodes onto nodes that do conform; only pages used exclusively by this process are moved.
When flags contains MPOL_MF_MOVE_ALL, non-conforming pages are migrated even if other processes are also using them; this requires CAP_SYS_NICE, as checked in do_mbind below.
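A user-space sketch of these flags (my own illustration; it assumes a machine with at least two NUMA nodes): fault in an anonymous region, then bind it to node 1 and ask the kernel to migrate the pages that already exist:

/* Minimal sketch (assumes at least 2 NUMA nodes): bind an anonymous region to
 * node 1 and migrate any already-allocated pages. Build with: gcc test.c -lnuma */
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0, len);			/* fault the pages in first */

	unsigned long nodemask = 1UL << 1;	/* node 1 only */
	if (mbind(buf, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
		  MPOL_MF_MOVE | MPOL_MF_STRICT) != 0) {
		perror("mbind");		/* EIO if pages could not be moved */
		return 1;
	}
	return 0;
}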
The do_mbind function is the core handler of the mbind system call:
static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	int ret;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (flags & MPOL_MF_LAZY)
		new->flags |= MPOL_F_MOF;

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		err = migrate_prep();
		if (err)
			goto mpol_out;
	}
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			mmap_write_lock(mm);
			task_lock(current);
			err = mpol_set_nodemask(new, nmask, scratch);
			task_unlock(current);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	ret = queue_pages_range(mm, start, end, nmask,
				flags | MPOL_MF_INVERT, &pagelist);

	if (ret < 0) {
		err = ret;
		goto up_out;
	}

	err = mbind_range(mm, start, end, new);

	if (!err) {
		int nr_failed = 0;

		if (!list_empty(&pagelist)) {
			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
			nr_failed = migrate_pages(&pagelist, new_page, NULL,
				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
			if (nr_failed)
				putback_movable_pages(&pagelist);
		}

		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
			err = -EIO;
	} else {
up_out:
		if (!list_empty(&pagelist))
			putback_movable_pages(&pagelist);
	}

	mmap_write_unlock(mm);
mpol_out:
	mpol_put(new);
	return err;
}