前文列表《OpenStack Nova 高性能虚拟机之 NUMA 架构亲和》
《OpenStack Nova 高性能虚拟机之 CPU 绑定》
《OpenStack 高性能虚拟机之大页内存》
《OpenStack 虚拟机启动流程 UML 分析》
numa_fit_instance_to_host
def numa_fit_instance_to_host(
        host_topology, instance_topology, limits=None,
        pci_requests=None, pci_stats=None):
    """Fit the instance topology onto the host topology.

    Given a host, instance topology, and (optional) limits, attempt to
    fit instance cells onto all permutations of host cells by calling
    the _fit_instance_cell method, and return a new InstanceNUMATopology
    with its cell ids set to host cell ids of the first successful
    permutation, or None.

    :param host_topology: objects.NUMATopology object to fit an
                          instance on
    :param instance_topology: objects.InstanceNUMATopology to be fitted
    :param limits: objects.NUMATopologyLimits that defines limits
    :param pci_requests: instance pci_requests
    :param pci_stats: pci_stats for the host
    :returns: objects.InstanceNUMATopology with its cell IDs set to host
              cell ids of the first successful permutation, or None
    """
    if not (host_topology and instance_topology):
        LOG.debug("Require both a host and instance NUMA topology to "
                  "fit instance on host.")
        return
    elif len(host_topology) < len(instance_topology):
        LOG.debug("There are not enough NUMA nodes on the system to schedule "
                  "the instance correctly. Required: %(required)s, actual: "
                  "%(actual)s",
                  {'required': len(instance_topology),
                   'actual': len(host_topology)})
        return

    emulator_threads_policy = None
    if 'emulator_threads_policy' in instance_topology:
        emulator_threads_policy = instance_topology.emulator_threads_policy

    host_cells = host_topology.cells

    # If PCI device(s) are not required, prefer host cells that don't have
    # devices attached. Presence of a given numa_node in a PCI pool is
    # indicative of a PCI device being associated with that node
    if not pci_requests and pci_stats:
        # NOTE(fanguiju): If the host carries SR-IOV NICs but this request
        # asks for no PCI device, prefer the host NUMA nodes WITHOUT an
        # SR-IOV NIC attached (False sorts before True), so that the
        # NIC-local nodes stay free for instances that do request SR-IOV
        # devices and benefit from NUMA affinity with the NIC.
        host_cells = sorted(host_cells, key=lambda cell: cell.id in [
            pool['numa_node'] for pool in pci_stats.pools])

    # TODO(ndipanov): We may want to sort permutations differently
    # depending on whether we want packing/spreading over NUMA nodes
    # NOTE(fanguiju): Walk every permutation of host NUMA cells of length
    # len(instance_topology), pairing each candidate combination with
    # instance_topology.cells; the first permutation in which every pair
    # fits becomes the final instance NUMA topology.
    for host_cell_perm in itertools.permutations(
            host_cells, len(instance_topology)):
        cells = []
        # Pair each candidate host cell with the instance cell it should
        # host, in order.
        for host_cell, instance_cell in zip(
                host_cell_perm, instance_topology.cells):
            try:
                cpuset_reserved = 0
                if (instance_topology.emulator_threads_isolated
                    and len(cells) == 0):
                    # For the case of isolate emulator threads, to
                    # make predictable where that CPU overhead is
                    # located we always configure it to be on host
                    # NUMA node associated to the guest NUMA node
                    # 0.
                    # NOTE(fanguiju): With an isolated
                    # emulator_threads_policy one extra pCPU is set aside
                    # for the emulator threads; it is not counted as part
                    # of the instance NUMA topology request itself.
                    cpuset_reserved = 1
                got_cell = _numa_fit_instance_cell(
                    host_cell, instance_cell, limits, cpuset_reserved)
            except exception.MemoryPageSizeNotSupported:
                # This exception will been raised if instance cell's
                # custom pagesize is not supported with host cell in
                # _numa_cell_supports_pagesize_request function.
                break
            if got_cell is None:
                break
            cells.append(got_cell)
        # A break above means this permutation failed; try the next one.
        if len(cells) != len(host_cell_perm):
            continue
        if not pci_requests or ((pci_stats is not None) and
                pci_stats.support_requests(pci_requests, cells)):
            return objects.InstanceNUMATopology(
                cells=cells,
                emulator_threads_policy=emulator_threads_policy)
_numa_fit_instance_cell
def _numa_fit_instance_cell(host_cell, instance_cell, limit_cell=None,
                            cpuset_reserved=0):
    """Ensure an instance cell can fit onto a host cell.

    Ensure an instance cell can fit onto a host cell and, if so, return
    a new objects.InstanceNUMACell with the id set to that of the host.
    Returns None if the instance cell exceeds the limits of the host.

    :param host_cell: host cell to fit the instance cell onto
    :param instance_cell: instance cell we want to fit
    :param limit_cell: an objects.NUMATopologyLimit or None
    :param cpuset_reserved: An int to indicate the number of CPUs overhead
    :returns: objects.InstanceNUMACell with the id set to that of the
              host, or None
    """
    LOG.debug('Attempting to fit instance cell %(cell)s on host_cell '
              '%(host_cell)s', {'cell': instance_cell, 'host_cell': host_cell})

    # An instance may never overcommit memory against itself on any single
    # NUMA cell, regardless of allocation ratios.
    if instance_cell.memory > host_cell.memory:
        LOG.debug('Not enough host cell memory to fit instance cell. '
                  'Required: %(required)d, actual: %(actual)d',
                  {'required': instance_cell.memory,
                   'actual': host_cell.memory})
        return

    # Likewise for CPUs, including any pCPUs reserved as hypervisor
    # overhead (e.g. for isolated emulator threads).
    required_cpus = len(instance_cell.cpuset) + cpuset_reserved
    if required_cpus > len(host_cell.cpuset):
        LOG.debug('Not enough host cell CPUs to fit instance cell. Required: '
                  '%(required)d + %(cpuset_reserved)d as overhead, '
                  'actual: %(actual)d',
                  {'required': len(instance_cell.cpuset),
                   'actual': len(host_cell.cpuset),
                   'cpuset_reserved': cpuset_reserved})
        return

    if instance_cell.cpu_pinning_requested:
        LOG.debug('Pinning has been requested')
        # Delegate the actual vCPU -> pCPU mapping to the pinning helper.
        pinned_cell = _numa_fit_instance_cell_with_pinning(
            host_cell, instance_cell, cpuset_reserved)
        if not pinned_cell:
            return
        pinned_cell.pagesize = instance_cell.pagesize
        instance_cell = pinned_cell
    elif limit_cell:
        LOG.debug('No pinning requested, considering limitations on usable cpu'
                  ' and memory')
        # Unpinned instances may overcommit, but only up to the configured
        # allocation-ratio limits for this cell.
        projected_memory = host_cell.memory_usage + instance_cell.memory
        projected_cpus = host_cell.cpu_usage + len(instance_cell.cpuset)
        allowed_cpus = len(host_cell.cpuset) * limit_cell.cpu_allocation_ratio
        allowed_memory = host_cell.memory * limit_cell.ram_allocation_ratio
        if projected_memory > allowed_memory:
            LOG.debug('Host cell has limitations on usable memory. There is '
                      'not enough free memory to schedule this instance. '
                      'Usage: %(usage)d, limit: %(limit)d',
                      {'usage': projected_memory, 'limit': allowed_memory})
            return
        if projected_cpus > allowed_cpus:
            LOG.debug('Host cell has limitations on usable CPUs. There are '
                      'not enough free CPUs to schedule this instance. '
                      'Usage: %(usage)d, limit: %(limit)d',
                      {'usage': projected_cpus, 'limit': allowed_cpus})
            return

    pagesize = None
    if instance_cell.pagesize:
        # Resolve the concrete page size (explicit kB value, or the
        # small/large/any specials) against what the host cell offers.
        pagesize = _numa_cell_supports_pagesize_request(
            host_cell, instance_cell)
        if not pagesize:
            LOG.debug('Host does not support requested memory pagesize. '
                      'Requested: %d kB', instance_cell.pagesize)
            return
        LOG.debug('Selected memory pagesize: %(selected_mem_pagesize)d kB. '
                  'Requested memory pagesize: %(requested_mem_pagesize)d '
                  '(small = -1, large = -2, any = -3)',
                  {'selected_mem_pagesize': pagesize,
                   'requested_mem_pagesize': instance_cell.pagesize})

    instance_cell.id = host_cell.id
    instance_cell.pagesize = pagesize
    return instance_cell
_numa_fit_instance_cell_with_pinning
def _numa_fit_instance_cell_with_pinning(host_cell, instance_cell,
                                         num_cpu_reserved=0):
    """Determine if cells can be pinned to a host cell.

    :param host_cell: objects.NUMACell instance - the host cell that
                      the instance should be pinned to
    :param instance_cell: objects.InstanceNUMACell instance without any
                          pinning information
    :param num_cpu_reserved: int - number of pCPUs reserved for hypervisor
    :returns: objects.InstanceNUMACell instance with pinning information,
              or None if instance cannot be pinned to the given host
    """
    required_cpus = len(instance_cell.cpuset) + num_cpu_reserved

    # Pinned instances can never oversubscribe pCPUs or memory, so compare
    # against what is actually free on the host cell.
    if host_cell.avail_cpus < required_cpus:
        LOG.debug('Not enough available CPUs to schedule instance. '
                  'Oversubscription is not possible with pinned instances. '
                  'Required: %(required)d (%(vcpus)d + %(num_cpu_reserved)d), '
                  'actual: %(actual)d',
                  {'required': required_cpus,
                   'vcpus': len(instance_cell.cpuset),
                   'actual': host_cell.avail_cpus,
                   'num_cpu_reserved': num_cpu_reserved})
        return

    if host_cell.avail_memory < instance_cell.memory:
        LOG.debug('Not enough available memory to schedule instance. '
                  'Oversubscription is not possible with pinned instances. '
                  'Required: %(required)s, available: %(available)s, '
                  'total: %(total)s. ',
                  {'required': instance_cell.memory,
                   'available': host_cell.avail_memory,
                   'total': host_cell.memory})
        return

    if host_cell.siblings:
        # The host reports thread siblings (SMT enabled): pack onto whole
        # cores so guest hyperthreads line up with host hyperthreads.
        LOG.debug('Using thread siblings for packing')
        numa_cell = _pack_instance_onto_cores(
            host_cell.free_siblings, instance_cell, host_cell.id,
            max(map(len, host_cell.siblings)),
            num_cpu_reserved=num_cpu_reserved)
    else:
        if (instance_cell.cpu_thread_policy ==
                fields.CPUThreadAllocationPolicy.REQUIRE):
            LOG.info("Host does not support hyperthreading or "
                     "hyperthreading is disabled, but 'require' "
                     "threads policy was requested.")
            return

        # Straightforward to pin to available cpus when there is no
        # hyperthreading on the host: model every free pCPU as its own
        # single-thread "core".
        free_cpus = [{cpu} for cpu in host_cell.free_cpus]
        numa_cell = _pack_instance_onto_cores(
            free_cpus, instance_cell, host_cell.id,
            num_cpu_reserved=num_cpu_reserved)

    if not numa_cell:
        LOG.debug('Failed to map instance cell CPUs to host cell CPUs')
    return numa_cell
_pack_instance_onto_cores
def _pack_instance_onto_cores(available_siblings,
                              instance_cell,
                              host_cell_id,
                              threads_per_core=1,
                              num_cpu_reserved=0):
    """Pack an instance onto a set of siblings.

    Calculate the pinning for the given instance and its topology,
    making sure that hyperthreads of the instance match up with those
    of the host when the pinning takes effect. Also ensure that the
    physical cores reserved for hypervisor on this host NUMA node do
    not break any thread policies.

    Currently the strategy for packing is to prefer siblings and try use
    cores evenly by using emptier cores first. This is achieved by the
    way we order cores in the sibling_sets structure, and the order in
    which we iterate through it.

    The main packing loop that iterates over the sibling_sets dictionary
    will not currently try to look for a fit that maximizes number of
    siblings, but will simply rely on the iteration ordering and picking
    the first viable placement.

    :param available_siblings: list of sets of CPU IDs corresponding to
                               available siblings per core
    :param instance_cell: An instance of objects.InstanceNUMACell
                          describing the pinning requirements of the
                          instance
    :param threads_per_core: number of threads per core in host's cell
    :param num_cpu_reserved: number of pCPUs reserved for hypervisor
    :returns: An instance of objects.InstanceNUMACell containing the
              pinning information, the physical cores reserved and
              potentially a new topology to be exposed to the
              instance. None if there is no valid way to satisfy the
              sibling requirements for the instance.
    """
    # NOTE: fractions.gcd() was deprecated in Python 3.5 and removed in
    # Python 3.9; prefer math.gcd when available and fall back to the old
    # name on ancient interpreters.
    try:
        from math import gcd
    except ImportError:  # Python < 3.5
        from fractions import gcd

    LOG.debug('Packing an instance onto a set of siblings: '
              ' available_siblings: %(siblings)s'
              ' instance_cell: %(cells)s'
              ' host_cell_id: %(host_cell_id)s'
              ' threads_per_core: %(threads_per_core)s',
              {'siblings': available_siblings,
               'cells': instance_cell,
               'host_cell_id': host_cell_id,
               'threads_per_core': threads_per_core})

    # We build up a data structure that answers the question: 'Given the
    # number of threads I want to pack, give me a list of all the available
    # sibling sets (or groups thereof) that can accommodate it'
    sibling_sets = collections.defaultdict(list)
    for sib in available_siblings:
        # NOTE(fanguiju): `threads_no` means the number of host threads per
        # core that can be used to pin vCPUs according to the policies.
        for threads_no in range(1, len(sib) + 1):
            sibling_sets[threads_no].append(sib)
    # sibling_sets groups cores by how many of their free threads may be
    # consumed: key 1 lists every core with >= 1 free thread, key 2 every
    # core with >= 2 free threads, and so on. For example:
    # defaultdict(<type 'list'>,
    #     {1: [CoercedSet([25, 5]), CoercedSet([8, 28]), ...],
    #      2: [CoercedSet([25, 5]), CoercedSet([8, 28]), ...]})
    LOG.debug('Built sibling_sets: %(siblings)s', {'siblings': sibling_sets})

    pinning = None
    threads_no = 1

    def _orphans(instance_cell, threads_per_core):
        """Number of instance CPUs which will not fill up a host core.

        Best explained by an example: consider set of free host cores as such:
            [(0, 1), (3, 5), (6, 7, 8)]
        This would be a case of 2 threads_per_core AKA an entry for 2 in the
        sibling_sets structure.

        If we attempt to pack a 5 core instance on it - due to the fact that we
        iterate the list in order, we will end up with a single core of the
        instance pinned to a thread "alone" (with id 6), and we would have one
        'orphan' vcpu.
        """
        # NOTE(fanguiju): Counts the vCPUs that cannot fill a whole host
        # core worth of sibling threads, e.g. for cores [(0, 1), (2, 3)]
        # and 3 vCPUs, 3 % 2 == 1 vCPU is left 'orphaned'.
        # len(instance_cell) == len(instance_cell.cpuset)
        return len(instance_cell) % threads_per_core

    def _threads(instance_cell, threads_per_core):
        """Threads to expose to the instance via the VirtCPUTopology.

        This is calculated by taking the GCD of the number of threads we are
        considering at the moment, and the number of orphans. An example for
            instance_cell = 6
            threads_per_core = 4

        So we can fit the instance as such:
            [(0, 1, 2, 3), (4, 5, 6, 7), (8, 9, 10, 11)]
              x  x  x  x    x  x

        We can't expose 4 threads, as that will not be a valid topology (all
        cores exposed to the guest have to have an equal number of threads),
        and 1 would be too restrictive, but we want all threads that guest sees
        to be on the same physical core, so we take GCD of 4 (max number of
        threads) and 2 (number of 'orphan' CPUs) and get 2 as the number of
        threads.
        """
        # NOTE(fanguiju): The guest topology requires every guest core to
        # expose the same number of threads, so the GCD of
        # threads_per_core and the orphan count is the largest thread
        # count that divides evenly. gcd(4, 2) == 2 in the example above.
        return gcd(threads_per_core, _orphans(instance_cell,
                                              threads_per_core))

    def _get_pinning(threads_no, sibling_set, instance_cores,
                     num_cpu_reserved=0):
        """Determines pCPUs/vCPUs mapping

        Determines the pCPUs/vCPUs mapping regarding the number of
        threads which can be used per cores and pCPUs reserved.

        :param threads_no: Number of host threads per cores which can
                           be used to pin vCPUs according to the
                           policies.
        :param sibling_set: List of available threads per host cores
                            on a specific host NUMA node.
        :param instance_cores: Set of vCPUs requested.
        :param num_cpu_reserved: Number of additional host CPUs which
                                 needs to be reserved.

        NOTE: Depending on how host is configured (HT/non-HT) a thread can
        be considered as an entire core.
        """
        # threads_no * len(sibling_set) is the number of pinnable units
        # (cores or threads) the host cell offers under this policy.
        if threads_no * len(sibling_set) < (
                len(instance_cores) + num_cpu_reserved):
            return None, None

        # Determines usable cores according the "threads number"
        # constraint.
        #
        # For a sibling_set=[(0, 1, 2, 3), (4, 5, 6, 7)] and thread_no 1:
        # usable_cores=[(0), (4),]
        #
        # For a sibling_set=[(0, 1, 2, 3), (4, 5, 6, 7)] and thread_no 2:
        # usable_cores=[(0, 1), (4, 5)]
        usable_cores = list(map(lambda s: list(s)[:threads_no], sibling_set))

        # Determines the mapping vCPUs/pCPUs based on the sets of
        # usable cores.
        #
        # For an instance_cores=[2, 3], usable_cores=[(0), (4)]
        # vcpus_pinning=[(2, 0), (3, 4)]
        # i.e. a list of (vCPU, pCPU) pairs.
        vcpus_pinning = list(zip(sorted(instance_cores),
                                 itertools.chain(*usable_cores)))
        msg = ("Computed NUMA topology CPU pinning: usable pCPUs: "
               "%(usable_cores)s, vCPUs mapping: %(vcpus_pinning)s")
        msg_args = {
            'usable_cores': usable_cores,
            'vcpus_pinning': vcpus_pinning,
        }
        LOG.info(msg, msg_args)

        cpuset_reserved = None
        if num_cpu_reserved:
            # Updates the pCPUs used based on vCPUs pinned to
            #
            # For vcpus_pinning=[(0, 2), (1, 3)], usable_cores=[(2, 3), (4, 5)]
            # usable_cores=[(), (4, 5)]
            # NOTE(fanguiju): Remove the already-pinned pCPUs first; the
            # reserved CPUs are then drawn from whatever remains.
            for vcpu, pcpu in vcpus_pinning:
                for sib in usable_cores:
                    if pcpu in sib:
                        sib.remove(pcpu)

            # Determines the pCPUs reserved for hypervisor
            #
            # For usable_cores=[(), (4, 5)], num_cpu_reserved=1
            # cpuset_reserved=[4]
            cpuset_reserved = set(list(
                itertools.chain(*usable_cores))[:num_cpu_reserved])
            msg = ("Computed NUMA topology reserved pCPUs: usable pCPUs: "
                   "%(usable_cores)s, reserved pCPUs: %(cpuset_reserved)s")
            msg_args = {
                'usable_cores': usable_cores,
                'cpuset_reserved': cpuset_reserved,
            }
            LOG.info(msg, msg_args)
        return vcpus_pinning, cpuset_reserved

    if (instance_cell.cpu_thread_policy ==
            fields.CPUThreadAllocationPolicy.REQUIRE):
        LOG.debug("Requested 'require' thread policy for %d cores",
                  len(instance_cell))
    elif (instance_cell.cpu_thread_policy ==
            fields.CPUThreadAllocationPolicy.PREFER):
        LOG.debug("Requested 'prefer' thread policy for %d cores",
                  len(instance_cell))
    elif (instance_cell.cpu_thread_policy ==
            fields.CPUThreadAllocationPolicy.ISOLATE):
        LOG.debug("Requested 'isolate' thread policy for %d cores",
                  len(instance_cell))
    else:
        LOG.debug("User did not specify a thread policy. Using default "
                  "for %d cores", len(instance_cell))

    # NOTE(fanguiju): The ISOLATE policy pins every vCPU to a whole
    # physical core of its own.
    if (instance_cell.cpu_thread_policy ==
            fields.CPUThreadAllocationPolicy.ISOLATE):
        # make sure we have at least one fully free core
        if threads_per_core not in sibling_sets:
            LOG.debug('Host does not have any fully free thread sibling sets.'
                      'It is not possible to emulate a non-SMT behavior '
                      'for the isolate policy without this.')
            return

        pinning, cpuset_reserved = _get_pinning(
            # Every vCPU gets a dedicated core, so consume exactly one
            # thread from each fully free core.
            1,  # we only want to "use" one thread per core
            sibling_sets[threads_per_core],
            instance_cell.cpuset,
            num_cpu_reserved=num_cpu_reserved)
    else:  # REQUIRE, PREFER (explicit, implicit)
        # NOTE(ndipanov): We iterate over the sibling sets in descending order
        # of cores that can be packed. This is an attempt to evenly distribute
        # instances among physical cores
        for threads_no, sibling_set in sorted(
                (t for t in sibling_sets.items()), reverse=True):
            # NOTE(sfinucan): The key difference between the require and
            # prefer policies is that require will not settle for non-siblings
            # if this is all that is available. Enforce this by ensuring we're
            # using sibling sets that contain at least one sibling
            if (instance_cell.cpu_thread_policy ==
                    fields.CPUThreadAllocationPolicy.REQUIRE):
                if threads_no <= 1:
                    LOG.debug('Skipping threads_no: %s, as it does not satisfy'
                              ' the require policy', threads_no)
                    continue

            pinning, cpuset_reserved = _get_pinning(
                threads_no, sibling_set,
                instance_cell.cpuset,
                num_cpu_reserved=num_cpu_reserved)
            if pinning:
                break

        # NOTE(sfinucan): If siblings weren't available and we're using PREFER
        # (implicitly or explicitly), fall back to linear assignment across
        # cores
        if (instance_cell.cpu_thread_policy !=
                fields.CPUThreadAllocationPolicy.REQUIRE and
                not pinning):
            pinning = list(zip(sorted(instance_cell.cpuset),
                               itertools.chain(*sibling_set)))

        threads_no = _threads(instance_cell, threads_no)

    if not pinning:
        return
    LOG.debug('Selected cores for pinning: %s, in cell %s', pinning,
              host_cell_id)

    # NOTE(fanguiju): threads_no is now the thread count per guest core;
    # floor-dividing the number of pinned vCPUs by it yields the guest
    # core count for the exposed topology.
    topology = objects.VirtCPUTopology(sockets=1,
                                       cores=len(pinning) // threads_no,
                                       threads=threads_no)
    instance_cell.pin_vcpus(*pinning)
    instance_cell.cpu_topology = topology
    instance_cell.id = host_cell_id
    instance_cell.cpuset_reserved = cpuset_reserved
    return instance_cell
使用示例在下列使用示例中,我们主要关注不同策略组合下的 CPU pinning 行为模式。基础 Host NUMA Topology 环境如下: Host NUMA Topology:
# /opt/stack/queens/nova/nova/virt/hardware.py
# NOTE(sfinucan): If siblings weren't available and we're using PREFER
# (implicitly or explicitly), fall back to linear assignment across
# cores
if (instance_cell.cpu_thread_policy !=
fields.CPUThreadAllocationPolicy.REQUIRE and
not pinning):
pinning = list(zip(sorted(instance_cell.cpuset),
itertools.chain(*sibling_set)))