"""hfai.utils.numa — GPU/InfiniBand NUMA topology and process-affinity utilities."""
import os
from hfai._C.utils import MemoryUtils
from hfai._C.multiprocessing import numa
from typing import List
import psutil
# PCI bus ids of the NVIDIA GPUs on this host, in deterministic (sorted)
# order. Empty when the nvidia driver procfs entry is absent (no GPU / no driver).
_NVIDIA_PROC_DIR = '/proc/driver/nvidia/gpus'
gpu_bus_ids = sorted(os.listdir(_NVIDIA_PROC_DIR)) if os.path.exists(_NVIDIA_PROC_DIR) else []
def which_numa(i_gpu: int = None, i_ib: int = None):
    """
    Return the NUMA node for a given GPU index or InfiniBand card index.

    Exactly one of the two arguments should be given; when both are given,
    ``i_gpu`` takes precedence.

    Args:
        i_gpu (int, optional): GPU index (position in the sorted PCI bus-id list)
        i_ib (int, optional): InfiniBand card index (the ``N`` in ``mlx5_N``)

    Returns:
        int: the NUMA node number as reported by sysfs
        (NOTE(review): sysfs reports -1 for devices without NUMA affinity —
        confirm callers handle that value)

    Raises:
        AssertionError: if neither argument is given, or ``i_gpu`` is out of range

    Examples:
        >>> from hfai.utils import which_numa
        >>> which_numa(i_gpu=0)
        0
    """
    assert i_gpu is not None or i_ib is not None, '请指定gpu编号或ib编号'
    if i_gpu is not None:
        # Reject negative indices too: a negative i_gpu would previously pass
        # the bound check and silently index from the END of gpu_bus_ids.
        assert 0 <= i_gpu < len(gpu_bus_ids), '没找到该gpu'
        with open(f'/sys/bus/pci/drivers/nvidia/{gpu_bus_ids[i_gpu]}/numa_node', 'rt') as f:
            rst = f.read()
        return int(rst.strip())
    else:
        # TODO: id of infiniband card may vary between servers, maybe we should find it automatically
        with open(f'/sys/class/infiniband/mlx5_{i_ib}/device/numa_node', 'rt') as f:
            rst = f.read()
        return int(rst.strip())
def num_gpus():
    """
    Return the number of usable GPUs.

    Unlike ``torch.cuda.device_count()``, this does not initialize CUDA,
    which is friendlier to users who start training workers with ``fork``.

    Respects ``CUDA_VISIBLE_DEVICES``: when set, the count is capped by the
    number of (non-empty) entries listed there.

    Returns:
        int: the number of usable GPUs

    Examples:
        >>> import os, hfai
        >>> hfai.utils.num_gpus()
        8
        >>> os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
        >>> hfai.utils.num_gpus()
        4
    """
    all_gpus = len(gpu_bus_ids)
    devices = os.getenv("CUDA_VISIBLE_DEVICES")
    if devices is None:
        return all_gpus
    # Drop empty tokens: CUDA_VISIBLE_DEVICES="" means NO GPU is visible,
    # but "".split(',') yields [''] which the old code counted as one device.
    visible = [d for d in devices.split(',') if d.strip()]
    return min(len(visible), all_gpus)
def bind_to_numa(buf, node):
    """
    Bind a buffer to the given NUMA node.

    Args:
        buf (SharedMemory): memory allocated via :class:`SharedMemory`
        node (int): NUMA node id

    Examples:
        >>> import hfai, uuid
        >>> buf = hfai.utils.SharedMemory(1024, uuid.uuid4().hex)
        >>> hfai.utils.bind_to_numa(buf, 0)
    """
    # Thin wrapper: the actual binding is done by the native extension.
    return MemoryUtils.bind_to_numa(buf, node)
def bind_numa(node: int):
    """
    Bind the current process to the given NUMA node.

    Args:
        node (int): NUMA node id

    Examples:
        >>> import hfai
        >>> hfai.utils.bind_numa(0)
        >>> hfai.utils.get_current_numa()
        0
        >>> hfai.utils.bind_numa(1)
        >>> hfai.utils.get_current_numa()
        1
    """
    # Delegates to the native multiprocessing extension; no return value.
    numa.bind_numa(node)
def get_current_numa():
    """
    Return the NUMA node the current process is running on.

    Returns:
        int: current NUMA node id

    Examples:
        >>> import hfai
        >>> hfai.utils.bind_numa(0)
        >>> hfai.utils.get_current_numa()
        0
        >>> hfai.utils.bind_numa(1)
        >>> hfai.utils.get_current_numa()
        1
    """
    # Thin wrapper over the native multiprocessing extension.
    return numa.get_current_numa()
def set_cpu_affinity(cpus: List):
    """
    Set the CPU affinity of the current process.

    Args:
        cpus (List[int]): ids of the CPUs the process is allowed to run on

    Examples:
        >>> import hfai
        >>> hfai.utils.set_cpu_affinity([0, 1, 2, 3, 4, 5, 6, 7])
    """
    # psutil.Process() with no pid targets the calling process.
    psutil.Process().cpu_affinity(cpus)