"""NUMA and GPU topology helpers for hfai.utils.numa."""

import os
from typing import List, Optional

import psutil

from hfai._C.multiprocessing import numa
from hfai._C.utils import MemoryUtils


# PCI bus ids of the NVIDIA GPUs on this machine, in ascending order.
# The /proc directory is absent on hosts without the NVIDIA driver.
gpu_bus_ids = (
    sorted(os.listdir('/proc/driver/nvidia/gpus'))
    if os.path.exists('/proc/driver/nvidia/gpus')
    else []
)


def which_numa(i_gpu: Optional[int] = None, i_ib: Optional[int] = None) -> int:
    """Return the NUMA node of a GPU or an InfiniBand card.

    Exactly one of the two indices should be given; when both are given,
    ``i_gpu`` takes precedence (matching the original behavior).

    Args:
        i_gpu (int, optional): GPU index (position in the sorted PCI bus-id list)
        i_ib (int, optional): InfiniBand card index (``mlx5_<i_ib>``)

    Returns:
        int: NUMA node id reported by the kernel (the kernel may report -1
        when the node is unknown)

    Examples:
        >>> from hfai.utils import which_numa
        >>> which_numa(i_gpu=0)
        0
    """
    assert i_gpu is not None or i_ib is not None, '请指定gpu编号或ib编号'
    if i_gpu is not None:
        assert i_gpu < len(gpu_bus_ids), '没找到该gpu'
        path = f'/sys/bus/pci/drivers/nvidia/{gpu_bus_ids[i_gpu]}/numa_node'
    else:
        # TODO: the InfiniBand device id may vary between servers; ideally we
        # would discover the device name automatically.
        path = f'/sys/class/infiniband/mlx5_{i_ib}/device/numa_node'
    with open(path, 'rt') as f:
        return int(f.read().strip())
def num_gpus() -> int:
    """Return the number of usable GPUs.

    Unlike ``torch.cuda.device_count()``, this does not initialize CUDA,
    which is friendlier to users who launch training workers with fork.

    Respects ``CUDA_VISIBLE_DEVICES``: the result is capped by the number of
    physical GPUs found under ``/proc/driver/nvidia/gpus``.

    Returns:
        int: number of usable GPUs

    Examples:
        >>> import os, hfai
        >>> hfai.utils.num_gpus()
        8
        >>> os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
        >>> hfai.utils.num_gpus()
        4
    """
    all_gpus = len(gpu_bus_ids)
    devices = os.getenv("CUDA_VISIBLE_DEVICES")
    if devices is None:
        return all_gpus
    # "".split(',') yields [''], so filter out empty entries: an empty (or
    # all-comma) CUDA_VISIBLE_DEVICES means no GPU is visible, not one.
    visible = [d for d in devices.split(',') if d.strip()]
    return min(len(visible), all_gpus)
def bind_to_numa(buf, node):
    """Bind a buffer to the given NUMA node.

    Args:
        buf (SharedMemory): memory allocated through :class:`SharedMemory`
        node (int): NUMA node id

    Examples:
        >>> import hfai, uuid
        >>> buf = hfai.utils.SharedMemory(1024, uuid.uuid4().hex)
        >>> hfai.utils.bind_to_numa(buf, 0)
    """
    # Delegate to the native helper; its return value is passed through.
    return MemoryUtils.bind_to_numa(buf, node)
def bind_numa(node: int):
    """Bind the current process to the given NUMA node.

    Args:
        node (int): NUMA node id

    Examples:
        >>> import hfai
        >>> hfai.utils.bind_numa(0)
        >>> hfai.utils.get_current_numa()
        0
        >>> hfai.utils.bind_numa(1)
        >>> hfai.utils.get_current_numa()
        1
    """
    # Delegate to the native extension; no return value.
    numa.bind_numa(node)
def get_current_numa():
    """Return the NUMA node the current process is bound to.

    Examples:
        >>> import hfai
        >>> hfai.utils.bind_numa(0)
        >>> hfai.utils.get_current_numa()
        0
        >>> hfai.utils.bind_numa(1)
        >>> hfai.utils.get_current_numa()
        1
    """
    # Delegate straight to the native extension.
    return numa.get_current_numa()
def set_cpu_affinity(cpus: List):
    """Set the CPU affinity of the current process.

    Args:
        cpus (List): CPU ids the current process may run on

    Examples:
        >>> import hfai
        >>> hfai.utils.set_cpu_affinity([0, 1, 2, 3, 4, 5, 6, 7])
    """
    # psutil.Process() with no argument targets the calling process.
    psutil.Process().cpu_affinity(cpus)