Shortcuts

Source code for hfai.pl.strategies.ddp_spawn_bind_numa

import logging
import pytorch_lightning
from .strategy_utils import bind_numa, check_numa
from pytorch_lightning.utilities.rank_zero import rank_zero_only
from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy
from pytorch_lightning.strategies.strategy_registry import StrategyRegistry as Registry

log = logging.getLogger(__name__)


def _major_minor(version: str) -> tuple:
    """Parse the leading ``major.minor`` numbers of a version string as ints.

    Comparing version strings directly is lexicographic, so ``"1.10.0"``
    would sort before ``"1.7.0"``; comparing integer tuples avoids that.

    Args:
        version: A dotted version string such as ``"1.7.6"`` or ``"1.7.0rc1"``.

    Returns:
        ``(major, minor)`` as a tuple of ints. A missing minor component is
        treated as ``0``; a string with no leading number yields ``(0, 0)``.
    """
    import re  # local import: keeps this block independent of the file's import header

    match = re.match(r"\s*(\d+)(?:\.(\d+))?", version)
    if match is None:
        return (0, 0)
    return (int(match.group(1)), int(match.group(2) or 0))


class DDPSpawnStrategyBindNuma(DDPSpawnStrategy):
    """
    A DDP spawn strategy with NUMA binding. Supports
    ``1.6.0 <= pytorch_lightning.__version__ <= 1.7.6``.

    Examples:

    .. code-block:: python

        from hfai.pl import HFAIEnvironment

        trainer = pytorch_lightning.Trainer(
            max_epochs=3,
            gpus=8,
            strategy="ddp_spawn_bind_numa",  # hfai supports ddp_bind_numa, ddp_spawn_bind_numa, hfreduce_bind_numa, hfreduce_spawn_bind_numa
            plugins=[HFAIEnvironment()]  # define the Hfai environment and pass it in as a plugin
        )

        model_module = ToyNetModule()

        trainer.fit(
            model_module
        )

    """

    strategy_name = "ddp_spawn_bind_numa"

    def _configure_launcher(self) -> None:
        """Select the process launcher according to the installed Lightning version.

        For versions before 1.7 the parent class's launcher setup is used
        unchanged. From 1.7 on, ``_MultiProcessingLauncherHF`` is installed
        instead (per the original note: to deal with the worker output being
        ``None``).
        """
        # Compare numerically: a plain string comparison would treat
        # e.g. "1.10.0" as older than "1.7.0".
        if _major_minor(pytorch_lightning.__version__) < (1, 7):
            super()._configure_launcher()
            return
        # deal with worker output is None
        from .launchers.multiprocessing_hf import _MultiProcessingLauncherHF

        self._launcher = _MultiProcessingLauncherHF(self, start_method=self._start_method)

    def set_world_ranks(self, process_idx: int = 0) -> None:
        """Record the local rank, bind NUMA, and publish global rank / world size.

        Args:
            process_idx: Index of the current process; stored as the local rank.

        Raises:
            AssertionError: If ``check_numa`` reports that the NUMA binding
                did not succeed.
        """
        self._local_rank = process_idx
        env = self.cluster_environment
        if env is None:
            return

        bind_numa(env)  # add numa bind
        # Explicit check instead of ``assert`` so that the verification still
        # runs under ``python -O``; AssertionError is kept so that the
        # exception type seen by callers does not change.
        if not check_numa(env):
            raise AssertionError("NUMA binding check failed after bind_numa()")

        env.set_global_rank(self.node_rank * self.num_processes + self.local_rank)
        env.set_world_size(self.num_nodes * self.num_processes)
        rank_zero_only.rank = env.global_rank()
# Register the strategy class in the Lightning strategy registry under the
# name "ddp_spawn_bind_numa".
Registry.register(
    "ddp_spawn_bind_numa",
    DDPSpawnStrategyBindNuma,
    description="DDP strategy with `start_method` `spawn` and `bind_numa`",
)