Source code for gammapy.utils.parallel

# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""Multiprocessing and multithreading setup."""
import importlib
import logging
from enum import Enum
from gammapy.utils.pbar import progress_bar

log = logging.getLogger(__name__)

__all__ = [
    "multiprocessing_manager",
    "run_multiprocessing",
    "BACKEND_DEFAULT",
    "N_JOBS_DEFAULT",
    "POOL_KWARGS_DEFAULT",
    "METHOD_DEFAULT",
    "METHOD_KWARGS_DEFAULT",
]


class ParallelBackendEnum(Enum):
    """Enum for parallel backend."""

    multiprocessing = "multiprocessing"
    ray = "ray"

    @classmethod
    def from_str(cls, value):
        """Get enum from string."""
        if value == "ray" and not is_ray_available():
            log.warning(
                "Ray is not installed, falling back to multiprocessing backend"
            )
            value = "multiprocessing"

        return cls(value)


class PoolMethodEnum(Enum):
    """Enum for pool method."""

    starmap = "starmap"
    apply_async = "apply_async"


BACKEND_DEFAULT = ParallelBackendEnum.multiprocessing.value
N_JOBS_DEFAULT = 1
ALLOW_CHILD_JOBS = False
POOL_KWARGS_DEFAULT = dict(processes=N_JOBS_DEFAULT)
METHOD_DEFAULT = PoolMethodEnum.starmap.value
METHOD_KWARGS_DEFAULT = {}


def get_multiprocessing():
    """Get multiprocessing module."""
    import multiprocessing

    return multiprocessing


def get_multiprocessing_ray():
    """Get multiprocessing module for ray backend."""
    import ray.util.multiprocessing as multiprocessing

    log.warning(
        "Gammapy support for parallelisation with ray is still a prototype and is not fully functional."
    )
    return multiprocessing


def is_ray_initialized():
    """Check if ray is initialized."""
    try:
        from ray import is_initialized

        return is_initialized()
    except ModuleNotFoundError:
        return False


def is_ray_available():
    """Check if ray is available."""
    try:
        importlib.import_module("ray")
        return True
    except ModuleNotFoundError:
        return False


class multiprocessing_manager:
    """Context manager to update the default configuration for multiprocessing.

    Only the default configuration is modified; if class arguments like
    `n_jobs` and `parallel_backend` are set, they overwrite the default configuration.

    Parameters
    ----------
    backend : {'multiprocessing', 'ray'}
        Backend to use.
    pool_kwargs : dict
        Keyword arguments passed to the pool. The number of processes is limited
        to the number of physical CPUs.
    method : {'starmap', 'apply_async'}
        Pool method to use.
    method_kwargs : dict
        Keyword arguments passed to the method.

    Examples
    --------
    ::

        import astropy.units as u
        import gammapy.utils.parallel as parallel
        from gammapy.estimators import FluxPointsEstimator

        fpe = FluxPointsEstimator(energy_edges=[1, 3, 10] * u.TeV)

        with parallel.multiprocessing_manager(
                backend="multiprocessing",
                pool_kwargs=dict(processes=2),
        ):
            fpe.run(datasets)
    """

    def __init__(self, backend=None, pool_kwargs=None, method=None, method_kwargs=None):
        global BACKEND_DEFAULT, POOL_KWARGS_DEFAULT, METHOD_DEFAULT, METHOD_KWARGS_DEFAULT, N_JOBS_DEFAULT
        self._backend = BACKEND_DEFAULT
        self._pool_kwargs = POOL_KWARGS_DEFAULT
        self._method = METHOD_DEFAULT
        self._method_kwargs = METHOD_KWARGS_DEFAULT
        self._n_jobs = N_JOBS_DEFAULT

        if backend is not None:
            BACKEND_DEFAULT = ParallelBackendEnum.from_str(backend).value

        if pool_kwargs is not None:
            POOL_KWARGS_DEFAULT = pool_kwargs
            N_JOBS_DEFAULT = pool_kwargs.get("processes", N_JOBS_DEFAULT)

        if method is not None:
            METHOD_DEFAULT = PoolMethodEnum(method).value

        if method_kwargs is not None:
            METHOD_KWARGS_DEFAULT = method_kwargs

    def __enter__(self):
        pass

    def __exit__(self, type, value, traceback):
        global BACKEND_DEFAULT, POOL_KWARGS_DEFAULT, METHOD_DEFAULT, METHOD_KWARGS_DEFAULT, N_JOBS_DEFAULT
        BACKEND_DEFAULT = self._backend
        POOL_KWARGS_DEFAULT = self._pool_kwargs
        METHOD_DEFAULT = self._method
        METHOD_KWARGS_DEFAULT = self._method_kwargs
        N_JOBS_DEFAULT = self._n_jobs
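
# Usage sketch (not part of the module): `multiprocessing_manager` swaps the
# module-level defaults when it is constructed and restores them when the
# `with` block exits. The helper below is illustrative only and assumes the
# defaults have not been changed elsewhere.
def _example_manager_defaults():
    assert N_JOBS_DEFAULT == 1
    with multiprocessing_manager(
        backend="multiprocessing", pool_kwargs=dict(processes=4)
    ):
        # defaults are temporarily overwritten inside the block
        assert N_JOBS_DEFAULT == 4
        assert POOL_KWARGS_DEFAULT == {"processes": 4}
    # and restored afterwards
    assert N_JOBS_DEFAULT == 1
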
class ParallelMixin:
    """Mixin class to handle parallel processing."""

    _n_child_jobs = 1

    @property
    def n_jobs(self):
        """Number of jobs as an integer."""
        # TODO: this is somewhat unusual behaviour. It deviates from a normal default value handling
        if self._n_jobs is None:
            return N_JOBS_DEFAULT

        return self._n_jobs

    @n_jobs.setter
    def n_jobs(self, value):
        """Number of jobs setter as an integer."""
        if not isinstance(value, (int, type(None))):
            raise ValueError(
                f"Invalid type: {value!r}, an integer or None is expected."
            )

        self._n_jobs = value
        if ALLOW_CHILD_JOBS:
            self._n_child_jobs = value

    def _update_child_jobs(self):
        """Update the number of child jobs.

        This can only be done in the main process; otherwise the global
        ALLOW_CHILD_JOBS keeps its default value in the child process.
        """
        if ALLOW_CHILD_JOBS:
            self._n_child_jobs = self.n_jobs
        else:
            self._n_child_jobs = 1

    @property
    def _get_n_child_jobs(self):
        """Number of allowed child jobs as an integer."""
        return self._n_child_jobs

    @property
    def parallel_backend(self):
        """Parallel backend as a string."""
        if self._parallel_backend is None:
            return BACKEND_DEFAULT

        return self._parallel_backend

    @parallel_backend.setter
    def parallel_backend(self, value):
        """Parallel backend setter (str)."""
        if value is None:
            self._parallel_backend = None
        else:
            self._parallel_backend = ParallelBackendEnum.from_str(value).value
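
# Usage sketch (not part of the module): a minimal, hypothetical class reusing
# the mixin's `n_jobs` / `parallel_backend` handling. `_ExampleEstimator` is
# illustrative only.
class _ExampleEstimator(ParallelMixin):
    """Hypothetical estimator-like class."""

    def __init__(self, n_jobs=None, parallel_backend=None):
        self.n_jobs = n_jobs  # validated by the mixin setter
        self.parallel_backend = parallel_backend  # None falls back to BACKEND_DEFAULT

    # _ExampleEstimator().n_jobs returns N_JOBS_DEFAULT when n_jobs is None,
    # and _ExampleEstimator().parallel_backend returns BACKEND_DEFAULT.
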
def run_multiprocessing(
    func,
    inputs,
    backend=None,
    pool_kwargs=None,
    method=None,
    method_kwargs=None,
    task_name="",
):
    """Run function in a loop or in parallel.

    Notes
    -----
    The progress bar can be displayed for this function.

    Parameters
    ----------
    func : function
        Function to run.
    inputs : list
        List of arguments to pass to the function.
    backend : {'multiprocessing', 'ray'}, optional
        Backend to use. Default is None.
    pool_kwargs : dict, optional
        Keyword arguments passed to the pool. The number of processes is limited
        to the number of physical CPUs. Default is None.
    method : {'starmap', 'apply_async'}, optional
        Pool method to use. Default is "starmap".
    method_kwargs : dict, optional
        Keyword arguments passed to the method. Default is None.
    task_name : str, optional
        Name of the task to display in the progress bar. Default is "".
    """
    if backend is None:
        backend = BACKEND_DEFAULT

    if method is None:
        method = METHOD_DEFAULT

    if method_kwargs is None:
        method_kwargs = METHOD_KWARGS_DEFAULT

    if pool_kwargs is None:
        pool_kwargs = POOL_KWARGS_DEFAULT

    processes = pool_kwargs.get("processes", N_JOBS_DEFAULT)

    backend = ParallelBackendEnum.from_str(backend)
    multiprocessing = PARALLEL_BACKEND_MODULES[backend]()

    if backend == ParallelBackendEnum.multiprocessing:
        cpu_count = multiprocessing.cpu_count()

        if processes > cpu_count:
            log.info(f"Limiting number of processes from {processes} to {cpu_count}")
            processes = cpu_count

        if multiprocessing.current_process().name != "MainProcess":
            # with multiprocessing, subprocesses cannot have children (but this is possible with ray)
            processes = 1

    if processes == 1:
        return run_loop(
            func=func, inputs=inputs, method_kwargs=method_kwargs, task_name=task_name
        )

    if backend == ParallelBackendEnum.ray:
        address = "auto" if is_ray_initialized() else None
        pool_kwargs.setdefault("ray_address", address)

    log.info(f"Using {processes} processes to compute {task_name}")

    with multiprocessing.Pool(**pool_kwargs) as pool:
        pool_func = POOL_METHODS[PoolMethodEnum(method)]
        results = pool_func(
            pool=pool,
            func=func,
            inputs=inputs,
            method_kwargs=method_kwargs,
            task_name=task_name,
        )

    return results
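
# Usage sketch (not part of the module): `inputs` is a list of argument tuples,
# one tuple per call of `func`, and the worker must be picklable, i.e. defined
# at module level. `_example_power` and `_example_run` are illustrative only.
def _example_power(x, n):
    """Example worker function."""
    return x**n


def _example_run():
    """Compute [4, 9, 16] with the default 'starmap' method."""
    return run_multiprocessing(
        _example_power,
        inputs=[(2, 2), (3, 2), (4, 2)],
        pool_kwargs=dict(processes=2),
        task_name="powers",
    )
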
def run_loop(func, inputs, method_kwargs=None, task_name=""):
    """Loop over inputs and run function."""
    results = []

    callback = method_kwargs.get("callback") if method_kwargs else None

    for arguments in progress_bar(inputs, desc=task_name):
        result = func(*arguments)

        if callback is not None:
            result = callback(result)

        results.append(result)

    return results


def run_pool_star_map(pool, func, inputs, method_kwargs=None, task_name=""):
    """Run function in parallel."""
    return pool.starmap(func, progress_bar(inputs, desc=task_name), **method_kwargs)


def run_pool_async(pool, func, inputs, method_kwargs=None, task_name=""):
    """Run function in parallel asynchronously."""
    results = []

    for arguments in progress_bar(inputs, desc=task_name):
        result = pool.apply_async(func, arguments, **method_kwargs)
        results.append(result)

    # wait until all async runs are done
    [result.wait() for result in results]
    return results


POOL_METHODS = {
    PoolMethodEnum.starmap: run_pool_star_map,
    PoolMethodEnum.apply_async: run_pool_async,
}

PARALLEL_BACKEND_MODULES = {
    ParallelBackendEnum.multiprocessing: get_multiprocessing,
    ParallelBackendEnum.ray: get_multiprocessing_ray,
}
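
# Usage sketch (not part of the module): with method="apply_async" the results
# are `AsyncResult` objects, so the values have to be fetched with `.get()`.
# This reuses `_example_power` from the sketch above and is illustrative only.
def _example_async_run():
    results = run_multiprocessing(
        _example_power,
        inputs=[(2, 2), (3, 2)],
        pool_kwargs=dict(processes=2),
        method="apply_async",
    )
    # note: if only a single process ends up being used, run_multiprocessing
    # falls back to run_loop and already returns plain values instead of
    # AsyncResult objects
    return [result.get() for result in results]  # [4, 9]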