diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py index beedf5dc..91554c0e 100644 --- a/kernel_tuner/backends/cupy.py +++ b/kernel_tuner/backends/cupy.py @@ -62,17 +62,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None # default dynamically allocated shared memory size, can be overwritten using smem_args self.smem_size = 0 - # setup observers - self.observers = observers or [] - self.observers.append(CupyRuntimeObserver(self)) - for obs in self.observers: - obs.register_device(self) - # collect environment information env = dict() cupy_info = str(get_runtime_info()).split("\n")[:-1] info_dict = { - s.split(":")[0].strip(): s.split(":")[1].strip() for s in cupy_info + s.split(":", 1)[0].strip(): s.split(":", 1)[1].strip() for s in cupy_info } env["device_name"] = info_dict[f"Device {device} Name"] env["pci_bus_id"] = info_dict[f"Device {device} PCI Bus ID"] @@ -89,6 +83,12 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None self.env = env self.name = env["device_name"] + # setup observers + self.observers = observers or [] + self.observers.append(CupyRuntimeObserver(self)) + for obs in self.observers: + obs.register_device(self) + def ready_argument_list(self, arguments): """Ready argument list to be passed to the kernel, allocates gpu mem. 
diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index c4598816..2c8edc41 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -88,27 +88,33 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None # default dynamically allocated shared memory size, can be overwritten using smem_args self.smem_size = 0 - # setup observers - self.observers = observers or [] - self.observers.append(CudaRuntimeObserver(self)) - for observer in self.observers: - observer.register_device(self) - # collect environment information err, device_properties = runtime.cudaGetDeviceProperties(device) cuda_error_check(err) env = dict() env["uuid"] = str(uuid.UUID(bytes=device_properties.uuid.bytes)) env["device_name"] = device_properties.name.decode() - env["pci_bus_id"] = device_properties.pciBusID env["cuda_version"] = driver.CUDA_VERSION env["compute_capability"] = self.cc env["iterations"] = self.iterations env["compiler_options"] = self.compiler_options env["device_properties"] = str(device_properties).replace("\n", ", ") + + # We must use `cudaDeviceGetPCIBusId` to get the PCI bus string + # It returns a series of bytes containing a null byte, not a `str` + err, pci_bus = runtime.cudaDeviceGetPCIBusId(32, device) # 32 = maximum length of the returned bus-ID string
+ cuda_error_check(err) + env["pci_bus_id"] = pci_bus.decode("ascii").split("\x00", 1)[0] + self.env = env self.name = env["device_name"] + # setup observers + self.observers = observers or [] + self.observers.append(CudaRuntimeObserver(self)) + for observer in self.observers: + observer.register_device(self) + def __del__(self): for device_memory in self.allocations: if isinstance(device_memory, driver.CUdeviceptr): diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py index 8f9326c2..9e3eb0d6 100644 --- a/kernel_tuner/backends/pycuda.py +++ b/kernel_tuner/backends/pycuda.py @@ -130,12 +130,6 @@ def _finish_up(): # default dynamically allocated shared memory size, can be overwritten using smem_args self.smem_size = 0 - # setup observers - self.observers = observers or [] - self.observers.append(PyCudaRuntimeObserver(self)) - for obs in self.observers: - obs.register_device(self) - # collect environment information env = dict() env["device_name"] = self.context.get_device().name() @@ -148,6 +142,12 @@ def _finish_up(): self.env = env self.name = env["device_name"] + # setup observers + self.observers = observers or [] + self.observers.append(PyCudaRuntimeObserver(self)) + for obs in self.observers: + obs.register_device(self) + def __del__(self): for gpu_mem in self.allocations: # if needed for when using mocks during testing diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py index aef3d6f4..4b83c51d 100644 --- a/kernel_tuner/observers/nvml.py +++ b/kernel_tuner/observers/nvml.py @@ -24,26 +24,42 @@ def __init__( use_locked_clocks=False ): """Create object to control device using NVML.""" + # We set these first as __del__ checks these + # and this __init__ may raise exceptions midway + self.pwr_limit_default = None + self.modified_clocks = False + pynvml.nvmlInit() - if sum(x is not None for x in [device_id, device_uuid, device_pci_bus]) != 1: - raise ValueError("invalid device: specify either the index, the UUID, or the 
PCI-bus") - elif device_id is not None: + if device_id is not None: self.dev = pynvml.nvmlDeviceGetHandleByIndex(device_id) elif device_uuid is not None: self.dev = pynvml.nvmlDeviceGetHandleByUUID(device_uuid) elif device_pci_bus is not None: - self.dev = pynvml.nvmlDeviceGetHandleByPciBusId_v2(device_pci_bus) + self.dev = pynvml.nvmlDeviceGetHandleByPciBusId(device_pci_bus) self.id = pynvml.nvmlDeviceGetIndex(self.dev) + self.uuid = pynvml.nvmlDeviceGetUUID(self.dev) + self.pci_bus = pynvml.nvmlDeviceGetPciInfo_v3(self.dev).busId self.nvidia_smi = nvidia_smi_fallback or "nvidia-smi" + if device_id is not None and self.id != device_id: + raise ValueError(f"NVML device ID does not match requested device: {device_id} != {self.id}") + + # Some backends have UUID starting with "GPU-" + if device_uuid is not None and self.uuid.removeprefix("GPU-") != device_uuid.removeprefix("GPU-"): + raise ValueError(f"NVML device UUID does not match requested device: {device_uuid} != {self.uuid}") + + # lstrip is needed since some backends use leading zeros + if device_pci_bus is not None and self.pci_bus.lstrip("0") != device_pci_bus.lstrip("0"): + raise ValueError(f"NVML device PCI-bus does not match requested device: {device_pci_bus} != {self.pci_bus}") + try: self.pwr_limit_default = pynvml.nvmlDeviceGetPowerManagementLimit(self.dev) self.pwr_constraints = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(self.dev) except pynvml.NVMLError_NotSupported: - self.pwr_limit_default = None # inverted range to make all range checks fail + self.pwr_limit_default = None self.pwr_constraints = [1, 0] try: @@ -58,7 +74,6 @@ def __init__( self._auto_boost = None # try to initialize application clocks - self.modified_clocks = False try: if not use_locked_clocks: self.gr_clock_default = pynvml.nvmlDeviceGetDefaultApplicationsClock( @@ -287,6 +302,11 @@ def pwr_usage(self): NVML_FI_DEV_POWER_INSTANT = 186 return pynvml.nvmlDeviceGetFieldValues(self.dev, 
[NVML_FI_DEV_POWER_INSTANT])[0].value.uiVal + def energy_usage(self): + """Return total energy consumption in millijoules since the driver was last reloaded.""" + NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 83 + return pynvml.nvmlDeviceGetFieldValues(self.dev, [NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION])[0].value.ullVal + def gr_voltage(self): """Return current graphics voltage in millivolts.""" args = ["nvidia-smi", "-i", str(self.id), "-q", "-d", "VOLTAGE"] @@ -335,7 +355,7 @@ class NVMLObserver(BenchmarkObserver): def __init__( self, observables, - device=0, + device=None, save_all=False, nvidia_smi_fallback=None, use_locked_clocks=False, @@ -374,6 +394,8 @@ def __init__( self.record_gr_voltage = False self.t0 = 0 + self.initial_energy_reading = None + if "gr_voltage" in observables: self.record_gr_voltage = True self.gr_voltage_readings = [] @@ -386,26 +408,34 @@ def __init__( self.iteration = {obs: [] for obs in self.during_obs} def register_device(self, dev): + env = getattr(dev, "env", dict()) + uuid = env.get("uuid") + pci_bus = env.get("pci_bus_id") + if self.device is not None: self.nvml = nvml(device_id=self.device, **self.nvml_kwargs) + elif uuid is not None and pci_bus is not None: + self.nvml = nvml(device_uuid=uuid, device_pci_bus=pci_bus, **self.nvml_kwargs) + elif uuid is not None: + self.nvml = nvml(device_uuid=uuid, **self.nvml_kwargs) + elif pci_bus is not None: + self.nvml = nvml(device_pci_bus=pci_bus, **self.nvml_kwargs) else: - env = getattr(dev, "env", dict()) - uuid = env.get("uuid") - pci_bus = env.get("pci_bus_id") - - if uuid is not None: - self.nvml = nvml(device_uuid=uuid, **self.nvml_kwargs) - elif pci_bus is not None: - self.nvml = nvml(device_pci_bus=pci_bus, **self.nvml_kwargs) - else: - raise ValueError("failed to detect NVIDIA device: no UUID or PCI-bus-id in environment") - - + raise ValueError("failed to detect NVIDIA device: no UUID or PCI-bus-id in environment") def read_power(self): """ Return power in Watt """ return self.nvml.pwr_usage() / 1e3 + def 
read_energy(self): + """ Return cumulative energy usage in Joule """ + now = self.nvml.energy_usage() + + if self.initial_energy_reading is None: + self.initial_energy_reading = now + + return (now - self.initial_energy_reading) / 1e3 + def before_start(self): # clear results of the observables for next measurement self.iteration = {obs: [] for obs in self.during_obs} @@ -530,3 +560,4 @@ def get_idle_power(device, n=5, sleep_s=0.1): time.sleep(sleep_s) readings.append(d.pwr_usage()) return np.mean(readings) * 1e-3 # Watt +