Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions kernel_tuner/backends/cupy.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,17 +62,11 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
# default dynamically allocated shared memory size, can be overwritten using smem_args
self.smem_size = 0

# setup observers
self.observers = observers or []
self.observers.append(CupyRuntimeObserver(self))
for obs in self.observers:
obs.register_device(self)

# collect environment information
env = dict()
cupy_info = str(get_runtime_info()).split("\n")[:-1]
info_dict = {
s.split(":")[0].strip(): s.split(":")[1].strip() for s in cupy_info
s.split(":", 1)[0].strip(): s.split(":", 1)[1].strip() for s in cupy_info
}
env["device_name"] = info_dict[f"Device {device} Name"]
env["pci_bus_id"] = info_dict[f"Device {device} PCI Bus ID"]
Expand All @@ -89,6 +83,12 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
self.env = env
self.name = env["device_name"]

# setup observers
self.observers = observers or []
self.observers.append(CupyRuntimeObserver(self))
for obs in self.observers:
obs.register_device(self)

def ready_argument_list(self, arguments):
"""Ready argument list to be passed to the kernel, allocates gpu mem.

Expand Down
20 changes: 13 additions & 7 deletions kernel_tuner/backends/nvcuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,27 +88,33 @@
# default dynamically allocated shared memory size, can be overwritten using smem_args
self.smem_size = 0

# setup observers
self.observers = observers or []
self.observers.append(CudaRuntimeObserver(self))
for observer in self.observers:
observer.register_device(self)

# collect environment information
err, device_properties = runtime.cudaGetDeviceProperties(device)
cuda_error_check(err)
env = dict()
env["uuid"] = str(uuid.UUID(bytes=device_properties.uuid.bytes))
env["device_name"] = device_properties.name.decode()
env["pci_bus_id"] = device_properties.pciBusID
env["cuda_version"] = driver.CUDA_VERSION
env["compute_capability"] = self.cc
env["iterations"] = self.iterations
env["compiler_options"] = self.compiler_options
env["device_properties"] = str(device_properties).replace("\n", ", ")

# We must use `cudaDeviceGetPCIBusId` to get the PCI bus string
# It returns a series of bytes containing a null byte, not a `str`
err, pci_bus = runtime.cudaDeviceGetPCIBusId(32, device) # 32 = length?

Check warning on line 105 in kernel_tuner/backends/nvcuda.py

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Move this trailing comment on the previous empty line.

See more on https://sonarcloud.io/project/issues?id=KernelTuner_kernel_tuner&issues=AZ6DUa-19_RloZ2qkHjR&open=AZ6DUa-19_RloZ2qkHjR&pullRequest=383
cuda_error_check(err)
env["pci_bus_id"] = pci_bus.decode("ascii").split("\x00", 1)[0]

self.env = env
self.name = env["device_name"]

# setup observers
self.observers = observers or []
self.observers.append(CudaRuntimeObserver(self))
for observer in self.observers:
observer.register_device(self)

def __del__(self):
for device_memory in self.allocations:
if isinstance(device_memory, driver.CUdeviceptr):
Expand Down
12 changes: 6 additions & 6 deletions kernel_tuner/backends/pycuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,6 @@ def _finish_up():
# default dynamically allocated shared memory size, can be overwritten using smem_args
self.smem_size = 0

# setup observers
self.observers = observers or []
self.observers.append(PyCudaRuntimeObserver(self))
for obs in self.observers:
obs.register_device(self)

# collect environment information
env = dict()
env["device_name"] = self.context.get_device().name()
Expand All @@ -148,6 +142,12 @@ def _finish_up():
self.env = env
self.name = env["device_name"]

# setup observers
self.observers = observers or []
self.observers.append(PyCudaRuntimeObserver(self))
for obs in self.observers:
obs.register_device(self)

def __del__(self):
for gpu_mem in self.allocations:
# if needed for when using mocks during testing
Expand Down
69 changes: 50 additions & 19 deletions kernel_tuner/observers/nvml.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
class nvml:
"""Class that gathers the NVML functionality for one device."""

def __init__(

Check failure on line 18 in kernel_tuner/observers/nvml.py

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Refactor this function to reduce its Cognitive Complexity from 19 to the 15 allowed.

See more on https://sonarcloud.io/project/issues?id=KernelTuner_kernel_tuner&issues=AZ6DUa7Q9_RloZ2qkHjQ&open=AZ6DUa7Q9_RloZ2qkHjQ&pullRequest=383
self,
device_id=None,
device_uuid=None,
Expand All @@ -24,26 +24,42 @@
use_locked_clocks=False
):
"""Create object to control device using NVML."""
# We set these first as __del__ checks these
# and this __init__ may raise exceptions midway
self.pwr_limit_default = None
self.modified_clocks = False

pynvml.nvmlInit()

if sum(x is not None for x in [device_id, device_uuid, device_pci_bus]) != 1:
raise ValueError("invalid device: specify either the index, the UUID, or the PCI-bus")
elif device_id is not None:
if device_id is not None:
self.dev = pynvml.nvmlDeviceGetHandleByIndex(device_id)
elif device_uuid is not None:
self.dev = pynvml.nvmlDeviceGetHandleByUUID(device_uuid)
elif device_pci_bus is not None:
self.dev = pynvml.nvmlDeviceGetHandleByPciBusId_v2(device_pci_bus)
self.dev = pynvml.nvmlDeviceGetHandleByPciBusId(device_pci_bus)

self.id = pynvml.nvmlDeviceGetIndex(self.dev)
self.uuid = pynvml.nvmlDeviceGetUUID(self.dev)
self.pci_bus = pynvml.nvmlDeviceGetPciInfo_v3(self.dev).busId
self.nvidia_smi = nvidia_smi_fallback or "nvidia-smi"

if device_id is not None and self.id != device_id:
raise ValueError(f"NVML device ID does not match requested device: {device_id} != {self.id}")

# Some backends have UUID starting with "GPU-"
if device_uuid is not None and self.uuid.removeprefix("GPU-") != device_uuid.removeprefix("GPU-"):
raise ValueError(f"NVML device UUID does not match requested device: {device_uuid} != {self.uuid}")

# lstrip is needed since some backends use leading zeros
if device_pci_bus is not None and self.pci_bus.lstrip("0") != device_pci_bus.lstrip("0"):
raise ValueError(f"NVML device PCI-bus does not match requested device: {device_pci_bus} != {self.pci_bus}")

try:
self.pwr_limit_default = pynvml.nvmlDeviceGetPowerManagementLimit(self.dev)
self.pwr_constraints = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(self.dev)
except pynvml.NVMLError_NotSupported:
self.pwr_limit_default = None
# inverted range to make all range checks fail
self.pwr_limit_default = None
self.pwr_constraints = [1, 0]

try:
Expand All @@ -58,7 +74,6 @@
self._auto_boost = None

# try to initialize application clocks
self.modified_clocks = False
try:
if not use_locked_clocks:
self.gr_clock_default = pynvml.nvmlDeviceGetDefaultApplicationsClock(
Expand Down Expand Up @@ -287,6 +302,11 @@
NVML_FI_DEV_POWER_INSTANT = 186
return pynvml.nvmlDeviceGetFieldValues(self.dev, [NVML_FI_DEV_POWER_INSTANT])[0].value.uiVal

def energy_usage(self):
    """Return the device's cumulative energy counter in millijoules.

    The value is read through ``pynvml.nvmlDeviceGetFieldValues`` for the
    handle stored in ``self.dev`` and taken from the ``ullVal`` member of
    the first returned sample.

    NOTE(review): the original docstring describes the counter as counting
    "since bootup"; confirm the exact reset point against the NVML docs.
    """
    # Numeric NVML field id for NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION.
    total_energy_field_id = 83
    samples = pynvml.nvmlDeviceGetFieldValues(self.dev, [total_energy_field_id])
    return samples[0].value.ullVal

def gr_voltage(self):
"""Return current graphics voltage in millivolts."""
args = ["nvidia-smi", "-i", str(self.id), "-q", "-d", "VOLTAGE"]
Expand Down Expand Up @@ -335,7 +355,7 @@
def __init__(
self,
observables,
device=0,
device=None,
save_all=False,
nvidia_smi_fallback=None,
use_locked_clocks=False,
Expand Down Expand Up @@ -374,6 +394,8 @@

self.record_gr_voltage = False
self.t0 = 0
self.initial_energy_reading = None

if "gr_voltage" in observables:
self.record_gr_voltage = True
self.gr_voltage_readings = []
Expand All @@ -386,26 +408,34 @@
self.iteration = {obs: [] for obs in self.during_obs}

def register_device(self, dev):
env = getattr(dev, "env", dict())
uuid = env.get("uuid")
pci_bus = env.get("pci_bus_id")

if self.device is not None:
self.nvml = nvml(device_id=self.device, **self.nvml_kwargs)
elif uuid is not None and pci_bus is not None:
self.nvml = nvml(device_uuid=uuid, device_pci_bus=pci_bus, **self.nvml_kwargs)
elif uuid is not None:
self.nvml = nvml(device_uuid=uuid, **self.nvml_kwargs)
elif pci_bus is not None:
self.nvml = nvml(device_pci_bus=pci_bus, **self.nvml_kwargs)
else:
env = getattr(dev, "env", dict())
uuid = env.get("uuid")
pci_bus = env.get("pci_bus_id")

if uuid is not None:
self.nvml = nvml(device_uuid=uuid, **self.nvml_kwargs)
elif pci_bus is not None:
self.nvml = nvml(device_pci_bus=pci_bus, **self.nvml_kwargs)
else:
raise ValueError("failed to detect NVIDIA device: no UUID or PCI-bus-id in environment")


raise ValueError("failed to detect NVIDIA device: no UUID or PCI-bus-id in environment")

def read_power(self):
    """Return the current power draw in Watt.

    Takes the reading from ``self.nvml.pwr_usage()`` and divides it by 1000.
    """
    raw_reading = self.nvml.pwr_usage()
    return raw_reading / 1e3

def read_energy(self):
    """Return the energy in Joule used since the first call to this method.

    The first call stores the current ``self.nvml.energy_usage()`` value in
    ``self.initial_energy_reading`` and therefore returns 0.0. Later calls
    return the difference to that stored baseline, divided by 1000.
    """
    reading = self.nvml.energy_usage()
    baseline = self.initial_energy_reading

    # No baseline yet: remember this reading as the zero point.
    if baseline is None:
        baseline = self.initial_energy_reading = reading

    return (reading - baseline) / 1e3

def before_start(self):
# clear results of the observables for next measurement
self.iteration = {obs: [] for obs in self.during_obs}
Expand Down Expand Up @@ -530,3 +560,4 @@
time.sleep(sleep_s)
readings.append(d.pwr_usage())
return np.mean(readings) * 1e-3 # Watt

Loading