| 38 | return bytes(binary) |
| 39 | |
| 40 | class CLProgram: |
| 41 | def __init__(self, device:CLDevice, name:str, lib:bytes, arg_dtypes=[], **kwargs): |
| 42 | self.dev, self.name, self.lib, self.arg_dtypes = device, name, device.cl_compiler.compile_cached(lib.decode()), arg_dtypes |
| 43 | self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, device.device_id, (ctypes.c_size_t * 1)(len(self.lib)), |
| 44 | to_char_p_p([self.lib], ctypes.c_ubyte), binary_status := ctypes.c_int32(), |
| 45 | errcode_ret := ctypes.c_int32()), errcode_ret) |
| 46 | check(binary_status.value) |
| 47 | check(cl.clBuildProgram(self.program, 1, device.device_id, None, BP_CB(), None)) # NOTE: OSX requires this |
| 48 | self.kernel = checked(cl.clCreateKernel(self.program, name.encode(), status := ctypes.c_int32()), status) |
| 49 | |
| 50 | def __del__(self): |
| 51 | try: check(cl.clReleaseKernel(self.kernel)) |
| 52 | except (TypeError, AttributeError): pass |
| 53 | try: check(cl.clReleaseProgram(self.program)) |
| 54 | except (TypeError, AttributeError): pass |
| 55 | |
| 56 | def __call__(self, *bufs:cl.cl_mem, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]|None=None, vals:tuple[int, ...]=(), |
| 57 | wait=False, **kw) -> float|None: |
| 58 | i = 0 |
| 59 | for i,b in enumerate(bufs): |
| 60 | for real_i, dt in self.arg_dtypes[i]: |
| 61 | if isinstance(dt, ImageDType): |
| 62 | fmt = cl.cl_image_format(cl.CL_RGBA, {2:cl.CL_HALF_FLOAT, 4:cl.CL_FLOAT}[dt.itemsize]) |
| 63 | desc = cl.cl_image_desc(cl.CL_MEM_OBJECT_IMAGE2D, dt.shape[1], dt.shape[0], image_row_pitch=dt.pitch, buffer=b) |
| 64 | img = checked(cl.clCreateImage(self.dev.context, cl.CL_MEM_READ_WRITE, fmt, desc, None, status:=ctypes.c_int32()), status) |
| 65 | check(cl.clSetKernelArg(self.kernel, real_i, ctypes.sizeof(img), ctypes.byref(img))) |
| 66 | else: check(cl.clSetKernelArg(self.kernel, real_i, ctypes.sizeof(b), ctypes.byref(b))) |
| 67 | for i,v in enumerate(vals,start=i+1): check(cl.clSetKernelArg(self.kernel, i, 4, ctypes.byref(ctypes.c_int32(v)))) |
| 68 | if local_size is not None: global_size = cast(tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size))) |
| 69 | event = cl.cl_event() if wait else None |
| 70 | check(cl.clEnqueueNDRangeKernel(self.dev.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size), |
| 71 | (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event)) |
| 72 | if wait: |
| 73 | assert event is not None |
| 74 | check(cl.clWaitForEvents(1, event)) |
| 75 | check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_START, 8, ctypes.byref(start := ctypes.c_uint64()), None)) |
| 76 | check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_END, 8, ctypes.byref(end := ctypes.c_uint64()), None)) |
| 77 | return float(end.value-start.value) * OSX_TIMING_RATIO * 1e-9 |
| 78 | return None |
| 79 | |
| 80 | class CLAllocator(LRUAllocator['CLDevice']): |
| 81 | def _alloc(self, size:int, options:BufferSpec) -> cl.cl_mem: |
no outgoing calls
searching dependent graphs…