MCPcopy Index your code
hub / github.com/tinygrad/tinygrad / CLProgram

Class CLProgram

tinygrad/runtime/ops_cl.py:40–78  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

38 return bytes(binary)
39
40class CLProgram:
41 def __init__(self, device:CLDevice, name:str, lib:bytes, arg_dtypes=[], **kwargs):
42 self.dev, self.name, self.lib, self.arg_dtypes = device, name, device.cl_compiler.compile_cached(lib.decode()), arg_dtypes
43 self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, device.device_id, (ctypes.c_size_t * 1)(len(self.lib)),
44 to_char_p_p([self.lib], ctypes.c_ubyte), binary_status := ctypes.c_int32(),
45 errcode_ret := ctypes.c_int32()), errcode_ret)
46 check(binary_status.value)
47 check(cl.clBuildProgram(self.program, 1, device.device_id, None, BP_CB(), None)) # NOTE: OSX requires this
48 self.kernel = checked(cl.clCreateKernel(self.program, name.encode(), status := ctypes.c_int32()), status)
49
50 def __del__(self):
51 try: check(cl.clReleaseKernel(self.kernel))
52 except (TypeError, AttributeError): pass
53 try: check(cl.clReleaseProgram(self.program))
54 except (TypeError, AttributeError): pass
55
56 def __call__(self, *bufs:cl.cl_mem, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]|None=None, vals:tuple[int, ...]=(),
57 wait=False, **kw) -> float|None:
58 i = 0
59 for i,b in enumerate(bufs):
60 for real_i, dt in self.arg_dtypes[i]:
61 if isinstance(dt, ImageDType):
62 fmt = cl.cl_image_format(cl.CL_RGBA, {2:cl.CL_HALF_FLOAT, 4:cl.CL_FLOAT}[dt.itemsize])
63 desc = cl.cl_image_desc(cl.CL_MEM_OBJECT_IMAGE2D, dt.shape[1], dt.shape[0], image_row_pitch=dt.pitch, buffer=b)
64 img = checked(cl.clCreateImage(self.dev.context, cl.CL_MEM_READ_WRITE, fmt, desc, None, status:=ctypes.c_int32()), status)
65 check(cl.clSetKernelArg(self.kernel, real_i, ctypes.sizeof(img), ctypes.byref(img)))
66 else: check(cl.clSetKernelArg(self.kernel, real_i, ctypes.sizeof(b), ctypes.byref(b)))
67 for i,v in enumerate(vals,start=i+1): check(cl.clSetKernelArg(self.kernel, i, 4, ctypes.byref(ctypes.c_int32(v))))
68 if local_size is not None: global_size = cast(tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
69 event = cl.cl_event() if wait else None
70 check(cl.clEnqueueNDRangeKernel(self.dev.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size),
71 (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event))
72 if wait:
73 assert event is not None
74 check(cl.clWaitForEvents(1, event))
75 check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_START, 8, ctypes.byref(start := ctypes.c_uint64()), None))
76 check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_END, 8, ctypes.byref(end := ctypes.c_uint64()), None))
77 return float(end.value-start.value) * OSX_TIMING_RATIO * 1e-9
78 return None
79
80class CLAllocator(LRUAllocator['CLDevice']):
81 def _alloc(self, size:int, options:BufferSpec) -> cl.cl_mem:

Callers 13

warp_size2Function · 0.90
reg_countFunction · 0.90
cl_readFunction · 0.90
gflopsFunction · 0.90
loadMethod · 0.90
saveMethod · 0.90
intel_xmx.pyFile · 0.90
test_compile_cachedMethod · 0.90

Calls

no outgoing calls

Tested by 2

test_compile_cachedMethod · 0.72

Used in the wild real call sites across dependent graphs

searching dependent graphs…