(model, step: Step)
| 111 | return code |
| 112 | |
def compile_step(model, step: Step) -> str:
    """Generate the JavaScript (WebGPU) source that runs one step of the model.

    The step is traced with ``jit_model`` and lowered with ``compile_net``
    (both defined elsewhere in this file). The returned text declares
    ``var <step.name> = function() {...}`` whose result exposes an async
    ``setup(device, safetensor)``. ``setup`` creates every buffer and compute
    pipeline and returns an async runner taking one ``data<i>`` argument per
    step input. The runner uploads the inputs, dispatches every kernel in
    order, and resolves to a typed array copied from ``output0``.

    Args:
        model: Object whose state dict (via ``get_state_dict``) supplies the
            weight names used to tell weight buffers from scratch buffers.
        step: Step to export; ``step.input`` is passed to ``jit_model`` and
            ``step.name`` becomes the name of the generated JS variable.

    Returns:
        The generated JavaScript source as a string.

    Note:
        Only the first output buffer (``output0``) is read back by the
        generated runner, even if the step produces more outputs.
    """
    linear, output_bufs = jit_model(step, *step.input)
    functions, statements, bufs, _ = compile_net(linear, output_bufs)
    state = get_state_dict(model)

    # Key every realized weight buffer by (identity, offset, size, dtype) so the
    # keys stored in `bufs` can be matched back to a state-dict name.
    weights = {(id(b), b.offset, b.size, b.dtype): name for name, x in state.items() if (b:=x.uop.base.realized) is not None}

    # One JS constant per kernel, holding its shader source in a template literal.
    kernel_code = '\n\n'.join([f"const {key} = `{fixup_code(code, key)}`;" for key, code in functions.items()])
    kernel_names = ', '.join([name for (name, _, _, _) in statements])

    input_names = [f"input{i}" for i in range(len(step.input))]
    output_names = [f"output{i}" for i in range(len(output_bufs))]
    input_buf_types = [dtype_to_js_type(bufs[inp_name][1]) for inp_name in input_names]
    output_buf_types = [dtype_to_js_type(bufs[out_name][1]) for out_name in output_names]

    # The readback code below is hard-wired to the first output.
    output_js_type = output_buf_types[0]
    output_itemsize = bufs[output_names[0]][1].itemsize

    kernel_calls = '\n '.join([
        f"addComputePass(device, commandEncoder, piplines[{i}], [{', '.join(args)}], {global_size});"
        for i, (_name, args, global_size, _local_size) in enumerate(statements)
    ])

    # Buffers backed by a model weight are filled from the safetensor file; all
    # others start empty. Each declaration gets exactly one terminating ';'
    # (the empty-buffer case previously ended in ';;').
    buf_decls = []
    for name, (size, _dtype, key) in bufs.items():
        if key in weights:
            weight_name = weights[key]
            initializer = f"createWeightBuf(device, {size}, getTensorBuffer(safetensor, metadata['{weight_name}'], '{weight_name}'))"
        else:
            initializer = f"createEmptyBuf(device, {size})"
        buf_decls.append(f"const {name} = {initializer};")
    exported_bufs = '\n '.join(buf_decls)

    # One mappable staging buffer per input, used to upload host data.
    gpu_write_bufs = '\n '.join([
        f"const gpuWriteBuffer{i} = device.createBuffer({{size:input{i}.size, usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE }});"
        for i, _ in enumerate(input_names)
    ])

    # Per input: map the staging buffer, copy `data<i>` in, unmap, then record
    # the staging -> input copy on the command encoder.
    input_writer = '\n '.join([
        f"await gpuWriteBuffer{i}.mapAsync(GPUMapMode.WRITE);\n new {js_type}(gpuWriteBuffer{i}.getMappedRange()).set(data{i});"
        f"\n gpuWriteBuffer{i}.unmap();\ncommandEncoder.copyBufferToBuffer(gpuWriteBuffer{i}, 0, input{i}, 0, gpuWriteBuffer{i}.size);"
        for i, js_type in enumerate(input_buf_types)
    ])

    data_params = ",".join([f'data{i}' for i, _ in enumerate(input_names)])

    # Literal JS braces are doubled because this is an f-string. Whitespace in
    # the template only affects the layout of the generated JavaScript.
    return f"""\n var {step.name} = function() {{

{kernel_code}

return {{
"setup": async (device, safetensor) => {{
const metadata = safetensor ? getTensorMetadata(safetensor[0]) : null;

{exported_bufs}

{gpu_write_bufs}
const gpuReadBuffer = device.createBuffer({{ size: output0.size, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ }});

const kernels = [{kernel_names}];
const piplines = await Promise.all(kernels.map(name => device.createComputePipelineAsync({{layout: "auto", compute: {{ module: device.createShaderModule({{ code: name }}), entryPoint: "main" }}}})));

return async ({data_params}) => {{
const commandEncoder = device.createCommandEncoder();

{input_writer}

{kernel_calls}
commandEncoder.copyBufferToBuffer(output0, 0, gpuReadBuffer, 0, output0.size);
const gpuCommands = commandEncoder.finish();
device.queue.submit([gpuCommands]);

await gpuReadBuffer.mapAsync(GPUMapMode.READ);
const resultBuffer = new {output_js_type}(gpuReadBuffer.size/{output_itemsize});
resultBuffer.set(new {output_js_type}(gpuReadBuffer.getMappedRange()));
gpuReadBuffer.unmap();
return resultBuffer;
}}
}}
}}
}}
"""
| 164 | |
| 165 | for step in sub_steps: |
| 166 | print(f'Executing step={step.name}') |
no test coverage detected
searching dependent graphs…