Example usage:
import cycada.core as cuda import numpy as np import cycada.numpycmd as ncmd import cycada.cmd as cmd import cycada.numpy_ext cuda.init() d = cuda.Device() c = d.create_context(flags = 0, opengl = False) s = c.create_stream() dbuf = c.alloc(256 * 256 * 4) hbuf = c.ndarray_host((256, 256), dtype = 'int32') #hbuf = np.zeros(256 * 256, "int32") write_cmd = ncmd.WriteBufferNDArray(dbuf, hbuf) read_cmd = ncmd.ReadBufferNDArray(hbuf, dbuf) s.enqueue(write_cmd) hbuf.fill(10) print hbuf s.enqueue(read_cmd) s.synchronize() print hbuf r = "../kernel/test.ptx" m = c.load_module(open(r).read()) f = m.get_function("fill") f.parameters = (cuda.MEMParameter, cuda.UINTParameter, cuda.UINTParameter) k_cmd = cmd.RangeKernel(f, (dbuf.__buffer__, 1001, 256 * 256), block_dim = (256, 1, 1), grid_dim = (256, 1, 1)) s.enqueue(k_cmd) s.enqueue(read_cmd) s.synchronize() print hbuf