diff --git a/kernel/lensed.cl b/kernel/lensed.cl index 8a6bca3..e3ee6cd 100644 --- a/kernel/lensed.cl +++ b/kernel/lensed.cl @@ -6,12 +6,17 @@ #endif // compute image -kernel void render(constant float* data, float4 pcs, - constant float2* qq, constant float2* ww, +kernel void render(ulong dsiz, constant uint* gdata, local uint* ldata, + float4 pcs, constant float2* qq, constant float2* ww, global float* value, global float* error) { // get pixel index - int k = get_global_id(0); + size_t k = get_global_id(0); + + // load data from global to local memory + for(size_t i = get_local_id(0); i < dsiz; i += get_local_size(0)) + ldata[i] = gdata[i]; + barrier(CLK_LOCAL_MEM_FENCE); // compute pixel flux if pixel is in image if(k < IMAGE_SIZE) @@ -24,7 +29,7 @@ kernel void render(constant float* data, float4 pcs, // apply quadrature rule to computed surface brightness for(size_t n = 0; n < QUAD_POINTS; ++n) - f += ww[n]*compute(data, x + qq[n]); + f += ww[n]*compute(ldata, x + qq[n]); // done value[k] = f.s0; diff --git a/objects/devauc.cl b/objects/devauc.cl index c204130..6deb34c 100644 --- a/objects/devauc.cl +++ b/objects/devauc.cl @@ -23,13 +23,13 @@ data float norm; // normalisation }; -static float brightness(constant data* this, float2 x) +static float brightness(local data* this, float2 x) { // de Vaucouleurs profile for centered and rotated coordinate system return this->norm*exp(-DEVAUC_B*sqrt(sqrt(length(mv22(this->t, x - this->x))/this->rs))); } -static void set(global data* this, float x, float y, float r, float mag, float q, float pa) +static void set(local data* this, float x, float y, float r, float mag, float q, float pa) { float c = cos(pa*DEG2RAD); float s = sin(pa*DEG2RAD); diff --git a/objects/epl.cl b/objects/epl.cl index 4ddd9ac..dc8be76 100644 --- a/objects/epl.cl +++ b/objects/epl.cl @@ -23,7 +23,7 @@ data float n; // normalisation }; -static float2 deflection(constant data* this, float2 x) +static float2 deflection(local data* this, float2 x) { float r, phi; float2 a, A; @@ -68,7 +68,7 @@ static float2 deflection(constant data* this, float2 x) return mv22(this->w, a); } -static void set(global data* this, +static void set(local data* this, float x1, float x2, float r, float t, float q, float pa) { float c; diff --git a/objects/epl_plus_shear.cl b/objects/epl_plus_shear.cl index 935bc9c..d18b1a0 100644 --- a/objects/epl_plus_shear.cl +++ b/objects/epl_plus_shear.cl @@ -26,7 +26,7 @@ data float n; // normalisation }; -static float2 deflection(constant data* this, float2 x) +static float2 deflection(local data* this, float2 x) { float r, phi; float2 a, A; @@ -76,7 +76,7 @@ static float2 deflection(constant data* this, float2 x) return y + mv22(this->g, dx); } -static void set(global data* this, +static void set(local data* this, float x1, float x2, float r, float t, float q, float pa, float g1, float g2) { float c; diff --git a/objects/exponential.cl b/objects/exponential.cl index f8b1cf3..36b4011 100644 --- a/objects/exponential.cl +++ b/objects/exponential.cl @@ -18,13 +18,13 @@ data float norm; // normalisation }; -static float brightness(constant data* this, float2 x) +static float brightness(local data* this, float2 x) { // exponential profile for centered and rotated coordinate system return this->norm*exp(-length(mv22(this->t, x - this->x))/this->rs); } -static void set(global data* this, float x, float y, float rs, float mag, float q, float pa) +static void set(local data* this, float x, float y, float rs, float mag, float q, float pa) { float c = cos(pa*DEG2RAD); float s = sin(pa*DEG2RAD); diff --git a/objects/gauss.cl b/objects/gauss.cl index 4b091e9..b78bb64 100644 --- a/objects/gauss.cl +++ b/objects/gauss.cl @@ -18,14 +18,14 @@ data float norm; // normalisation }; -static float brightness(constant data* this, float2 x) +static float brightness(local data* this, float2 x) { // Gaussian profile for centered and rotated coordinate system float2 y = mv22(this->t, x - this->x); return this->norm*exp(-0.5f*dot(y, y)/this->s2); } -static void set(global data* this, float x, float y, float sigma, float mag, float q, float pa) +static void set(local data* this, float x, float y, float sigma, float mag, float q, float pa) { float c = cos(pa*DEG2RAD); float s = sin(pa*DEG2RAD); diff --git a/objects/nsie.cl b/objects/nsie.cl index c993216..9552996 100644 --- a/objects/nsie.cl +++ b/objects/nsie.cl @@ -26,7 +26,7 @@ data float d; }; -static float2 deflection(constant data* this, float2 x) +static float2 deflection(local data* this, float2 x) { float2 y; float r; @@ -45,7 +45,7 @@ static float2 deflection(constant data* this, float2 x) return mv22(this->w, y); } -static void set(global data* this, float x, float y, float r, float rc, float q, float pa) +static void set(local data* this, float x, float y, float r, float rc, float q, float pa) { float c = cos(pa*DEG2RAD); float s = sin(pa*DEG2RAD); diff --git a/objects/nsis.cl b/objects/nsis.cl index 72253f5..e93bb08 100644 --- a/objects/nsis.cl +++ b/objects/nsis.cl @@ -18,7 +18,7 @@ data float rc; // core radius }; -static float2 deflection(constant data* this, float2 x) +static float2 deflection(local data* this, float2 x) { // move to central coordinates x -= this->x; @@ -27,7 +27,7 @@ static float2 deflection(constant data* this, float2 x) return this->r/(this->rc + length(x))*x; } -static void set(global data* this, float x, float y, float r, float rc) +static void set(local data* this, float x, float y, float r, float rc) { // lens position this->x = (float2)(x, y); diff --git a/objects/point_mass.cl b/objects/point_mass.cl index eede74a..21ccbac 100644 --- a/objects/point_mass.cl +++ b/objects/point_mass.cl @@ -15,14 +15,14 @@ data float r2; // Einstein radius squared }; -static float2 deflection(constant data* this, float2 x) +static float2 deflection(local data* this, float2 x) { // point mass deflection x -= this->x; return this->r2/dot(x, x)*x; } -static void set(global data* this, float x, float y, float r) +static void set(local data* this, float x, float y, float r) { // lens position this->x = (float2)(x, y); diff --git a/objects/sersic.cl b/objects/sersic.cl index 1786ef8..d60acf7 100644 --- a/objects/sersic.cl +++ b/objects/sersic.cl @@ -20,13 +20,13 @@ data float m; }; -static float brightness(constant data* this, float2 x) +static float brightness(local data* this, float2 x) { float2 y = mv22(this->t, x - this->x); return exp(this->log0 - exp(this->log1 + this->m*log(dot(y, y)))); } -static void set(global data* this, float x, float y, float r, float mag, float n, float q, float a) +static void set(local data* this, float x, float y, float r, float mag, float n, float q, float a) { float b = 1.9992f*n - 0.3271f; // approximation valid for 0.5 < n < 8 diff --git a/objects/sie.cl b/objects/sie.cl index 45eca36..ae64bdc 100644 --- a/objects/sie.cl +++ b/objects/sie.cl @@ -24,7 +24,7 @@ data float d; }; -static float2 deflection(constant data* this, float2 x) +static float2 deflection(local data* this, float2 x) { float2 y; float r; @@ -43,7 +43,7 @@ static float2 deflection(constant data* this, float2 x) return mv22(this->w, y); } -static void set(global data* this, float x, float y, float r, float q, float pa) +static void set(local data* this, float x, float y, float r, float q, float pa) { float c = cos(pa*DEG2RAD); float s = sin(pa*DEG2RAD); diff --git a/objects/sie_plus_shear.cl b/objects/sie_plus_shear.cl index 2ad5569..fc850b4 100644 --- a/objects/sie_plus_shear.cl +++ b/objects/sie_plus_shear.cl @@ -27,7 +27,7 @@ data float d; }; -static float2 deflection(constant data* this, float2 x) +static float2 deflection(local data* this, float2 x) { float2 y; float r; @@ -46,7 +46,7 @@ static float2 deflection(constant data* this, float2 x) return mv22(this->w, y) + mv22(this->g, x); } -static void set(global data* this, float x, float y, float r, float q, float pa, float g1, float g2) +static void set(local data* this, float x, float y, float r, float q, float pa, float g1, float g2) { float c = cos(pa*DEG2RAD); float s = sin(pa*DEG2RAD); diff --git a/objects/sis.cl b/objects/sis.cl index 47c5c6a..12f1445 100644 --- a/objects/sis.cl +++ b/objects/sis.cl @@ -16,13 +16,13 @@ data float r; // Einstein radius }; -static float2 deflection(constant data* this, float2 x) +static float2 deflection(local data* this, float2 x) { // SIS deflection return this->r*normalize(x - this->x); } -static void set(global data* this, float x, float y, float r) +static void set(local data* this, float x, float y, float r) { // lens position this->x = (float2)(x, y); diff --git a/objects/sis_plus_shear.cl b/objects/sis_plus_shear.cl index d5fdb4c..c9866bd 100644 --- a/objects/sis_plus_shear.cl +++ b/objects/sis_plus_shear.cl @@ -18,7 +18,7 @@ data float r; // Einstein radius }; -static float2 deflection(constant data* this, float2 x) +static float2 deflection(local data* this, float2 x) { // move to central coordinates x -= this->x; @@ -27,7 +27,7 @@ static float2 deflection(constant data* this, float2 x) return this->r*normalize(x) + mv22(this->g, x); } -static void set(global data* this, float x, float y, float r, float g1, float g2) +static void set(local data* this, float x, float y, float r, float g1, float g2) { // lens position this->x = (float2)(x, y); diff --git a/objects/sky.cl b/objects/sky.cl index a0ec63a..cd3e289 100644 --- a/objects/sky.cl +++ b/objects/sky.cl @@ -13,12 +13,12 @@ data float2 grad; }; -static float foreground(constant data* this, float2 x) +static float foreground(local data* this, float2 x) { return this->bg + dot(this->grad, x - (float2)(1, 1)); } -static void set(global data* this, float bg, float dx, float dy) +static void set(local data* this, float bg, float dx, float dy) { this->bg = bg; this->grad = (float2)(dx, dy); diff --git a/src/kernel.c b/src/kernel.c index 7490446..6a116a6 100644 --- a/src/kernel.c +++ b/src/kernel.c @@ -62,7 +62,7 @@ static const char PARSKERN[] = // kernel to compute images static const char COMPHEAD[] = - "static float compute(constant float* data, float2 x)\n" + "static float compute(local uint* data, float2 x)\n" "{\n" " // ray position\n" " float2 y = x;\n" @@ -80,7 +80,7 @@ static const char COMPLHED[] = " // calculate deflection\n" ; static const char COMPLENS[] = - " a += deflection_%s((constant void*)(data + %zu), y);\n" + " a += deflection_%s((local void*)(data + %zu), y);\n" ; static const char COMPDEFL[] = " \n" @@ -93,14 +93,14 @@ static const char COMPSHED[] = " // calculate surface brightness\n" ; static const char COMPSRCE[] = - " f += brightness_%s((constant void*)(data + %zu), y);\n" + " f += brightness_%s((local void*)(data + %zu), y);\n" ; static const char COMPFHED[] = " \n" " // add foreground\n" ; static const char COMPFGND[] = - " f += foreground_%s((constant void*)(data + %zu), x);\n" + " f += foreground_%s((local void*)(data + %zu), x);\n" ; static const char COMPFOOT[] = " \n" @@ -111,13 +111,23 @@ static const char COMPFOOT[] = // kernel to set parameters static const char SETPHEAD[] = - "kernel void set_params(global float* data, constant float* params)\n" + "kernel void set_params(ulong dsiz, global int* gdata, local int* ldata,\n" + " constant float* params)\n" "{\n" + " // load parameters to local memory\n" + " for(size_t i = get_local_id(0); i < dsiz; i += get_local_size(0))\n" + " ldata[i] = gdata[i];\n" + " \n" ; -static const char SETPLEFT[] = " set_%s((global void*)(data + %zu)"; +static const char SETPLEFT[] = " set_%s((local void*)(ldata + %zu)"; static const char SETPARGS[] = ", params[%zu]"; static const char SETPRGHT[] = ");\n"; static const char SETPFOOT[] = + " \n" + " // store parameters to global memory\n" + " for(size_t i = get_local_id(0); i < dsiz; i += get_local_size(0))\n" + " gdata[i] = ldata[i];\n" + " \n" "}\n" ; diff --git a/src/lensed.c b/src/lensed.c index 0173c0b..8dd6519 100644 --- a/src/lensed.c +++ b/src/lensed.c @@ -63,6 +63,7 @@ int main(int argc, char* argv[]) cl_ulong local_mem_size; // buffer for objects + cl_ulong object_size; cl_mem object_mem; // buffers for quadrature rule @@ -676,7 +677,7 @@ int main(int argc, char* argv[]) // create buffer that contains object data { // collect total size of object data, in units of sizeof(cl_float) - size_t object_size = 0; + object_size = 0; for(size_t i = 0; i < inp->nobjs; ++i) object_size += inp->objs[i].size; @@ -706,8 +707,10 @@ int main(int argc, char* argv[]) // set kernel arguments err = 0; - err |= clSetKernelArg(lensed->set_params, 0, sizeof(cl_mem), &object_mem); - err |= clSetKernelArg(lensed->set_params, 1, sizeof(cl_mem), &lensed->params); + err |= clSetKernelArg(lensed->set_params, 0, sizeof(cl_ulong), &object_size); + err |= clSetKernelArg(lensed->set_params, 1, sizeof(cl_mem), &object_mem); + err |= clSetKernelArg(lensed->set_params, 2, object_size*sizeof(cl_uint), NULL); + err |= clSetKernelArg(lensed->set_params, 3, sizeof(cl_mem), &lensed->params); if(err != CL_SUCCESS) error("failed to set kernel arguments for parameters"); } @@ -742,12 +745,14 @@ int main(int argc, char* argv[]) // set kernel arguments err = 0; - err |= clSetKernelArg(lensed->render, 0, sizeof(cl_mem), &object_mem); - err |= clSetKernelArg(lensed->render, 1, sizeof(cl_float4), &pcs4); - err |= clSetKernelArg(lensed->render, 2, sizeof(cl_mem), &qq_mem); - err |= clSetKernelArg(lensed->render, 3, sizeof(cl_mem), &ww_mem); - err |= clSetKernelArg(lensed->render, 4, sizeof(cl_mem), &lensed->value_mem); - err |= clSetKernelArg(lensed->render, 5, sizeof(cl_mem), &lensed->error_mem); + err |= clSetKernelArg(lensed->render, 0, sizeof(cl_ulong), &object_size); + err |= clSetKernelArg(lensed->render, 1, sizeof(cl_mem), &object_mem); + err |= clSetKernelArg(lensed->render, 2, object_size*sizeof(cl_uint), NULL); + err |= clSetKernelArg(lensed->render, 3, sizeof(cl_float4), &pcs4); + err |= clSetKernelArg(lensed->render, 4, sizeof(cl_mem), &qq_mem); + err |= clSetKernelArg(lensed->render, 5, sizeof(cl_mem), &ww_mem); + err |= clSetKernelArg(lensed->render, 6, sizeof(cl_mem), &lensed->value_mem); + err |= clSetKernelArg(lensed->render, 7, sizeof(cl_mem), &lensed->error_mem); if(err != CL_SUCCESS) error("failed to set render kernel arguments");