From 509efde3d6a3cf5b22f39733fd2bbd788ced8352 Mon Sep 17 00:00:00 2001 From: nicholst Date: Sat, 17 Jan 2015 17:35:41 +0000 Subject: [PATCH 01/13] Tim's updates: 1) No longer allocate device memory for Phi (used for logistic regression) or for the residuals. This saves quite a bit of memory. 2) Increase the number of subjects allowed to 2000. (I will change this so it is no longer a constant later.) This has been tested with 956 subjects and 15 covariates, requiring 1565 MB of device (GPU) memory (out of the K20c's 4799 MB); 2000 subjects should be feasible. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note: the max # of covariates is 20. The max # of subjects is now 2000, which should take up less than 3300 MB of device memory with 15 covariates. --- bsglmm/covarGPU.cu | 3918 ++++++++++++++++++++++---------------------- bsglmm/mcmc.cpp | 2640 +++++++++++++++-------------- 2 files changed, 3271 insertions(+), 3287 deletions(-) diff --git a/bsglmm/covarGPU.cu b/bsglmm/covarGPU.cu index 8c46122..0b7a52a 100644 --- a/bsglmm/covarGPU.cu +++ b/bsglmm/covarGPU.cu @@ -1,1959 +1,1959 @@ -/* - * covarGPU.cu - * BinCAR - * - * Created by Timothy Johnson on 5/22/12. - * Copyright 2012 University of Michigan. All rights reserved. - * - */ - -#include -#include -#include -#include -#include -#include -#include "binCAR.h" -#include "randgen.h" -#include "cholesky.h" -#include - -cudaError_t err; - -extern float logit_factor; -extern float t_df; -extern int MODEL; -extern int NSUBS; -extern int NROW; -extern int NCOL; -extern int NDEPTH; -extern int TOTVOX; -extern int TOTVOXp; -extern int NCOVAR; -extern int NCOV_FIX; -extern int NSUBTYPES; -//extern int NSIZE; -extern int iter; -extern INDEX *INDX; -extern float *deviceCovar; -extern float *hostCovar; -extern float *deviceCov_Fix; -extern float *hostCov_Fix; -extern float *deviceAlphaMean; -extern float *deviceFixMean; -extern float *deviceZ; -extern float *devicePhi; -extern float *deviceWM; -extern unsigned int *deviceResid; -extern int *deviceIdx; -extern int *deviceIdxSC; -extern float *devicePrb; -extern float *devicePredict; -extern float *XXprime; -extern float *XXprime_Fix; -extern int *hostIdx; -extern int *hostIdxSC; -//extern float *deviceChiSqHist; -//extern int ChiSqHist_N; - -const int NN=256; -const int BPG=32; -extern curandState *devStates; -extern unsigned char *deviceData; -//extern short *deviceData; -extern float *deviceSpatCoef; -__constant__ float dPrec[20*20]; -__constant__ float dtmpvar[20*20]; -__constant__ float dalphaMean[20]; -__constant__ float dalphaPrec[20]; - -const int TPB=192; -const int TPB2=96; - -#define idx(i,j,k) (i + (NROW+2)*j + (NROW+2)*(NCOL+2)*k) - -#define CUDA_CALL(x) {const cudaError_t a = (x); if (a != cudaSuccess) {printf("\nCUDA Error: %s (err_num=%d) \n",cudaGetErrorString(a),a);cudaDeviceReset();assert(0);}} - -__global__ void setup_kernel ( curandState * state, unsigned long long devseed, const int N ) -{ -/* Each thread gets same seed , a different sequence number, no offset */ - - int idx = threadIdx.x + blockIdx.x * blockDim.x; - - while (idx < N) { - curand_init (devseed , idx , 0 , & state [ idx ]) ; - idx += blockDim.x *gridDim.x; - } -} - - -__global__ void reduce(float *in,float *out,const int N) -{ - __shared__ float cache[NN]; - int cacheIndex = threadIdx.x; - int ivox = threadIdx.x + blockIdx.x * blockDim.x; - - float c = 0; - float y,t; - float temp = 0; - while (ivox < N) { - y = in[ivox] - c; - t = temp + y; - c = (t - temp) - y; - temp = t; - ivox += blockDim.x *
gridDim.x; - } - - cache[cacheIndex] = temp; - - __syncthreads(); - - int i = (blockDim.x >> 1); - while (i != 0) { - if (cacheIndex < i) - cache[cacheIndex] += cache[cacheIndex + i]; - __syncthreads(); - i = i >> 1; - } - - if (cacheIndex == 0) - out[blockIdx.x] = cache[0]; -} - - -__device__ int cholesky_decompGPU(float *A, int num_col) -{ - int k,i,j; - - for (k=0;k=0;i--) { - temp = 0.0; - for (j=i+1;j NSUBS) - localsize = NSUBS - isub; - - if ((isub+localid) < NSUBS) { - for (int j=0;j NSUBS) - localsize = NSUBS - isub; - - if ((isub+localid) < NSUBS) { - for (int j=0;j NSUBS) - localsize = NSUBS - isub; - - if ((isub+localid) < NSUBS) { - for (int j=0;j>>(deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean,deviceZ, - devicePhi,deviceWM,deviceSpatCoef,beta,INDX[i].deviceNBRS,INDX[i].deviceVox,NSUBS,NROW,NCOL,NSUBTYPES,NCOVAR,NCOV_FIX,INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC); - if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef");exit(0);} - } - else { - Spat_Coef_Probit<<>>(deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean,deviceZ,deviceWM,deviceSpatCoef,beta,INDX[i].deviceNBRS,INDX[i].deviceVox,NSUBS,NROW,NCOL,NSUBTYPES,NCOVAR,NCOV_FIX,INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC); - if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef_Probit");exit(0);} - } - - } - - - for (this_one=0;this_one>>(&deviceSpatCoef[this_one*TOTVOXp],d_sum,TOTVOXp); - - h_sum = (float *)calloc(BPG,sizeof(float)); - - cudaMemcpy(h_sum,d_sum,BPG*sizeof(float),cudaMemcpyDeviceToHost); - - cmean = 0; - for (int j=0;j>>(&(deviceSpatCoef[this_one*TOTVOXp]),INDX[i].hostN,INDX[i].deviceVox,cmean,deviceIdxSC); - alphaMean[this_one] = cmean/logit_factor; - } -} - - - -__global__ void betaMGPU(float *dcovar,float *dcovF,float *dZ,float *dspatCoef,float *fix,float *dalpha,float *dWM,const int N,const int TOTVOXp,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int TOTVOX,int * vox,float *dmean,int *dIdx,int *dIdxSC) -{ - __shared__ float cache[NN]; - int cacheIndex = threadIdx.x; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x*gridDim.x; - - float c = 0; - float y,t; - float temp = 0; - float SC[20]; - float mean=0; - int isub,ii,voxel,IDX,IDXSC; - while (idx < N) { - voxel = vox[idx]; - IDX = dIdx[voxel]; - IDXSC = dIdxSC[voxel]; - - for (ii=0;ii> 1); - while (i != 0) { - if (cacheIndex < i) - cache[cacheIndex] += cache[cacheIndex + i]; - - __syncthreads(); - i = i >> 1; - } - - if (cacheIndex == 0) - dmean[blockIdx.x] = cache[0]; -} - - -__global__ void betaVGPU(float *dWM,const int N,int *vox,float *dvar,int *dIdx) -{ - __shared__ float cache[NN]; - int cacheIndex = threadIdx.x; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x*gridDim.x; - - float c = 0; - float y,t; - float temp = 0; - - int voxel,IDX; - while (idx < N) { - voxel = vox[idx]; - IDX = dIdx[voxel]; - y = dWM[IDX]*dWM[IDX] - c; - t = temp + y; - c = (t - temp) - y; - temp = t; - - idx += stride; - } - - cache[cacheIndex] = temp; - - __syncthreads(); - - int i = (blockDim.x >> 1); - while (i != 0) { - if (cacheIndex < i) { - cache[cacheIndex] += cache[cacheIndex + i]; - } - __syncthreads(); - i = i >> 1; - } - - if (cacheIndex == 0) { - dvar[blockIdx.x] = cache[0]; - } - -} - -void updateBetaGPU(float *beta,float *dZ,float *dPhi,float *dalpha,float *dWM,float prior_mean_beta,float prior_prec_beta,float 
*dspatCoef,float *dcovar,unsigned long *seed) -{ - float *dmean,*dvar,*hmean,*hvar; - float mean,var,tmp; - - hmean = (float *)calloc(BPG,sizeof(float)); - hvar = (float *)calloc(BPG,sizeof(float)); - cudaMalloc((void **)&dmean,BPG*sizeof(float)) ; - cudaMalloc((void **)&dvar,BPG*sizeof(float)) ; - - mean = var = 0; - for (int i=0;i<2;i++) { - betaMGPU<<>>(dcovar,deviceCov_Fix,dZ,dspatCoef,deviceFixMean,deviceAlphaMean,deviceWM,INDX[i].hostN,TOTVOXp,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX,TOTVOX,INDX[i].deviceVox,dmean,deviceIdx,deviceIdxSC); - cudaMemcpy(hmean,dmean,BPG*sizeof(float),cudaMemcpyDeviceToHost); - - for (int j=0;j>>(deviceWM,INDX[i].hostN,INDX[i].deviceVox,dvar,deviceIdx); - cudaMemcpy(hvar,dvar,BPG*sizeof(float),cudaMemcpyDeviceToHost); - - for (int j=0;j> 1); - while (i != 0) { - if (cacheIndex < i) - cache[cacheIndex] += cache[cacheIndex + i]; - - __syncthreads(); - i = i >> 1; - } - - if (cacheIndex == 0) - dmean[blockIdx.x] = cache[0]; -} - -void updateAlphaMeanGPU(float *alphaMean,float Prec,float *dcovar,float *dZ,float *dspatCoef,float beta,unsigned long *seed) -{ - double *mean,*V,*tmpmean; - float *hmean,*dmean; - - mean = (double *)calloc(NCOVAR,sizeof(double)); - tmpmean = (double *)calloc(NCOVAR,sizeof(double)); - V = (double *)calloc(NCOVAR*NCOVAR,sizeof(double)); - - for (int i=0;i>>(dcovar,dZ,dspatCoef,deviceWM,beta,INDX[i].hostN,TOTVOXp,NSUBS,NSUBTYPES,NCOVAR,TOTVOX,INDX[i].deviceVox, - dmean,deviceIdx,deviceIdxSC,icovar); - cudaMemcpy(hmean,dmean,BPG*sizeof(float),cudaMemcpyDeviceToHost); - - for (int j=0;j> 1); - while (i != 0) { - if (cacheIndex < i) - cache[cacheIndex] += cache[cacheIndex + i]; - - __syncthreads(); - i = i >> 1; - } - - if (cacheIndex == 0) - dmean[blockIdx.x] = cache[0]; -} - -void updatefixMeanGPU(float *fixMean,float Prec,float *dcovar,float *dZ,float *dspatCoef,float beta,unsigned long *seed) -{ - double *mean,*V,*tmpmean; - float *hmean,*dmean; - - mean = (double *)calloc(NCOV_FIX,sizeof(double)); - tmpmean = (double *)calloc(NCOV_FIX,sizeof(double)); - V = (double *)calloc(NCOV_FIX*NCOV_FIX,sizeof(double)); - - for (int i=0;i>>(deviceCov_Fix,dcovar,dZ,dspatCoef,deviceWM,beta,INDX[i].hostN,TOTVOXp,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX,TOTVOX,INDX[i].deviceVox, - dmean,deviceIdx,deviceIdxSC,icovar); - cudaMemcpy(hmean,dmean,BPG*sizeof(float),cudaMemcpyDeviceToHost); - - for (int j=0;j> 1); - while (i != 0) { - if (cacheIndex < i) - cache[cacheIndex] += cache[cacheIndex + i]; - __syncthreads(); - i = i >> 1; - } - - if (cacheIndex == 0) - dbeta[blockIdx.x] = cache[0]; -} - -void updateSpatCoefPrecGPU_Laplace(float *SpatCoefPrec,unsigned long *seed) -{ - float *hbeta; - double *var; - float *dbeta; - double betaSqr = 2.0;//0.2; -// double tau; - - var = (double *)calloc(NCOVAR*NCOVAR,sizeof(double)); - hbeta = (float*)calloc(BPG,sizeof(float)); - cudaMalloc((void **)&dbeta,BPG*sizeof(float)); - cudaMemset(dbeta,0,BPG*sizeof(float)); - - for (int ii=0;ii>>(&(deviceSpatCoef[ii*TOTVOXp]),&(deviceSpatCoef[ii*TOTVOXp]),INDX[i].hostN,NROW,NCOL,INDX[i].deviceVox,dbeta,deviceIdxSC); - if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef_Prec");exit(0);} - CUDA_CALL( cudaMemcpy(hbeta,dbeta,BPG*sizeof(float),cudaMemcpyDeviceToHost) ); - for (int j=0;j>>(&(deviceSpatCoef[ii*TOTVOXp]),&(deviceSpatCoef[jj*TOTVOXp]),INDX[i].hostN,NROW,NCOL,INDX[i].deviceVox,dbeta,deviceIdxSC); - if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef_Prec");exit(0);} - 
CUDA_CALL( cudaMemcpy(hbeta,dbeta,BPG*sizeof(float),cudaMemcpyDeviceToHost) ); - for (int j=0;ja */ - float x,a_star; - int stop; -// double rnorm(double,double,unsigned long *); -// double kiss(unsigned long *); - - - if (a<0.0f) - { - stop=0; - while (stop==0) - { - x = curand_normal(state); - if( x>a ) stop=1; - else stop=0; - } - } - else - { - a_star=0.5f*(a+sqrtf(a*a+4.0f)); - stop=0; - while (stop==0) - { - x=a-__logf(curand_uniform(state))/a_star; - if( __logf(curand_uniform(state))a+3*__expf(-a*a-1/(a*a))/(a+sqrtf(a*a+4.0f))) - { - stop=0; - while (stop==0) - { - x=truncNormLeft(a,state); - if( x < b ) stop=1; - else stop = 0; - } - } - else - { - boo=0.0f; - if (b<0.0f) - boo=b*b; - if (a>0.0f) - boo=a*a; - - - stop=0; - while (stop==0) - { - x=(b-a)*curand_uniform(state)+a; - boun=boo-x*x; - if( 2.0f*__logf(curand_uniform(state))0.0f && num<0.0f ) || ( sign<0.0f && num>0.0f ) ) - return -num; - else return num; -} - -__device__ float sgammaGPU(float a, curandState *state) { - const float q1 = 0.0416666664f; - const float q2 = 0.0208333723f; - const float q3 = 0.0079849875f; - const float q4 = 0.0015746717f; - const float q5 = -0.0003349403f; - const float q6 = 0.0003340332f; - const float q7 = 0.0006053049f; - const float q8 = -0.0004701849f; - const float q9 = 0.0001710320f; - const float a1 = 0.333333333f; - const float a2 = -0.249999949f; - const float a3 = 0.199999867f; - const float a4 = -0.166677482f; - const float a5 = 0.142873973f; - const float a6 = -0.124385581f; - const float a7 = 0.110368310f; - const float a8 = 0.112750886f; - const float a9 = 0.104089866f; - const float e1 = 1.000000000f; - const float e2 = 0.499999994f; - const float e3 = 0.166666848f; - const float e4 = 0.041664508f; - const float e5 = 0.008345522f; - const float e6 = 0.001353826f; - const float e7 = 0.000247453f; - float aa = 0.0f; - float aaa = 0.0f; - const float sqrt32 = 5.65685424949238f; - float sgamma,s2,s,d,t,x,u,r,q0,b,si,c,v,q,e,w,p; - - if(a == aa) - goto S2; - if(a < 1.0f) - goto S13; - -//S1: // STEP 1: RECALCULATIONS OF S2,S,D IF A HAS CHANGED - aa = a; - s2 = a-0.5f; - s = sqrtf(s2); - d = sqrt32-12.0f*s; - -S2: // STEP 2: T=STANDARD NORMAL DEVIATE, X=(S,1/2)-NORMAL DEVIATE. IMMEDIATE ACCEPTANCE (I) - t = curand_normal(state); - x = s+0.5f*t; - sgamma = x*x; - if(t >= 0.0f) - return sgamma; - -//S3: // STEP 3: U= 0,1 -UNIFORM SAMPLE. SQUEEZE ACCEPTANCE (S) - u = curand_uniform(state); - if(d*u <= t*t*t) - return sgamma; - -//S4: // STEP 4: RECALCULATIONS OF Q0,B,SI,C IF NECESSARY - if (a != aaa) { - aaa = a; - r = 1.0f/ a; - q0 = r*(q1+r*(q2+r*(q3+r*(q4+r*(q5+r*(q6+r*(q7+r*(q8+r*q9)))))))); - if (a <= 3.686f) { - b = 0.463f+s+0.178f*s2; - si = 1.235f; - c = 0.195f/s-0.079f+0.16f*s; - } - else if (a <= 13.022f) { - b = 1.654f+0.0076f*s2; - si = 1.68f/s+0.275f; - c = .062/s+0.024f; - } - else { - b = 1.77f; - si = 0.75f; - c = 0.1515f/s; - } - } - -//S5: // NO QUOTIENT TEST IF X NOT POSITIVE - if(x <= 0.0f) - goto S8; - -//S6: // CALCULATION OF V AND QUOTIENT Q - v = t/(s+s); - if(fabsf(v) > 0.25f) - q = q0-s*t+0.25f*t*t+(s2+s2)*log1pf(v); - else - q = q0+0.5f*t*t*v*(a1+v*(a2+v*(a3+v*(a4+v*(a5+v*(a6+v*(a7+v*(a8+v*a9)))))))); - -//S7: // QUOTIENT ACCEPTANCE (Q) - if(log1pf(-u) <= q) return sgamma; - -S8: // E=STANDARD EXPONENTIAL DEVIATE U= 0,1 -UNIFORM DEVIATE T=(B,SI)-DOUBLE EXPONENTIAL (LAPLACE) SAMPLE - e = rexpGPU(1.0f,state); - u = curand_uniform(state); - u += (u-1.0f); - t = b+fsignfGPU(si*e,u); - -//S9: // REJECTION IF T .LT. 
TAU(1) = -.71874483771719 - if(t <= -0.71874483771719f) - goto S8; - -//S10: // CALCULATION OF V AND QUOTIENT Q - v = t/(s+s); - if(fabsf(v) > 0.25f) - q = q0-s*t+0.25f*t*t+(s2+s2)*log1pf(v); - else - q = q0+0.5f*t*t*v*(a1+v*(a2+v*(a3+v*(a4+v*(a5+v*(a6+v*(a7+v*(a8+v*a9)))))))); - -//S11: // HAT ACCEPTANCE (H) (IF Q NOT POSITIVE GO TO STEP 8) - if (q <= 0.5f) - w = q*(e1+q*(e2+q*(e3+q*(e4+q*(e5+q*(e6+q*e7)))))); - else - w = expf(q)-1.0f; - if(q <= 0.0f || c*fabs(u) > w*expf(e-0.5f*t*t)) - goto S8; -//S12: - x = s+0.5f*t; - sgamma = x*x; - return sgamma; - -S13: // ALTERNATE METHOD FOR PARAMETERS A BELOW 1 (.3678794=EXP(-1.)) - aa = 0.0f; - b = 1.0f+0.3678794f*a; - -S14: - p = b*curand_uniform(state); - if(p >= 1.0f) - goto S15; - sgamma = expf(logf(p)/ a); - if(rexpGPU(1.0f,state) < sgamma) - goto S14; - return sgamma; - -S15: - sgamma = -logf((b-p)/ a); - if(rexpGPU(1.0f,state) < (1.0f-a)*logf(sgamma)) - goto S14; - return sgamma; - -} - - -__global__ void Z_GPU1(float *dZ,unsigned char *data,float *covar,float *covarF,float *fix,float *alpha,float *WM,float *SpatCoef,const float beta,int *vox,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int N,const int TOTVOXp,const int TOTVOX,curandState *state,int *dIdx,int *dIdxSC,const float sqrt2,unsigned int *dR) -{ - extern __shared__ float shared[]; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int localsize = blockDim.x; - int localid = threadIdx.x; - - float mean[300]; - float tmp=0; - float SC[20]; - int voxel=0,IDX=0,IDXSC=0; - curandState localState; - - if (idx < N) { - voxel = vox[idx]; - IDX = dIdx[voxel]; - IDXSC = dIdxSC[voxel]; - - tmp = beta*WM[IDX]; - - for (int ii=0;ii NSUBS) - localsize = NSUBS - isub; - - if ((isub+localid) < NSUBS) { - for (int j=0;j 4.0f); - } - state[idx] = localState; - } -} - -__global__ void Z_GPU2(float *dZ,unsigned char *data,float *covar,float *covarF,float *fix,float *alpha,float *WM,float *SpatCoef,const float beta,int *vox,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int N,const int TOTVOXp,const int TOTVOX,curandState *state,int *dIdx,int *dIdxSC,const float sqrt2,float *dPhi,unsigned int *dR,const float alp,const float df) -{ - extern __shared__ float shared[]; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int localsize = blockDim.x; - int localid = threadIdx.x; - - float a = alp; - float mean[300]; - float tmp=0; - float SC[20]; - int voxel=0,IDX=0,IDXSC=0; - curandState localState; - - if (idx < N) { - voxel = vox[idx]; - IDX = dIdx[voxel]; - IDXSC = dIdxSC[voxel]; - - tmp = beta*WM[IDX]; - - for (int ii=0;ii NSUBS) - localsize = NSUBS - isub; - - if ((isub+localid) < NSUBS) { - for (int j=0;j 4.0f); - - b = Z - M; - b = (df + b*b)/2.0f; - P = sgammaGPU(a,&localState)/b; - - dZ[isub*TOTVOX+IDX] = Z; - dPhi[isub*TOTVOX+IDX] = P; - } - state[idx] = localState; - } -} - - -__global__ void Phi_GPU(float *dZ,unsigned char *data,float *covar,float *covarF,float *fix,float *alpha,float *WM,float *SpatCoef,const float beta,int *vox,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int N,const int TOTVOXp,const int TOTVOX,curandState *state,int *dIdx,int *dIdxSC,const float sqrt2,float *dPhi,const float alp,const float df) -{ - extern __shared__ float shared[]; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int localsize = blockDim.x; - int localid = threadIdx.x; - - const float a = alp; - float beta2[300]; - float tmp=0; - float SC[20]; - int voxel=0,IDX=0,IDXSC=0; - curandState localState; - - 
if (idx < N) { - voxel = vox[idx]; - IDX = dIdx[voxel]; - IDXSC = dIdxSC[voxel]; - - tmp = beta*WM[IDX]; - - for (int ii=0;ii NSUBS) - localsize = NSUBS - isub; - - if ((isub+localid) < NSUBS) { - for (int j=0;j N) ? INDX[i].hostN:N; - - if (MODEL != 1) { - - for (int i=0;i<2;i++) { - int thr_per_block = TPB2; - int shared_memory = (NCOVAR * thr_per_block) * sizeof(float); - int blck_per_grid = (INDX[i].hostN+thr_per_block-1)/thr_per_block; - - Phi_GPU<<>>(deviceZ,deviceData,deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean, - deviceWM,deviceSpatCoef,beta,INDX[i].deviceVox,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX, - INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC,sqrt2,devicePhi,alpha,t_df); - if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Phi_GPU");exit(0);} - } - - } -} - -void updatePhi(unsigned char *data,float *Z,float *Phi,unsigned char *msk,float beta,float *WM,float *spatCoef,float *covar,unsigned long *seed) -{ - double alpha = ((double)t_df + 1)/2; - unsigned char d; - double beta2,mx= -10,mZ=-10,mtmp,mZ2=-10; - - CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,TOTVOXp*NCOVAR*sizeof(float),cudaMemcpyDeviceToHost) ); - CUDA_CALL( cudaMemcpy(Z,deviceZ,TOTVOX*NSUBS*sizeof(float),cudaMemcpyDeviceToHost) ); - for (int i=0;i<2;i++) { - for (int j=0;jfabs(Z[isub*TOTVOX+IDX])) ? mZ2:fabs(Z[isub*TOTVOX+IDX]); - beta2 = ((double)t_df + beta2*beta2)/2.0; - Phi[isub*TOTVOX+IDX] = (float)rgamma(alpha,beta2,seed); - // Phi[isub*TOTVOX+IDX] = (float)rgamma(0.5*(double)t_df,0.5*(double)t_df,seed); - // printf("%lf\n",Phi[isub*TOTVOX+IDX]); - } - } - } - CUDA_CALL( cudaMemcpy(devicePhi,Phi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyHostToDevice) ); - printf("beta2_max = %lf %lf %lf\t %lf %d\n",mx,mZ,mtmp,mZ2,(int)d);fflush(NULL); -} - -void updateZGPU(unsigned char *data,float beta,unsigned long *seed) -{ - float alpha = (t_df + 1)/2; - float sqrt2 = sqrtf(2); -// float *rchisq,*drchisq; -// float dfdiv2 = t_df/2; - int N=0; - for (int i=0;i<2;i++) - N = (INDX[i].hostN > N) ? 
INDX[i].hostN:N; - - if (MODEL != 1) { -// rchisq = (float *)malloc(N*sizeof(float)); -// cudaMalloc((void **)&drchisq,N*sizeof(float)); - - for (int i=0;i<2;i++) { -// for (int j=0;j>>(deviceZ,deviceData,deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean, - deviceWM,deviceSpatCoef,beta,INDX[i].deviceVox,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX, - INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC,sqrt2,devicePhi,deviceResid,alpha,t_df); - if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Z_GPU2");exit(0);} - } -// free(rchisq); -// cudaFree(drchisq); - } - else { - - for (int i=0;i<2;i++) { - int thr_per_block = TPB2; - int shared_memory = (NCOVAR * thr_per_block) * sizeof(float); - int blck_per_grid = (INDX[i].hostN+thr_per_block-1)/thr_per_block; - - Z_GPU1<<>>(deviceZ,deviceData,deviceCovar,deviceCov_Fix,deviceFixMean, - deviceAlphaMean,deviceWM,deviceSpatCoef,beta,INDX[i].deviceVox,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX, - INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC,sqrt2,deviceResid); - if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Z_GPU1");exit(0);} - - } - } -} - - -__device__ inline float dProbSDNormGPU(const float x) -//Calculate the cumulative probability of standard normal distribution; -{ - const float b1 = 0.319381530f; - const float b2 = -0.356563782f; - const float b3 = 1.781477937f; - const float b4 = -1.821255978f; - const float b5 = 1.330274429f; - const float p = 0.2316419f; - const float c = 0.39894228f; - float t,sgnx,out,y; - - sgnx = copysignf(1.0f,x); - y = sgnx*x; - - t = 1.0f / ( 1.0f + p * y ); - - out = ((x >= 0.0f) - sgnx*c * expf( -y * y / 2.0f ) * t * - ( t *( t * ( t * ( t * b5 + b4 ) + b3 ) + b2 ) + b1 )); - - return out; -} - -__global__ void ProbLogitGPU(float *prb,float *alpha,float *SpatCoef,float beta,float *WM,int *dIdx,int *dIdxSC,int *vox,const int TOTVOX,const int TOTVOXp,const int NSUBTYPES,const int N,const float logit_factor) -{ - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x*gridDim.x; - - float x; - float bwhtmat=0; - int voxel,ii,IDX,IDXSC; - - while (idx < N) { - voxel = vox[idx]; - IDX = dIdx[voxel]; - IDXSC = dIdxSC[voxel]; - - bwhtmat = beta*WM[IDX]; - - for (ii=0;ii>>(devicePrb,deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp,NSUBTYPES,INDX[i].hostN,logit_factor); - } - else { - for (int i=0;i<2;i++) - ProbSDNormGPU<<<(INDX[i].hostN+511)/512, 512 >>>(devicePrb,deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp,NSUBTYPES,INDX[i].hostN); - } -} - -__global__ void reduce_for_Mpredict(float *in,float *out,unsigned char *data,const int N) -{ - __shared__ float cache[NN]; - int cacheIndex = threadIdx.x; - int ivox = threadIdx.x + blockIdx.x * blockDim.x; - float c = 0; - float y,t; - float temp = 0; - - while (ivox < N) { - int tmp = (int)data[ivox]; - float inv = in[ivox]; - y = tmp*logf(inv + 1E-35f) + (1 - tmp)*logf(1.0f - inv + 1E-35f) - c; - t = temp + y; - c = (t - temp) - y; - temp = t; - ivox += blockDim.x * gridDim.x; - } - - cache[cacheIndex] = temp; - - __syncthreads(); - - int i = (blockDim.x >> 1); - while (i != 0) { - if (cacheIndex < i) - cache[cacheIndex] += cache[cacheIndex + i]; - __syncthreads(); - i = i >> 1; - } - - if (cacheIndex == 0) - out[blockIdx.x] = cache[0]; -} - -double compute_predictGPU(float beta,unsigned char *data,unsigned char *msk,float *covar,float 
*predict,double **Qhat,int first) -{ - double *Mpredict,DIC; - float *hf_sum,*df_sum; - - Mpredict = (double *)calloc(NSUBTYPES,sizeof(double)); - - CUDA_CALL( cudaMalloc((void **)&df_sum,BPG*sizeof(float)) ); - hf_sum = (float *)calloc(BPG,sizeof(float)); - - DIC = 0; - for (int isub=0;isub>>(deviceCovar,deviceCov_Fix,devicePredict,deviceFixMean, - deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp, - NSUBTYPES,NCOVAR,NSUBS,NCOV_FIX,isub,INDX[i].hostN,logit_factor); - } - } - else { - for (int i=0;i<2;i++) { - ProbSDNormGPU_prediction<<<(INDX[i].hostN+511)/512, 512 >>>(deviceCovar,deviceCov_Fix,devicePredict,deviceFixMean, - deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp, - NSUBTYPES,NCOVAR,NSUBS,NCOV_FIX,isub,INDX[i].hostN); - } - } - - // Compute Bayes Factor -/* reduce<<>>(devicePredict,df_sum,NSUBTYPES*TOTVOX); - - cudaMemcpy(hf_sum,df_sum,BPG*sizeof(float),cudaMemcpyDeviceToHost); - - for (int j=0;j>>(&devicePredict[ii*TOTVOX],df_sum,&deviceData[isub*TOTVOX],TOTVOX); - CUDA_CALL( cudaMemcpy(hf_sum,df_sum,BPG*sizeof(float),cudaMemcpyDeviceToHost) ); - - for (int j=0;j Qhat[isub][i]) ? (Mpredict[i] - Mpredict[subtype]) : Qhat[isub][i]; - Qhat[isub][i] = log(exp(Qhat[isub][i] - max) + exp((Mpredict[i] - Mpredict[subtype]) - max)) + max; - } - } - else { - double max = -DBL_MAX + 0.5; - for (int i=0;i Qhat[isub][i]) ? (Mpredict[i] - Mpredict[subtype]) : Qhat[isub][i]; - Qhat[isub][i] = log(exp(Qhat[isub][i] - max) + exp((Mpredict[i] - Mpredict[subtype]) - max)) + max; - } - } - } - - free(Mpredict); - CUDA_CALL( cudaFree(df_sum) ); - free(hf_sum); - - return(DIC); -} - - +/* + * covarGPU.cu + * BinCAR + * + * Created by Timothy Johnson on 5/22/12. + * Copyright 2012 University of Michigan. All rights reserved. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include "binCAR.h" +#include "randgen.h" +#include "cholesky.h" +#include + +cudaError_t err; + +extern float logit_factor; +extern float t_df; +extern int MODEL; +extern int NSUBS; +extern int NROW; +extern int NCOL; +extern int NDEPTH; +extern int TOTVOX; +extern int TOTVOXp; +extern int NCOVAR; +extern int NCOV_FIX; +extern int NSUBTYPES; +//extern int NSIZE; +extern int iter; +extern INDEX *INDX; +extern float *deviceCovar; +extern float *hostCovar; +extern float *deviceCov_Fix; +extern float *hostCov_Fix; +extern float *deviceAlphaMean; +extern float *deviceFixMean; +extern float *deviceZ; +extern float *devicePhi; +extern float *deviceWM; +extern unsigned int *deviceResid; +extern int *deviceIdx; +extern int *deviceIdxSC; +extern float *devicePrb; +extern float *devicePredict; +extern float *XXprime; +extern float *XXprime_Fix; +extern int *hostIdx; +extern int *hostIdxSC; +//extern float *deviceChiSqHist; +//extern int ChiSqHist_N; + +const int NN=256; +const int BPG=32; +extern curandState *devStates; +extern unsigned char *deviceData; +//extern short *deviceData; +extern float *deviceSpatCoef; +__constant__ float dPrec[20*20]; +__constant__ float dtmpvar[20*20]; +__constant__ float dalphaMean[20]; +__constant__ float dalphaPrec[20]; + +const int TPB=192; +const int TPB2=96; + +#define idx(i,j,k) (i + (NROW+2)*j + (NROW+2)*(NCOL+2)*k) + +#define CUDA_CALL(x) {const cudaError_t a = (x); if (a != cudaSuccess) {printf("\nCUDA Error: %s (err_num=%d) \n",cudaGetErrorString(a),a);cudaDeviceReset();assert(0);}} + +__global__ void setup_kernel ( curandState * state, unsigned long long devseed, const int N ) +{ +/* Each thread gets same seed , a different sequence number, no offset */ + + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + while (idx < N) { + curand_init (devseed , idx , 0 , & state [ idx ]) ; + idx += blockDim.x *gridDim.x; + } +} + + +__global__ void reduce(float *in,float *out,const int N) +{ + __shared__ float cache[NN]; + int cacheIndex = threadIdx.x; + int ivox = threadIdx.x + blockIdx.x * blockDim.x; + + float c = 0; + float y,t; + float temp = 0; + while (ivox < N) { + y = in[ivox] - c; + t = temp + y; + c = (t - temp) - y; + temp = t; + ivox += blockDim.x * gridDim.x; + } + + cache[cacheIndex] = temp; + + __syncthreads(); + + int i = (blockDim.x >> 1); + while (i != 0) { + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + __syncthreads(); + i = i >> 1; + } + + if (cacheIndex == 0) + out[blockIdx.x] = cache[0]; +} + + +__device__ int cholesky_decompGPU(float *A, int num_col) +{ + int k,i,j; + + for (k=0;k=0;i--) { + temp = 0.0; + for (j=i+1;j NSUBS) + localsize = NSUBS - isub; + + if ((isub+localid) < NSUBS) { + for (int j=0;j NSUBS) + localsize = NSUBS - isub; + + if ((isub+localid) < NSUBS) { + for (int j=0;j NSUBS) + localsize = NSUBS - isub; + + if ((isub+localid) < NSUBS) { + for (int j=0;j>>(deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean,deviceZ, + devicePhi,deviceWM,deviceSpatCoef,beta,INDX[i].deviceNBRS,INDX[i].deviceVox,NSUBS,NROW,NCOL,NSUBTYPES,NCOVAR,NCOV_FIX,INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC); + if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef");exit(0);} + } + else { + 
Spat_Coef_Probit<<>>(deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean,deviceZ,deviceWM,deviceSpatCoef,beta,INDX[i].deviceNBRS,INDX[i].deviceVox,NSUBS,NROW,NCOL,NSUBTYPES,NCOVAR,NCOV_FIX,INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC); + if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef_Probit");exit(0);} + } + + } + + + for (this_one=0;this_one>>(&deviceSpatCoef[this_one*TOTVOXp],d_sum,TOTVOXp); + + h_sum = (float *)calloc(BPG,sizeof(float)); + + cudaMemcpy(h_sum,d_sum,BPG*sizeof(float),cudaMemcpyDeviceToHost); + + cmean = 0; + for (int j=0;j>>(&(deviceSpatCoef[this_one*TOTVOXp]),INDX[i].hostN,INDX[i].deviceVox,cmean,deviceIdxSC); + alphaMean[this_one] = cmean/logit_factor; + } +} + + + +__global__ void betaMGPU(float *dcovar,float *dcovF,float *dZ,float *dspatCoef,float *fix,float *dalpha,float *dWM,const int N,const int TOTVOXp,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int TOTVOX,int * vox,float *dmean,int *dIdx,int *dIdxSC) +{ + __shared__ float cache[NN]; + int cacheIndex = threadIdx.x; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x*gridDim.x; + + float c = 0; + float y,t; + float temp = 0; + float SC[20]; + float mean=0; + int isub,ii,voxel,IDX,IDXSC; + while (idx < N) { + voxel = vox[idx]; + IDX = dIdx[voxel]; + IDXSC = dIdxSC[voxel]; + + for (ii=0;ii> 1); + while (i != 0) { + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + + __syncthreads(); + i = i >> 1; + } + + if (cacheIndex == 0) + dmean[blockIdx.x] = cache[0]; +} + + +__global__ void betaVGPU(float *dWM,const int N,int *vox,float *dvar,int *dIdx) +{ + __shared__ float cache[NN]; + int cacheIndex = threadIdx.x; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x*gridDim.x; + + float c = 0; + float y,t; + float temp = 0; + + int voxel,IDX; + while (idx < N) { + voxel = vox[idx]; + IDX = dIdx[voxel]; + y = dWM[IDX]*dWM[IDX] - c; + t = temp + y; + c = (t - temp) - y; + temp = t; + + idx += stride; + } + + cache[cacheIndex] = temp; + + __syncthreads(); + + int i = (blockDim.x >> 1); + while (i != 0) { + if (cacheIndex < i) { + cache[cacheIndex] += cache[cacheIndex + i]; + } + __syncthreads(); + i = i >> 1; + } + + if (cacheIndex == 0) { + dvar[blockIdx.x] = cache[0]; + } + +} + +void updateBetaGPU(float *beta,float *dZ,float *dPhi,float *dalpha,float *dWM,float prior_mean_beta,float prior_prec_beta,float *dspatCoef,float *dcovar,unsigned long *seed) +{ + float *dmean,*dvar,*hmean,*hvar; + float mean,var,tmp; + + hmean = (float *)calloc(BPG,sizeof(float)); + hvar = (float *)calloc(BPG,sizeof(float)); + cudaMalloc((void **)&dmean,BPG*sizeof(float)) ; + cudaMalloc((void **)&dvar,BPG*sizeof(float)) ; + + mean = var = 0; + for (int i=0;i<2;i++) { + betaMGPU<<>>(dcovar,deviceCov_Fix,dZ,dspatCoef,deviceFixMean,deviceAlphaMean,deviceWM,INDX[i].hostN,TOTVOXp,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX,TOTVOX,INDX[i].deviceVox,dmean,deviceIdx,deviceIdxSC); + cudaMemcpy(hmean,dmean,BPG*sizeof(float),cudaMemcpyDeviceToHost); + + for (int j=0;j>>(deviceWM,INDX[i].hostN,INDX[i].deviceVox,dvar,deviceIdx); + cudaMemcpy(hvar,dvar,BPG*sizeof(float),cudaMemcpyDeviceToHost); + + for (int j=0;j> 1); + while (i != 0) { + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + + __syncthreads(); + i = i >> 1; + } + + if (cacheIndex == 0) + dmean[blockIdx.x] = cache[0]; +} + +void updateAlphaMeanGPU(float *alphaMean,float Prec,float *dcovar,float *dZ,float 
*dspatCoef,float beta,unsigned long *seed) +{ + double *mean,*V,*tmpmean; + float *hmean,*dmean; + + mean = (double *)calloc(NCOVAR,sizeof(double)); + tmpmean = (double *)calloc(NCOVAR,sizeof(double)); + V = (double *)calloc(NCOVAR*NCOVAR,sizeof(double)); + + for (int i=0;i>>(dcovar,dZ,dspatCoef,deviceWM,beta,INDX[i].hostN,TOTVOXp,NSUBS,NSUBTYPES,NCOVAR,TOTVOX,INDX[i].deviceVox, + dmean,deviceIdx,deviceIdxSC,icovar); + cudaMemcpy(hmean,dmean,BPG*sizeof(float),cudaMemcpyDeviceToHost); + + for (int j=0;j> 1); + while (i != 0) { + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + + __syncthreads(); + i = i >> 1; + } + + if (cacheIndex == 0) + dmean[blockIdx.x] = cache[0]; +} + +void updatefixMeanGPU(float *fixMean,float Prec,float *dcovar,float *dZ,float *dspatCoef,float beta,unsigned long *seed) +{ + double *mean,*V,*tmpmean; + float *hmean,*dmean; + + mean = (double *)calloc(NCOV_FIX,sizeof(double)); + tmpmean = (double *)calloc(NCOV_FIX,sizeof(double)); + V = (double *)calloc(NCOV_FIX*NCOV_FIX,sizeof(double)); + + for (int i=0;i>>(deviceCov_Fix,dcovar,dZ,dspatCoef,deviceWM,beta,INDX[i].hostN,TOTVOXp,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX,TOTVOX,INDX[i].deviceVox, + dmean,deviceIdx,deviceIdxSC,icovar); + cudaMemcpy(hmean,dmean,BPG*sizeof(float),cudaMemcpyDeviceToHost); + + for (int j=0;j> 1); + while (i != 0) { + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + __syncthreads(); + i = i >> 1; + } + + if (cacheIndex == 0) + dbeta[blockIdx.x] = cache[0]; +} + +void updateSpatCoefPrecGPU_Laplace(float *SpatCoefPrec,unsigned long *seed) +{ + float *hbeta; + double *var; + float *dbeta; + double betaSqr = 2.0;//0.2; + double tau; + + var = (double *)calloc(NCOVAR*NCOVAR,sizeof(double)); + hbeta = (float*)calloc(BPG,sizeof(float)); + cudaMalloc((void **)&dbeta,BPG*sizeof(float)); + cudaMemset(dbeta,0,BPG*sizeof(float)); + + for (int ii=0;ii>>(&(deviceSpatCoef[ii*TOTVOXp]),&(deviceSpatCoef[ii*TOTVOXp]),INDX[i].hostN,NROW,NCOL,INDX[i].deviceVox,dbeta,deviceIdxSC); + if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef_Prec");exit(0);} + CUDA_CALL( cudaMemcpy(hbeta,dbeta,BPG*sizeof(float),cudaMemcpyDeviceToHost) ); + for (int j=0;j>>(&(deviceSpatCoef[ii*TOTVOXp]),&(deviceSpatCoef[jj*TOTVOXp]),INDX[i].hostN,NROW,NCOL,INDX[i].deviceVox,dbeta,deviceIdxSC); + if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef_Prec");exit(0);} + CUDA_CALL( cudaMemcpy(hbeta,dbeta,BPG*sizeof(float),cudaMemcpyDeviceToHost) ); + for (int j=0;ja */ + float x,a_star; + int stop; +// double rnorm(double,double,unsigned long *); +// double kiss(unsigned long *); + + + if (a<0.0f) + { + stop=0; + while (stop==0) + { + x = curand_normal(state); + if( x>a ) stop=1; + else stop=0; + } + } + else + { + a_star=0.5f*(a+sqrtf(a*a+4.0f)); + stop=0; + while (stop==0) + { + x=a-__logf(curand_uniform(state))/a_star; + if( __logf(curand_uniform(state))a+3*__expf(-a*a-1/(a*a))/(a+sqrtf(a*a+4.0f))) + { + stop=0; + while (stop==0) + { + x=truncNormLeft(a,state); + if( x < b ) stop=1; + else stop = 0; + } + } + else + { + boo=0.0f; + if (b<0.0f) + boo=b*b; + if (a>0.0f) + boo=a*a; + + + stop=0; + while (stop==0) + { + x=(b-a)*curand_uniform(state)+a; + boun=boo-x*x; + if( 2.0f*__logf(curand_uniform(state))0.0f && num<0.0f ) || ( sign<0.0f && num>0.0f ) ) + return -num; + else return num; +} + +__device__ float sgammaGPU(float a, curandState *state) { + const float q1 = 0.0416666664f; + const float q2 = 
0.0208333723f; + const float q3 = 0.0079849875f; + const float q4 = 0.0015746717f; + const float q5 = -0.0003349403f; + const float q6 = 0.0003340332f; + const float q7 = 0.0006053049f; + const float q8 = -0.0004701849f; + const float q9 = 0.0001710320f; + const float a1 = 0.333333333f; + const float a2 = -0.249999949f; + const float a3 = 0.199999867f; + const float a4 = -0.166677482f; + const float a5 = 0.142873973f; + const float a6 = -0.124385581f; + const float a7 = 0.110368310f; + const float a8 = 0.112750886f; + const float a9 = 0.104089866f; + const float e1 = 1.000000000f; + const float e2 = 0.499999994f; + const float e3 = 0.166666848f; + const float e4 = 0.041664508f; + const float e5 = 0.008345522f; + const float e6 = 0.001353826f; + const float e7 = 0.000247453f; + float aa = 0.0f; + float aaa = 0.0f; + const float sqrt32 = 5.65685424949238f; + float sgamma,s2,s,d,t,x,u,r,q0,b,si,c,v,q,e,w,p; + + if(a == aa) + goto S2; + if(a < 1.0f) + goto S13; + +//S1: // STEP 1: RECALCULATIONS OF S2,S,D IF A HAS CHANGED + aa = a; + s2 = a-0.5f; + s = sqrtf(s2); + d = sqrt32-12.0f*s; + +S2: // STEP 2: T=STANDARD NORMAL DEVIATE, X=(S,1/2)-NORMAL DEVIATE. IMMEDIATE ACCEPTANCE (I) + t = curand_normal(state); + x = s+0.5f*t; + sgamma = x*x; + if(t >= 0.0f) + return sgamma; + +//S3: // STEP 3: U= 0,1 -UNIFORM SAMPLE. SQUEEZE ACCEPTANCE (S) + u = curand_uniform(state); + if(d*u <= t*t*t) + return sgamma; + +//S4: // STEP 4: RECALCULATIONS OF Q0,B,SI,C IF NECESSARY + if (a != aaa) { + aaa = a; + r = 1.0f/ a; + q0 = r*(q1+r*(q2+r*(q3+r*(q4+r*(q5+r*(q6+r*(q7+r*(q8+r*q9)))))))); + if (a <= 3.686f) { + b = 0.463f+s+0.178f*s2; + si = 1.235f; + c = 0.195f/s-0.079f+0.16f*s; + } + else if (a <= 13.022f) { + b = 1.654f+0.0076f*s2; + si = 1.68f/s+0.275f; + c = .062/s+0.024f; + } + else { + b = 1.77f; + si = 0.75f; + c = 0.1515f/s; + } + } + +//S5: // NO QUOTIENT TEST IF X NOT POSITIVE + if(x <= 0.0f) + goto S8; + +//S6: // CALCULATION OF V AND QUOTIENT Q + v = t/(s+s); + if(fabsf(v) > 0.25f) + q = q0-s*t+0.25f*t*t+(s2+s2)*log1pf(v); + else + q = q0+0.5f*t*t*v*(a1+v*(a2+v*(a3+v*(a4+v*(a5+v*(a6+v*(a7+v*(a8+v*a9)))))))); + +//S7: // QUOTIENT ACCEPTANCE (Q) + if(log1pf(-u) <= q) return sgamma; + +S8: // E=STANDARD EXPONENTIAL DEVIATE U= 0,1 -UNIFORM DEVIATE T=(B,SI)-DOUBLE EXPONENTIAL (LAPLACE) SAMPLE + e = rexpGPU(1.0f,state); + u = curand_uniform(state); + u += (u-1.0f); + t = b+fsignfGPU(si*e,u); + +//S9: // REJECTION IF T .LT. 
TAU(1) = -.71874483771719 + if(t <= -0.71874483771719f) + goto S8; + +//S10: // CALCULATION OF V AND QUOTIENT Q + v = t/(s+s); + if(fabsf(v) > 0.25f) + q = q0-s*t+0.25f*t*t+(s2+s2)*log1pf(v); + else + q = q0+0.5f*t*t*v*(a1+v*(a2+v*(a3+v*(a4+v*(a5+v*(a6+v*(a7+v*(a8+v*a9)))))))); + +//S11: // HAT ACCEPTANCE (H) (IF Q NOT POSITIVE GO TO STEP 8) + if (q <= 0.5f) + w = q*(e1+q*(e2+q*(e3+q*(e4+q*(e5+q*(e6+q*e7)))))); + else + w = expf(q)-1.0f; + if(q <= 0.0f || c*fabs(u) > w*expf(e-0.5f*t*t)) + goto S8; +//S12: + x = s+0.5f*t; + sgamma = x*x; + return sgamma; + +S13: // ALTERNATE METHOD FOR PARAMETERS A BELOW 1 (.3678794=EXP(-1.)) + aa = 0.0f; + b = 1.0f+0.3678794f*a; + +S14: + p = b*curand_uniform(state); + if(p >= 1.0f) + goto S15; + sgamma = expf(logf(p)/ a); + if(rexpGPU(1.0f,state) < sgamma) + goto S14; + return sgamma; + +S15: + sgamma = -logf((b-p)/ a); + if(rexpGPU(1.0f,state) < (1.0f-a)*logf(sgamma)) + goto S14; + return sgamma; + +} + + +__global__ void Z_GPU1(float *dZ,unsigned char *data,float *covar,float *covarF,float *fix,float *alpha,float *WM,float *SpatCoef,const float beta,int *vox,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int N,const int TOTVOXp,const int TOTVOX,curandState *state,int *dIdx,int *dIdxSC,const float sqrt2,unsigned int *dR) +{ + extern __shared__ float shared[]; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int localsize = blockDim.x; + int localid = threadIdx.x; + + float mean[2000]; + float tmp=0; + float SC[20]; + int voxel=0,IDX=0,IDXSC=0; + curandState localState; + + if (idx < N) { + voxel = vox[idx]; + IDX = dIdx[voxel]; + IDXSC = dIdxSC[voxel]; + + tmp = beta*WM[IDX]; + + for (int ii=0;ii NSUBS) + localsize = NSUBS - isub; + + if ((isub+localid) < NSUBS) { + for (int j=0;j 4.0f); + } + state[idx] = localState; + } +} + +__global__ void Z_GPU2(float *dZ,unsigned char *data,float *covar,float *covarF,float *fix,float *alpha,float *WM,float *SpatCoef,const float beta,int *vox,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int N,const int TOTVOXp,const int TOTVOX,curandState *state,int *dIdx,int *dIdxSC,const float sqrt2,float *dPhi,unsigned int *dR,const float alp,const float df) +{ + extern __shared__ float shared[]; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int localsize = blockDim.x; + int localid = threadIdx.x; + + float a = alp; + float mean[2000]; + float tmp=0; + float SC[20]; + int voxel=0,IDX=0,IDXSC=0; + curandState localState; + + if (idx < N) { + voxel = vox[idx]; + IDX = dIdx[voxel]; + IDXSC = dIdxSC[voxel]; + + tmp = beta*WM[IDX]; + + for (int ii=0;ii NSUBS) + localsize = NSUBS - isub; + + if ((isub+localid) < NSUBS) { + for (int j=0;j 4.0f); + + b = Z - M; + b = (df + b*b)/2.0f; + P = sgammaGPU(a,&localState)/b; + + dZ[isub*TOTVOX+IDX] = Z; + dPhi[isub*TOTVOX+IDX] = P; + } + state[idx] = localState; + } +} + + +__global__ void Phi_GPU(float *dZ,unsigned char *data,float *covar,float *covarF,float *fix,float *alpha,float *WM,float *SpatCoef,const float beta,int *vox,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int N,const int TOTVOXp,const int TOTVOX,curandState *state,int *dIdx,int *dIdxSC,const float sqrt2,float *dPhi,const float alp,const float df) +{ + extern __shared__ float shared[]; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int localsize = blockDim.x; + int localid = threadIdx.x; + + const float a = alp; + float beta2[2000]; + float tmp=0; + float SC[20]; + int voxel=0,IDX=0,IDXSC=0; + curandState localState; 
+ + if (idx < N) { + voxel = vox[idx]; + IDX = dIdx[voxel]; + IDXSC = dIdxSC[voxel]; + + tmp = beta*WM[IDX]; + + for (int ii=0;ii NSUBS) + localsize = NSUBS - isub; + + if ((isub+localid) < NSUBS) { + for (int j=0;j N) ? INDX[i].hostN:N; + + if (MODEL != 1) { + + for (int i=0;i<2;i++) { + int thr_per_block = TPB2; + int shared_memory = (NCOVAR * thr_per_block) * sizeof(float); + int blck_per_grid = (INDX[i].hostN+thr_per_block-1)/thr_per_block; + + Phi_GPU<<>>(deviceZ,deviceData,deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean, + deviceWM,deviceSpatCoef,beta,INDX[i].deviceVox,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX, + INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC,sqrt2,devicePhi,alpha,t_df); + if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Phi_GPU");exit(0);} + } + + } +} + +void updatePhi(unsigned char *data,float *Z,float *Phi,unsigned char *msk,float beta,float *WM,float *spatCoef,float *covar,unsigned long *seed) +{ + double alpha = ((double)t_df + 1)/2; + unsigned char d; + double beta2,mx= -10,mZ=-10,mtmp,mZ2=-10; + + CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,TOTVOXp*NCOVAR*sizeof(float),cudaMemcpyDeviceToHost) ); + CUDA_CALL( cudaMemcpy(Z,deviceZ,TOTVOX*NSUBS*sizeof(float),cudaMemcpyDeviceToHost) ); + for (int i=0;i<2;i++) { + for (int j=0;jfabs(Z[isub*TOTVOX+IDX])) ? mZ2:fabs(Z[isub*TOTVOX+IDX]); + beta2 = ((double)t_df + beta2*beta2)/2.0; + Phi[isub*TOTVOX+IDX] = (float)rgamma(alpha,beta2,seed); + // Phi[isub*TOTVOX+IDX] = (float)rgamma(0.5*(double)t_df,0.5*(double)t_df,seed); + // printf("%lf\n",Phi[isub*TOTVOX+IDX]); + } + } + } + CUDA_CALL( cudaMemcpy(devicePhi,Phi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyHostToDevice) ); + printf("beta2_max = %lf %lf %lf\t %lf %d\n",mx,mZ,mtmp,mZ2,(int)d);fflush(NULL); +} + +void updateZGPU(unsigned char *data,float beta,unsigned long *seed) +{ + float alpha = (t_df + 1)/2; + float sqrt2 = sqrtf(2); +// float *rchisq,*drchisq; +// float dfdiv2 = t_df/2; + int N=0; + for (int i=0;i<2;i++) + N = (INDX[i].hostN > N) ? 
INDX[i].hostN:N; + + if (MODEL != 1) { +// rchisq = (float *)malloc(N*sizeof(float)); +// cudaMalloc((void **)&drchisq,N*sizeof(float)); + + for (int i=0;i<2;i++) { +// for (int j=0;j>>(deviceZ,deviceData,deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean, + deviceWM,deviceSpatCoef,beta,INDX[i].deviceVox,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX, + INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC,sqrt2,devicePhi,deviceResid,alpha,t_df); + if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Z_GPU2");exit(0);} + } +// free(rchisq); +// cudaFree(drchisq); + } + else { + + for (int i=0;i<2;i++) { + int thr_per_block = TPB2; + int shared_memory = (NCOVAR * thr_per_block) * sizeof(float); + int blck_per_grid = (INDX[i].hostN+thr_per_block-1)/thr_per_block; + + Z_GPU1<<>>(deviceZ,deviceData,deviceCovar,deviceCov_Fix,deviceFixMean, + deviceAlphaMean,deviceWM,deviceSpatCoef,beta,INDX[i].deviceVox,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX, + INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC,sqrt2,deviceResid); + if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Z_GPU1");exit(0);} + + } + } +} + + +__device__ inline float dProbSDNormGPU(const float x) +//Calculate the cumulative probability of standard normal distribution; +{ + const float b1 = 0.319381530f; + const float b2 = -0.356563782f; + const float b3 = 1.781477937f; + const float b4 = -1.821255978f; + const float b5 = 1.330274429f; + const float p = 0.2316419f; + const float c = 0.39894228f; + float t,sgnx,out,y; + + sgnx = copysignf(1.0f,x); + y = sgnx*x; + + t = 1.0f / ( 1.0f + p * y ); + + out = ((x >= 0.0f) - sgnx*c * expf( -y * y / 2.0f ) * t * + ( t *( t * ( t * ( t * b5 + b4 ) + b3 ) + b2 ) + b1 )); + + return out; +} + +__global__ void ProbLogitGPU(float *prb,float *alpha,float *SpatCoef,float beta,float *WM,int *dIdx,int *dIdxSC,int *vox,const int TOTVOX,const int TOTVOXp,const int NSUBTYPES,const int N,const float logit_factor) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x*gridDim.x; + + float x; + float bwhtmat=0; + int voxel,ii,IDX,IDXSC; + + while (idx < N) { + voxel = vox[idx]; + IDX = dIdx[voxel]; + IDXSC = dIdxSC[voxel]; + + bwhtmat = beta*WM[IDX]; + + for (ii=0;ii>>(devicePrb,deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp,NSUBTYPES,INDX[i].hostN,logit_factor); + } + else { + for (int i=0;i<2;i++) + ProbSDNormGPU<<<(INDX[i].hostN+511)/512, 512 >>>(devicePrb,deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp,NSUBTYPES,INDX[i].hostN); + } +} + +__global__ void reduce_for_Mpredict(float *in,float *out,unsigned char *data,const int N) +{ + __shared__ float cache[NN]; + int cacheIndex = threadIdx.x; + int ivox = threadIdx.x + blockIdx.x * blockDim.x; + float c = 0; + float y,t; + float temp = 0; + + while (ivox < N) { + int tmp = (int)data[ivox]; + float inv = in[ivox]; + y = tmp*logf(inv + 1E-35f) + (1 - tmp)*logf(1.0f - inv + 1E-35f) - c; + t = temp + y; + c = (t - temp) - y; + temp = t; + ivox += blockDim.x * gridDim.x; + } + + cache[cacheIndex] = temp; + + __syncthreads(); + + int i = (blockDim.x >> 1); + while (i != 0) { + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + __syncthreads(); + i = i >> 1; + } + + if (cacheIndex == 0) + out[blockIdx.x] = cache[0]; +} + +double compute_predictGPU(float beta,unsigned char *data,unsigned char *msk,float *covar,float 
*predict,double **Qhat,int first) +{ + double *Mpredict,DIC; + float *hf_sum,*df_sum; + + Mpredict = (double *)calloc(NSUBTYPES,sizeof(double)); + + CUDA_CALL( cudaMalloc((void **)&df_sum,BPG*sizeof(float)) ); + hf_sum = (float *)calloc(BPG,sizeof(float)); + + DIC = 0; + for (int isub=0;isub>>(deviceCovar,deviceCov_Fix,devicePredict,deviceFixMean, + deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp, + NSUBTYPES,NCOVAR,NSUBS,NCOV_FIX,isub,INDX[i].hostN,logit_factor); + } + } + else { + for (int i=0;i<2;i++) { + ProbSDNormGPU_prediction<<<(INDX[i].hostN+511)/512, 512 >>>(deviceCovar,deviceCov_Fix,devicePredict,deviceFixMean, + deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp, + NSUBTYPES,NCOVAR,NSUBS,NCOV_FIX,isub,INDX[i].hostN); + } + } + + // Compute Bayes Factor +/* reduce<<>>(devicePredict,df_sum,NSUBTYPES*TOTVOX); + + cudaMemcpy(hf_sum,df_sum,BPG*sizeof(float),cudaMemcpyDeviceToHost); + + for (int j=0;j>>(&devicePredict[ii*TOTVOX],df_sum,&deviceData[isub*TOTVOX],TOTVOX); + CUDA_CALL( cudaMemcpy(hf_sum,df_sum,BPG*sizeof(float),cudaMemcpyDeviceToHost) ); + + for (int j=0;j Qhat[isub][i]) ? (Mpredict[i] - Mpredict[subtype]) : Qhat[isub][i]; + Qhat[isub][i] = log(exp(Qhat[isub][i] - max) + exp((Mpredict[i] - Mpredict[subtype]) - max)) + max; + } + } + else { + double max = -DBL_MAX + 0.5; + for (int i=0;i Qhat[isub][i]) ? (Mpredict[i] - Mpredict[subtype]) : Qhat[isub][i]; + Qhat[isub][i] = log(exp(Qhat[isub][i] - max) + exp((Mpredict[i] - Mpredict[subtype]) - max)) + max; + } + } + } + + free(Mpredict); + CUDA_CALL( cudaFree(df_sum) ); + free(hf_sum); + + return(DIC); +} + + diff --git a/bsglmm/mcmc.cpp b/bsglmm/mcmc.cpp index 553d2f0..3475a44 100644 --- a/bsglmm/mcmc.cpp +++ b/bsglmm/mcmc.cpp @@ -1,1328 +1,1312 @@ -/* - * mcmc.cpp - * - * Created by Timothy Johnson on 4/14/12. - * Copyright 2012 University of Michigan. All rights reserved. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -//#include -#include -#include "binCAR.h" -#include "randgen.h" -#include "cholesky.h" -#include - -typedef double MY_DATATYPE; -extern float logit_factor; -extern float t_df; -extern double *grp_prior; - -int iter; -extern int MODEL; -extern int GPU; -extern int MAXITER; -extern int BURNIN; -extern int NSUBS; -extern int NROW; -extern int NCOL; -extern int TOTVOX; -extern int TOTVOXp; -extern int NDEPTH; -extern int NSUBTYPES; -extern int NCOVAR; -extern int NCOV_FIX; -extern int RESTART; -extern int *hostIdx; -extern int *deviceIdx; -extern int *hostIdxSC; -extern int *deviceIdxSC; -float *deviceSpatCoef; -extern float *deviceCovar; -extern float *deviceCov_Fix; -extern float *XXprime; -unsigned char *deviceData; -unsigned int *deviceResid; -extern char **SS; - -//short *deviceData; -float *deviceWM; -float *deviceAlphaMean; -float *deviceFixMean; -float *deviceZ; -float *devicePhi; -float *devicePrb; -float *devicePredict; -float sqrt2pi = 2.506628274631; -float NCNT; -int MIX = 0; - -int NSIZE; - -#define idx(i,j,k) (i + (NROW+2)*j + (NROW+2)*(NCOL+2)*k) -#define idx2(i,j,k) (i + (NROW)*j + (NROW)*(NCOL)*k) - -#define CUDA_CALL(x) {const cudaError_t a = (x); if (a != cudaSuccess) {printf("\nCUDA Error: %s (err_num=%d) \n",cudaGetErrorString(a),a);cudaDeviceReset();assert(0);}} - -void mcmc(float *covar,float *covar_fix,unsigned char *data,float *WM,unsigned char *msk,unsigned long *seed) -{ - int TOTSAV=0,IDX,PRNtITER,SAVE_ITER,FIRST; - float *Z,*Phi,*alpha,*prb,*predict; - double *Mprb,*MCov,*SCov,*MWM,**Qhat,ED=0; - float beta=0,prior_mean_beta=0,prior_prec_beta=0.000001f; - float *SpatCoefPrec,*SpatCoefMean,*spatCoef; - - float *alphaMean,*alphaPrec;//scarPrec=1; - float *fixMean,*fixPrec;//scarPrec=1; - float prior_alphaPrec_A=3.0f; - float prior_alphaPrec_B=2.0f; -// float prior_carPrec_A=3.0,prior_carPrec_B=2.0; - float Prec = 0.000001f; - SUB *subj; - char *S; - -// float *hostWM; - FILE *out,*out2,*fWM,*fcoef,*fcoefsd,*fBF,*fresid; - - void itoa(int,char *); -// unsigned char *get_neighbors(); - void initializeAlpha(unsigned char *,float *,float *,float *,unsigned long *); - void initializeZ(float *,unsigned char *,unsigned char *,unsigned long *); - void initializePhi(float *,unsigned char *,unsigned char *,float,unsigned long *); - void updateZGPU(unsigned char *,float,unsigned long *); - void updateZ(float *,float *,unsigned char *,unsigned char *,float, float *,float *,float *,unsigned long *); - void updatePhiGPU(float,float *); - void updatePhi(unsigned char *data,float *Z,float *Phi,unsigned char *msk,float beta,float *WM,float *spatCoef,float *covar,unsigned long *seed); - void updateAlpha(float *,float ,float ,float *,unsigned char *,float,float *,float *,float *,int,unsigned long *); - void updateAlphaGPU(float *,float *,float *,float *,unsigned char *,float,float *,float *,float *,unsigned long *); - void updateAlphaPrecGPU(float *,float *,float,float,float,unsigned long *); - void updateAlphaPrec(float *,float *,float,float,float,unsigned char *,unsigned long *); - void updateAlphaMeanGPU(float *,float,float *,float *,float *,float,unsigned long *); - void updatefixMeanGPU(float *,float,float *,float *,float *,float,unsigned long *); - void updateAlphaMean(float *,float *,float *,float,float *,float *,float,unsigned char *,unsigned long *); - void compute_prbGPU(float); - void compute_prb(float *,float *,float *,float,float *,unsigned char *); - void updateBeta(float *,float *,float 
*,float *,unsigned char *,float,float,float *,float *,unsigned long *); - void updateBetaGPU(float *,float *,float *,float *,float *,float,float,float *,float *,unsigned long *); - void updateSpatCoefGPU(float *,float *,float *,float *,float *,float *,float *,unsigned char *,float,float *,unsigned long *); - void updateSpatCoef(float *,float *,float *,float *,float *,float *,unsigned char *,float,float *,unsigned long *); - void updateSpatCoefMean(float *,float *,float,float,unsigned char *,float **,float *,float *,float *,float,int,unsigned long *); - void updateSpatCoefPrec(float *,float *,unsigned char *,unsigned long *); - void updateSpatCoefPrecGPU(float *,unsigned long *); - void updateSpatCoefPrecGPU_Laplace(float *,unsigned long *); - void save_iter(float,float *,float *,float *,float *,float *,float *,float *,float *,float *); - void restart_iter(float *,float *,float *,float *,float *,float *,float *,float *,float *,float*); - int write_nifti_file(int NROW,int NCOL,int NDEPTH,int NTIME,char *hdr_file,char *data_file,MY_DATATYPE *data); - double compute_predictGPU(float,unsigned char *,unsigned char *msk,float *,float *,double **Qhat,int); - double compute_prb_DIC(double *,double *,float *,unsigned char *,unsigned char *,int); - -// void tmp(float *,float *); - -// fresid = fopen("resid.dat","w"); - - NSIZE = (NROW+2)*(NCOL+2)*(NDEPTH+2); - int RSIZE = NROW*NCOL*NDEPTH; -// printf("%d %d \n",TOTVOX,NSIZE); - - if (GPU) { - CUDA_CALL( cudaMalloc((void **)&deviceData,NSUBS*TOTVOX*sizeof(unsigned char)) ); - // unsigned char *data2; - // data2 = (unsigned char *)calloc(NSUBS*TOTVOX,sizeof(unsigned char)); - // for (int i=0;i 0) - fixMean = (float *)calloc(NCOV_FIX,sizeof(float)); - - alphaPrec = (float *)calloc(NSUBTYPES,sizeof(float)); - for (int i = 0;i 0) { - CUDA_CALL( cudaMalloc((void **)&deviceFixMean,NCOV_FIX*sizeof(float)) ); - CUDA_CALL( cudaMemset(deviceFixMean,0,NCOV_FIX*sizeof(float)) ); - } - } -// else -// alpha = (float *)calloc(TOTVOX*NSUBTYPES,sizeof(float)); - -/* if (!RESTART) { - initializeAlpha(msk,alphaMean,alphaPrec,alpha,seed); - if (GPU) - cudaMemcpy(deviceAlpha,alpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyHostToDevice); - }*/ - -// Malpha = (double *)calloc(NSUBTYPES*RSIZE,sizeof(double)); - MWM = (double *)calloc(RSIZE,sizeof(double)); - Mprb = (double *)calloc(NSUBTYPES*RSIZE,sizeof(double)); - MCov = (double *)calloc(NCOVAR*RSIZE,sizeof(double)); - SCov = (double *)calloc(NCOVAR*RSIZE,sizeof(double)); - - Qhat = (double **)calloc(NSUBS,sizeof(double *)); - for (int i=0;i 0) - updatefixMeanGPU(fixMean,Prec,deviceCovar,deviceZ,deviceSpatCoef,beta,seed); - // updateAlphaMeanGPU(alphaMean,Prec,deviceCovar,deviceZ,deviceSpatCoef,beta,seed); - } - else { - if (!(iter%PRNtITER)) - CUDA_CALL( cudaEventRecord(start, 0) ); - - // for (int tj=0;tj<10;tj++) - updateZ(Z,alpha,data,msk,beta,WM,spatCoef,covar,seed); - if (!(iter%PRNtITER)) { - CUDA_CALL( cudaEventRecord(stop, 0) ); - CUDA_CALL( cudaEventSynchronize(stop) ); - CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); -// printf ("Time for the updateZCPU kernel: %f ms\n", time); - } - // CUDA_CALL( cudaEventRecord(start, 0) ); - // for (int tj=0;tj<10;tj++) - updateBeta(&beta,Z,alpha,WM,msk,prior_mean_beta,prior_prec_beta,spatCoef,covar,seed); - // CUDA_CALL( cudaEventRecord(stop, 0) ); - // CUDA_CALL( cudaEventSynchronize(stop) ); - // CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); - // printf ("Time for the updateBetaCPU kernel: %f ms\n", time); - - if (!(iter%PRNtITER)) - CUDA_CALL( 
cudaEventRecord(start, 0) ); - // for (int tj=0;tj<10;tj++) - updateSpatCoef(covar,spatCoef,SpatCoefPrec,alpha,alphaMean,Z,msk,beta,WM,seed); - if (!(iter%PRNtITER)) { - CUDA_CALL( cudaEventRecord(stop, 0) ); - CUDA_CALL( cudaEventSynchronize(stop) ); - CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); -// printf ("Time for the updateSpatCoefCPU kernel: %f ms\n", time); - } - if (!(iter%PRNtITER)) - CUDA_CALL( cudaEventRecord(start, 0) ); - // for (int tj=0;tj<10;tj++) - updateSpatCoefPrec(SpatCoefPrec,spatCoef,msk,seed); - if (!(iter%PRNtITER)) { - CUDA_CALL( cudaEventRecord(stop, 0) ); - CUDA_CALL( cudaEventSynchronize(stop) ); - CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); -// printf ("Time for the updateSpatCoefPrecCPU kernel: %f ms\n\n", time); - } - - // CUDA_CALL( cudaEventRecord(start, 0) ); - // for (int tj=0;tj<10;tj++) - // compute_prb(prb,alpha,spatCoef,beta,WM,msk); - // CUDA_CALL( cudaEventRecord(stop, 0) ); - // CUDA_CALL( cudaEventSynchronize(stop) ); - /// CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); - // printf ("Time for the compute_prbCPU kernel: %f ms\n\n", time); - } - -/* if (!(iter%PRNtITER) && iter > 0) { - CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); - int i = 56;//57; - int j = 77;//46; - int k = 19;//72; - IDX = idx(i,j,k); - - for (int ii=NSUBTYPES;ii (double)(spatCoef[4*TOTVOXp+hostIdxSC[IDX]]/logit_factor)) ? - max:(double)(spatCoef[4*TOTVOXp+hostIdxSC[IDX]]/logit_factor); - } - } - } - printf("max = %lf\n",max); */ - - double *V = (double *)calloc(NCOVAR*NCOVAR,sizeof(double)); - for (int ii=0;ii BURNIN) && (!(iter%SAVE_ITER))) { -// if ((iter > 0)) { - // compute probability of lesion - - if (GPU) { - if (TOTSAV == 0) - FIRST = 1; - else - FIRST = 0; - if (!(iter%PRNtITER)) - cudaEventRecord(start, 0); - ED += compute_predictGPU(beta,data,msk,covar,predict,Qhat,FIRST); - if (!(iter%PRNtITER)) { - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&time, start, stop); -// printf ("Time to compute_predictGPU: %f ms\n", time); - } - - if (!(iter%PRNtITER)) - cudaEventRecord(start, 0); - compute_prbGPU(beta); - CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); - CUDA_CALL( cudaMemcpy(prb,devicePrb,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); - if (!(iter%PRNtITER)) { - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&time, start, stop); -// printf ("Time to compute_prbGPU: %f ms\n\n", time); - } - } - else { - if (!(iter%PRNtITER)) - cudaEventRecord(start, 0); - compute_prb(prb,alpha,spatCoef,beta,WM,msk); - if (!(iter%PRNtITER)) { - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&time, start, stop); -// printf ("Time to compute_prbGPU: %f ms\n\n", time); - } - } - - for (int k=1;k 1) { - if (GPU) { - CUDA_CALL( cudaMemcpy(Z,deviceZ,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); - CUDA_CALL( cudaMemcpy(Phi,devicePhi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); -// cudaMemcpy(alpha,deviceAlpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost); - CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); - } - save_iter(beta,fixMean,alphaMean,alphaPrec,SpatCoefMean,SpatCoefPrec,spatCoef,alpha,Z,Phi); - }*/ - } -// CUDA_CALL( cudaEventRecord(stop, 0) ); -// CUDA_CALL( cudaEventSynchronize(stop) ); -// CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); -// printf 
("Time for the updateZGPU kernel: %f sec.\n", time/1000.0); - -// fclose(fBF); - - unsigned int *Resid; - double *ResidMap; - Resid = (unsigned int *)calloc(NSUBS*TOTVOX,sizeof(unsigned int)); - ResidMap = (double *)calloc(RSIZE,sizeof(double)); - if (GPU) { - CUDA_CALL( cudaMemcpy(Z,deviceZ,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); - CUDA_CALL( cudaMemcpy(Phi,devicePhi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); -// cudaMemcpy(alpha,deviceAlpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost); - CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); - - CUDA_CALL( cudaMemcpy(Resid,deviceResid,NSUBS*TOTVOX*sizeof(unsigned int),cudaMemcpyDeviceToHost) ); - } -// save_iter(beta,fixMean,alphaMean,alphaPrec,SpatCoefMean,SpatCoefPrec,spatCoef,alpha,Z,Phi); - - for (int isub=0;isub 0.05) { -// fprintf(fresid,"%d %d %d %d %lf\n",isub,i,j,k,tmp); - ResidMap[IDX2]++; - } - } - } - } - } - } - free(Resid); -// fclose(out); -// fclose(out2); -// fclose(fresid); - - char *RR = (char *)calloc(500,sizeof(char)); - S = (char *)calloc(100,sizeof(char)); -// S = strcpy(S,"ResidMap.nii"); -// int rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,ResidMap); - free(ResidMap); - -// RR = strcpy(RR,"gzip -f "); -// RR = strcat(RR,(const char *)S); -// rtn = system(RR); - -// printf("TOTSAV = %d\n",TOTSAV); - - out = fopen("Qhat.dat","w"); - for (int isub=0;isub0) - standCoef[IDX] = MCov[ii*RSIZE+IDX]/sqrt(tmp); - } - } - } - - S = strcpy(S,"spatCoef_"); - S = strcat(S,SS[ii]); - S = strcat(S,".nii"); - int rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,&MCov[ii*RSIZE]); - - RR = strcpy(RR,"gzip -f "); - RR = strcat(RR,(const char *)S); - rtn = system(RR); - - S = strcpy(S,"spatCoef_"); - S = strcat(S,SS[ii]); - S = strcat(S,".Var.nii"); - rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,&SCov[ii*RSIZE]); - - RR = strcpy(RR,"gzip -f "); - RR = strcat(RR,(const char *)S); - rtn = system(RR); - -/* for (int k=0;k +#include +#include +#include +#include +#include +#include +//#include +#include +#include "binCAR.h" +#include "randgen.h" +#include "cholesky.h" +#include + +typedef double MY_DATATYPE; +extern float logit_factor; +extern float t_df; +extern double *grp_prior; + +int iter; +extern int MODEL; +extern int GPU; +extern int MAXITER; +extern int BURNIN; +extern int NSUBS; +extern int NROW; +extern int NCOL; +extern int TOTVOX; +extern int TOTVOXp; +extern int NDEPTH; +extern int NSUBTYPES; +extern int NCOVAR; +extern int NCOV_FIX; +extern int RESTART; +extern int *hostIdx; +extern int *deviceIdx; +extern int *hostIdxSC; +extern int *deviceIdxSC; +float *deviceSpatCoef; +extern float *deviceCovar; +extern float *deviceCov_Fix; +extern float *XXprime; +unsigned char *deviceData; +unsigned int *deviceResid; +extern char **SS; + +//short *deviceData; +float *deviceWM; +float *deviceAlphaMean; +float *deviceFixMean; +float *deviceZ; +float *devicePhi; +float *devicePrb; +float *devicePredict; +float sqrt2pi = 2.506628274631; +float NCNT; +int MIX = 0; + +int NSIZE; + +#define idx(i,j,k) (i + (NROW+2)*j + (NROW+2)*(NCOL+2)*k) +#define idx2(i,j,k) (i + (NROW)*j + (NROW)*(NCOL)*k) + +#define CUDA_CALL(x) {const cudaError_t a = (x); if (a != cudaSuccess) {printf("\nCUDA Error: %s (err_num=%d) \n",cudaGetErrorString(a),a);cudaDeviceReset();assert(0);}} + +void mcmc(float *covar,float *covar_fix,unsigned char *data,float *WM,unsigned char *msk,unsigned long *seed) +{ + int TOTSAV=0,IDX,PRNtITER,SAVE_ITER,FIRST; + float *Z,*Phi,*alpha,*prb,*predict; + 
double *Mprb,*MCov,*SCov,*MWM,**Qhat,ED=0; + float beta=0,prior_mean_beta=0,prior_prec_beta=0.000001f; + float *SpatCoefPrec,*SpatCoefMean,*spatCoef; + + float *alphaMean,*alphaPrec;//scarPrec=1; + float *fixMean,*fixPrec;//scarPrec=1; + float prior_alphaPrec_A=3.0f; + float prior_alphaPrec_B=2.0f; +// float prior_carPrec_A=3.0,prior_carPrec_B=2.0; + float Prec = 0.000001f; + SUB *subj; + char *S; + +// float *hostWM; + FILE *out,*out2,*fWM,*fcoef,*fcoefsd,*fBF,*fresid; + + void itoa(int,char *); +// unsigned char *get_neighbors(); + void initializeAlpha(unsigned char *,float *,float *,float *,unsigned long *); + void initializeZ(float *,unsigned char *,unsigned char *,unsigned long *); + void initializePhi(float *,unsigned char *,unsigned char *,float,unsigned long *); + void updateZGPU(unsigned char *,float,unsigned long *); + void updateZ(float *,float *,unsigned char *,unsigned char *,float, float *,float *,float *,unsigned long *); + void updatePhiGPU(float,float *); + void updatePhi(unsigned char *data,float *Z,float *Phi,unsigned char *msk,float beta,float *WM,float *spatCoef,float *covar,unsigned long *seed); + void updateAlpha(float *,float ,float ,float *,unsigned char *,float,float *,float *,float *,int,unsigned long *); + void updateAlphaGPU(float *,float *,float *,float *,unsigned char *,float,float *,float *,float *,unsigned long *); + void updateAlphaPrecGPU(float *,float *,float,float,float,unsigned long *); + void updateAlphaPrec(float *,float *,float,float,float,unsigned char *,unsigned long *); + void updateAlphaMeanGPU(float *,float,float *,float *,float *,float,unsigned long *); + void updatefixMeanGPU(float *,float,float *,float *,float *,float,unsigned long *); + void updateAlphaMean(float *,float *,float *,float,float *,float *,float,unsigned char *,unsigned long *); + void compute_prbGPU(float); + void compute_prb(float *,float *,float *,float,float *,unsigned char *); + void updateBeta(float *,float *,float *,float *,unsigned char *,float,float,float *,float *,unsigned long *); + void updateBetaGPU(float *,float *,float *,float *,float *,float,float,float *,float *,unsigned long *); + void updateSpatCoefGPU(float *,float *,float *,float *,float *,float *,float *,unsigned char *,float,float *,unsigned long *); + void updateSpatCoef(float *,float *,float *,float *,float *,float *,unsigned char *,float,float *,unsigned long *); + void updateSpatCoefMean(float *,float *,float,float,unsigned char *,float **,float *,float *,float *,float,int,unsigned long *); + void updateSpatCoefPrec(float *,float *,unsigned char *,unsigned long *); + void updateSpatCoefPrecGPU(float *,unsigned long *); + void updateSpatCoefPrecGPU_Laplace(float *,unsigned long *); + void save_iter(float,float *,float *,float *,float *,float *,float *,float *,float *,float *); + void restart_iter(float *,float *,float *,float *,float *,float *,float *,float *,float *,float*); + int write_nifti_file(int NROW,int NCOL,int NDEPTH,int NTIME,char *hdr_file,char *data_file,MY_DATATYPE *data); + double compute_predictGPU(float,unsigned char *,unsigned char *msk,float *,float *,double **Qhat,int); + double compute_prb_DIC(double *,double *,float *,unsigned char *,unsigned char *,int); + +// void tmp(float *,float *); + fresid = fopen("resid.dat","w"); + + NSIZE = (NROW+2)*(NCOL+2)*(NDEPTH+2); + int RSIZE = NROW*NCOL*NDEPTH; +// printf("%d %d \n",TOTVOX,NSIZE); + + if (GPU) { + CUDA_CALL( cudaMalloc((void **)&deviceData,NSUBS*TOTVOX*sizeof(unsigned char)) ); + // unsigned char *data2; + // data2 = (unsigned 
char *)calloc(NSUBS*TOTVOX,sizeof(unsigned char)); + // for (int i=0;i 0) + fixMean = (float *)calloc(NCOV_FIX,sizeof(float)); + + alphaPrec = (float *)calloc(NSUBTYPES,sizeof(float)); + for (int i = 0;i 0) { + CUDA_CALL( cudaMalloc((void **)&deviceFixMean,NCOV_FIX*sizeof(float)) ); + CUDA_CALL( cudaMemset(deviceFixMean,0,NCOV_FIX*sizeof(float)) ); + } + } +// else +// alpha = (float *)calloc(TOTVOX*NSUBTYPES,sizeof(float)); + +/* if (!RESTART) { + initializeAlpha(msk,alphaMean,alphaPrec,alpha,seed); + if (GPU) + cudaMemcpy(deviceAlpha,alpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyHostToDevice); + }*/ + +// Malpha = (double *)calloc(NSUBTYPES*RSIZE,sizeof(double)); + MWM = (double *)calloc(RSIZE,sizeof(double)); + Mprb = (double *)calloc(NSUBTYPES*RSIZE,sizeof(double)); + MCov = (double *)calloc(NCOVAR*RSIZE,sizeof(double)); + SCov = (double *)calloc(NCOVAR*RSIZE,sizeof(double)); + + Qhat = (double **)calloc(NSUBS,sizeof(double *)); + for (int i=0;i 0) + updatefixMeanGPU(fixMean,Prec,deviceCovar,deviceZ,deviceSpatCoef,beta,seed); + // updateAlphaMeanGPU(alphaMean,Prec,deviceCovar,deviceZ,deviceSpatCoef,beta,seed); + } + else { + if (!(iter%PRNtITER)) + CUDA_CALL( cudaEventRecord(start, 0) ); + + // for (int tj=0;tj<10;tj++) + updateZ(Z,alpha,data,msk,beta,WM,spatCoef,covar,seed); + if (!(iter%PRNtITER)) { + CUDA_CALL( cudaEventRecord(stop, 0) ); + CUDA_CALL( cudaEventSynchronize(stop) ); + CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + printf ("Time for the updateZCPU kernel: %f ms\n", time); + } + // CUDA_CALL( cudaEventRecord(start, 0) ); + // for (int tj=0;tj<10;tj++) + updateBeta(&beta,Z,alpha,WM,msk,prior_mean_beta,prior_prec_beta,spatCoef,covar,seed); + // CUDA_CALL( cudaEventRecord(stop, 0) ); + // CUDA_CALL( cudaEventSynchronize(stop) ); + // CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + // printf ("Time for the updateBetaCPU kernel: %f ms\n", time); + + if (!(iter%PRNtITER)) + CUDA_CALL( cudaEventRecord(start, 0) ); + // for (int tj=0;tj<10;tj++) + updateSpatCoef(covar,spatCoef,SpatCoefPrec,alpha,alphaMean,Z,msk,beta,WM,seed); + if (!(iter%PRNtITER)) { + CUDA_CALL( cudaEventRecord(stop, 0) ); + CUDA_CALL( cudaEventSynchronize(stop) ); + CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + printf ("Time for the updateSpatCoefCPU kernel: %f ms\n", time); + } + if (!(iter%PRNtITER)) + CUDA_CALL( cudaEventRecord(start, 0) ); + // for (int tj=0;tj<10;tj++) + updateSpatCoefPrec(SpatCoefPrec,spatCoef,msk,seed); + if (!(iter%PRNtITER)) { + CUDA_CALL( cudaEventRecord(stop, 0) ); + CUDA_CALL( cudaEventSynchronize(stop) ); + CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + printf ("Time for the updateSpatCoefPrecCPU kernel: %f ms\n\n", time); + } + + // CUDA_CALL( cudaEventRecord(start, 0) ); + // for (int tj=0;tj<10;tj++) + // compute_prb(prb,alpha,spatCoef,beta,WM,msk); + // CUDA_CALL( cudaEventRecord(stop, 0) ); + // CUDA_CALL( cudaEventSynchronize(stop) ); + /// CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + // printf ("Time for the compute_prbCPU kernel: %f ms\n\n", time); + } + +/* if (!(iter%PRNtITER) && iter > 0) { + CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); + int i = 56;//57; + int j = 77;//46; + int k = 19;//72; + IDX = idx(i,j,k); + + for (int ii=NSUBTYPES;ii (double)(spatCoef[4*TOTVOXp+hostIdxSC[IDX]]/logit_factor)) ? 
+ max:(double)(spatCoef[4*TOTVOXp+hostIdxSC[IDX]]/logit_factor); + } + } + } + printf("max = %lf\n",max); */ + + double *V = (double *)calloc(NCOVAR*NCOVAR,sizeof(double)); + for (int ii=0;ii BURNIN) && (!(iter%SAVE_ITER))) { +// if ((iter > 0)) { + // compute probability of lesion + + if (GPU) { + if (TOTSAV == 0) + FIRST = 1; + else + FIRST = 0; + if (!(iter%PRNtITER)) + cudaEventRecord(start, 0); + ED += compute_predictGPU(beta,data,msk,covar,predict,Qhat,FIRST); + if (!(iter%PRNtITER)) { + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); + printf ("Time to compute_predictGPU: %f ms\n", time); + } + + if (!(iter%PRNtITER)) + cudaEventRecord(start, 0); + compute_prbGPU(beta); + CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); + CUDA_CALL( cudaMemcpy(prb,devicePrb,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); + if (!(iter%PRNtITER)) { + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); + printf ("Time to compute_prbGPU: %f ms\n\n", time); + } + } + else { + if (!(iter%PRNtITER)) + cudaEventRecord(start, 0); + compute_prb(prb,alpha,spatCoef,beta,WM,msk); + if (!(iter%PRNtITER)) { + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); + printf ("Time to compute_prbGPU: %f ms\n\n", time); + } + } + + for (int k=1;k 1) { + if (GPU) { + CUDA_CALL( cudaMemcpy(Z,deviceZ,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); + CUDA_CALL( cudaMemcpy(Phi,devicePhi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); +// cudaMemcpy(alpha,deviceAlpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost); + CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); + } + save_iter(beta,fixMean,alphaMean,alphaPrec,SpatCoefMean,SpatCoefPrec,spatCoef,alpha,Z,Phi); + }*/ + } +// CUDA_CALL( cudaEventRecord(stop, 0) ); +// CUDA_CALL( cudaEventSynchronize(stop) ); +// CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); +// printf ("Time for the updateZGPU kernel: %f sec.\n", time/1000.0); + + fclose(fBF); + +// unsigned int *Resid; +// double *ResidMap; +// Resid = (unsigned int *)calloc(NSUBS*TOTVOX,sizeof(unsigned int)); +// ResidMap = (double *)calloc(RSIZE,sizeof(double)); + if (GPU) { + CUDA_CALL( cudaMemcpy(Z,deviceZ,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); +// CUDA_CALL( cudaMemcpy(Phi,devicePhi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); +// cudaMemcpy(alpha,deviceAlpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost); + CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); + +// CUDA_CALL( cudaMemcpy(Resid,deviceResid,NSUBS*TOTVOX*sizeof(unsigned int),cudaMemcpyDeviceToHost) ); + } +// save_iter(beta,fixMean,alphaMean,alphaPrec,SpatCoefMean,SpatCoefPrec,spatCoef,alpha,Z,Phi); +/* for (int isub=0;isub 0.05) { + fprintf(fresid,"%d %d %d %d %lf\n",isub,i,j,k,tmp); + ResidMap[IDX2]++; + } + } + } + } + } + } + free(Resid); + fclose(out); + fclose(out2); + fclose(fresid);*/ + + char *RR = (char *)calloc(500,sizeof(char)); + S = (char *)calloc(100,sizeof(char)); +// S = strcpy(S,"ResidMap.nii"); +// int rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,ResidMap); +// free(ResidMap); + int rtn; +// RR = strcpy(RR,"gzip -f "); +// RR = strcat(RR,(const char *)S); +// rtn = system(RR); + + printf("TOTSAV = %d\n",TOTSAV); + + out = fopen("Qhat.dat","w"); + for (int isub=0;isub0) + 
standCoef[IDX] = MCov[ii*RSIZE+IDX]/sqrt(tmp); + } + } + } + + S = strcpy(S,"spatCoef_"); + S = strcat(S,SS[ii]); + S = strcat(S,".nii"); + int rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,&MCov[ii*RSIZE]); + + RR = strcpy(RR,"gzip -f "); + RR = strcat(RR,(const char *)S); + rtn = system(RR); + + S = strcpy(S,"spatCoef_"); + S = strcat(S,SS[ii]); + S = strcat(S,".Var.nii"); + rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,&SCov[ii*RSIZE]); + + RR = strcpy(RR,"gzip -f "); + RR = strcat(RR,(const char *)S); + rtn = system(RR); + +/* for (int k=0;k Date: Mon, 19 Jan 2015 05:58:18 +0000 Subject: [PATCH 02/13] Add check for missing seed.dat file --- bsglmm/main.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bsglmm/main.cu b/bsglmm/main.cu index 1d65db8..da62ab7 100644 --- a/bsglmm/main.cu +++ b/bsglmm/main.cu @@ -113,6 +113,10 @@ int main (int argc, char * const argv[]) { seed = (unsigned long *)calloc(3,sizeof(unsigned long)); fseed = fopen("seed.dat","r"); + if (fseed == NULL){ + printf("seed.dat (with 3 integers) does not exist\n"); + exit(1); + } logit_factor = 1.0f; t_df = 1000.0f; From 63c0a750200600899792ad1f0725b39cf45db0e8 Mon Sep 17 00:00:00 2001 From: nicholst Date: Mon, 19 Jan 2015 14:35:31 +0000 Subject: [PATCH 03/13] Added various edits to improve the user interface --- Makefile | 186 ++++++++ main.cu | 476 ++++++++++++++++++++ mcmc.cpp | 1312 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1974 insertions(+) create mode 100644 Makefile create mode 100644 main.cu create mode 100644 mcmc.cpp diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..126daf1 --- /dev/null +++ b/Makefile @@ -0,0 +1,186 @@ +################################################################################ +# +# Copyright 1993-2013 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. 
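# (Illustrative note, not part of the NVIDIA header: the assignments below use
#  ?=, so CUDA_PATH and NVCC can be overridden from the command line, e.g.
#  `make CUDA_PATH=/usr/local/cuda-6.5` to build against a different toolkit,
#  and `make dbg=1` switches on the -g -G debug flags defined further down.)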
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# include /home/tdjtdj/NVIDIA_CUDA-5.5_Samples/common/findcudalib.mk + + +# OS Name (Linux or Darwin) +OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) +OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:]) + +# Flags to detect 32-bit or 64-bit OS platform +OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/") +OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/") + +# These flags will override any settings +ifeq ($(i386),1) + OS_SIZE = 32 + OS_ARCH = i686 +endif + +ifeq ($(x86_64),1) + OS_SIZE = 64 + OS_ARCH = x86_64 +endif + + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda-5.0 +CUDA_BIN_PATH ?= $(CUDA_PATH)/bin + + +# Common binaries +NVCC ?= $(CUDA_BIN_PATH)/nvcc +GCC ?= g++ + + +# internal flags +NVCCFLAGS := -m${OS_SIZE} +CCFLAGS := -O3 +NVCCLDFLAGS := +LDFLAGS := + +# Extra user flags +EXTRA_NVCCFLAGS ?= +EXTRA_NVCCLDFLAGS ?= +EXTRA_LDFLAGS ?= +EXTRA_CCFLAGS ?= + +# OS-specific build flags +ifneq ($(DARWIN),) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(OS_ARCH) $(STDLIB) +else + ifeq ($(OS_ARCH),armv7l) + ifeq ($(abi),gnueabi) + CCFLAGS += -mfloat-abi=softfp + else + # default to gnueabihf + override abi := gnueabihf + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard + endif + endif +endif + +ifeq ($(ARMv7),1) +NVCCFLAGS += -target-cpu-arch ARM +ifneq ($(TARGET_FS),) +CCFLAGS += --sysroot=$(TARGET_FS) +LDFLAGS += --sysroot=$(TARGET_FS) +LDFLAGS += -rpath-link=$(TARGET_FS)/lib +LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib +LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-$(abi) +endif +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + TARGET := debug +else + TARGET := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(NVCCLDFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(EXTRA_NVCCLDFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../common/inc +LIBRARIES := + +################################################################################ + +# CUDA code generation flags +ifneq ($(OS_ARCH),armv7l) +GENCODE_SM10 := -gencode arch=compute_10,code=sm_10 +endif +GENCODE_SM20 := -gencode arch=compute_20,code=sm_20 +GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=\"sm_35,compute_35\" +GENCODE_FLAGS := $(GENCODE_SM10) $(GENCODE_SM20) $(GENCODE_SM30) + +################################################################################ + +# Target rules +all: build + +build: BinCar + +main.o: main.cu + $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +randgen.o: randgen.cpp + $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +read_data.o: read_data.cpp + $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +mcmc.o: mcmc.cpp + $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +covar.o: covar.cpp + $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +nifti1_read_write.o: nifti1_read_write.cpp + $(NVCC) 
$(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +cholesky.o: cholesky.cpp + $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +covarGPU.o: covarGPU.cu + $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +BinCar: main.o mcmc.o randgen.o read_data.o covar.o nifti1_read_write.o cholesky.o covarGPU.o + $(NVCC) $(ALL_LDFLAGS) -o $@ $+ $(LIBRARIES) + cp $@ $(HOME)/bin/ + +run: build + ./BinCar + +clean: + rm -f BinCarGPU main.o mcmc.o randgen.o read_data.o covar.o nifti1_read.write.o cholesky.o covarGPU.o + #rm -rf ../../bin/$(OS_ARCH)/$(OSLOWER)/$(TARGET)$(if $(abi),/$(abi))/matrixMul + +clobber: clean diff --git a/main.cu b/main.cu new file mode 100644 index 0000000..59169a1 --- /dev/null +++ b/main.cu @@ -0,0 +1,476 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "randgen.h" +#include "binCAR.h" + +int PREC=64; +int NSUBS; +int GPU; +int NROW; +int NCOL; +int NDEPTH; +int TOTVOX; +int TOTVOXp; // total voxels plus boundary + +int *hostIdx; +int *deviceIdx; +int *hostIdxSC; +int *deviceIdxSC; + +float *covar; +int NSUBTYPES; +int NCOVAR; +int NCOV_FIX = 0; +int MAXITER = 100000; +int BURNIN = 50000; +int RESTART; +float *deviceCovar; +float *hostCovar; +float *XXprime; +float *deviceCov_Fix; +float *hostCov_Fix; +float *XXprime_Fix; +float logit_factor; +float t_df; +int MODEL = 1; // logistic = 0; Probit = 1, t = 2; +//float *deviceChiSqHist; +//int ChiSqHist_N; + +curandState *devStates; +INDEX *INDX; + +double M = exp(-PREC*log(2.0)); + +#define IDX(i,j,k) (i + (NROW+2)*j + (NROW+2)*(NCOL+2)*k) + +#define CUDA_CALL(x) {const cudaError_t a = (x); if (a != cudaSuccess) {printf("\nCUDA Error: %s (err_num=%d) \n",cudaGetErrorString(a),a);cudaDeviceReset();assert(0);}} + +int main (int argc, char * const argv[]) { + int rtn,cnt,cnt0,cnt1,cntp; + unsigned char *data; + unsigned char *msk,*mskp; + unsigned long *seed; + float *WM; + FILE *fseed; + +// unsigned char *read_nifti1_mask(char *,char *); + unsigned char *read_nifti1_mask(char *); + unsigned char *read_nifti1_image(unsigned char *,char *); + float *read_nifti1_WM(unsigned char *); + float *read_covariates(char *); + float *read_covariates_fix(); + void mcmc(float *,float *,unsigned char *,float *,unsigned char *,unsigned long *); + void write_empir_prb(unsigned char *,float *,unsigned char *,int *); + unsigned char *get_WM_mask(float *,unsigned char *); + + if (argc !=5 && argc !=7) { + printf("%s: Usage\n",argv[0]); + printf("%s NTypes NCov GPU Design [MaxIter BurnIn] \n",argv[0]); + printf(" NTypes - Number of groups\n",argv[0]); + printf(" NCov - Number of covariates (count must include groups)\n",argv[0]); + printf(" GPU - 1 use GPU; 0 use CPU\n",argv[0]); + printf(" Design - Text file, tab or space separated data file\n",argv[0]); + printf(" MaxIter - Number of iterations (defaults to 1,000,000)\n",argv[0]); + printf(" BurnIn - Number of burn-in iterations (defaults to 500,000)\n",argv[0]); + printf("For documentation see: http://warwick.ac.uk/tenichols/BSGLMM\n",argv[0]); + exit(1); + } + + NSUBTYPES = atoi(argv[1]); + NCOVAR = atoi(argv[2]); + GPU = atoi(argv[3]); + if (argc >5) { + MAXITER = atoi(argv[5]); + BURNIN = atoi(argv[6]); + } + + + RESTART = 0; + int deviceCount = 0; + cudaError_t error_id = cudaGetDeviceCount(&deviceCount); + + if (error_id != cudaSuccess){ + printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); + printf("Result = FAIL\n"); + exit(EXIT_FAILURE); + } + if (deviceCount == 0) { 
+ printf("There are no available device(s) that support CUDA\n"); + } + else { + printf("Detected %d CUDA Capable device(s)\n", deviceCount); + } + int best = 0; + int major = 100; + for (int dev = 0; dev < deviceCount; ++dev) { + cudaSetDevice(dev); + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, dev); + printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); + printf("Compute capability %d.%d \n",deviceProp.major,deviceProp.minor); + if (deviceProp.major >= major) { + major = deviceProp.major; + best = dev; + } + } + cudaSetDevice(best); + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, best); + printf("Device used is %s \n\n",deviceProp.name); + + RESTART = 0; + + seed = (unsigned long *)calloc(3,sizeof(unsigned long)); + fseed = fopen("seed.dat","r"); + if (fseed == NULL){ + printf("No seed.dat found; using [ 1 2 3 ]\n"); + seed[0]=1L;seed[1]=2L;seed[2]=2L; + } else { + rtn = fscanf(fseed,"%lu %lu %lu\n",&(seed[0]),&(seed[1]),&(seed[2])); + if (rtn==0) + exit(1); + rtn = fclose(fseed); + if (rtn != 0) + exit(1); + } + + + logit_factor = 1.0f; + t_df = 1000.0f; + if (MODEL == 0) { + logit_factor = 0.634f; + t_df = 8.0f; + } + if (MODEL == 2) + t_df = 3.0f; + + + +// msk = read_nifti1_mask("./images/avg152T1_highres_brain.hdr", +// "./images/avg152T1_highres_brain.img"); + msk = read_nifti1_mask("./images/mask.nii"); + + WM = read_nifti1_WM(msk); + data = read_nifti1_image(msk,argv[4]); + + if (RESTART) + BURNIN = 1; + + + mskp = (unsigned char *)calloc((NROW+2)*(NCOL+2)*(NDEPTH+2),sizeof(unsigned char)); + for (int k=1;k INDX[i].hostN) ? max:INDX[i].hostN; +//printf("max = %d\n",max); + for (int i=0;i<2;i++) + CUDA_CALL( cudaMemcpy(INDX[i].deviceVox,INDX[i].hostVox,INDX[i].hostN*sizeof(int),cudaMemcpyHostToDevice) ); + for (int i=0;i<2;i++) + CUDA_CALL( cudaMemcpy(INDX[i].deviceNBRS,INDX[i].hostNBRS,INDX[i].hostN*sizeof(unsigned char),cudaMemcpyHostToDevice) ); + CUDA_CALL( cudaMemcpy(deviceIdx,hostIdx,NSIZE*sizeof(int),cudaMemcpyHostToDevice) ); + CUDA_CALL( cudaMemcpy(deviceIdxSC,hostIdxSC,NSIZE*sizeof(int),cudaMemcpyHostToDevice) ); + } + + if (GPU) { + __global__ void setup_kernel(curandState *,unsigned long long, const int); + unsigned long long devseed; +// devseed = (unsigned long long)runiform_long_n(18446744073709551615ULL,seed); + devseed = (unsigned long long)runiform_long_n(ULONG_MAX,seed); + CUDA_CALL( cudaMalloc (( void **) &devStates , max * sizeof ( curandState ) ) ); + setup_kernel<<<512,512 >>>(devStates,devseed,max); + // cutilCheckMsg("setup_kernel failed:"); + } + + write_empir_prb(msk,covar,data,hostIdx); + + mcmc(covar,hostCov_Fix,data,WM,msk,seed); + + free(msk); + free(mskp); + free(WM); + free(data); + + + + fseed = fopen("seed.dat","w"); + fprintf(fseed,"%lu %lu %lu\n",seed[0],seed[1],seed[2]); + fclose(fseed); + free(seed); + +// cudaFree(deviceChiSqHist); + if (GPU) { + free(hostIdx); + cudaFree(deviceIdx); + free(hostIdxSC); + cudaFree(deviceIdxSC); + for (int i=0;i<2;i++) { + free(INDX[i].hostVox); + cudaFree(INDX[i].deviceVox); + free(INDX[i].hostNBRS); + cudaFree(INDX[i].deviceNBRS); + } + cudaFree(deviceCovar); + if (NCOV_FIX > 0) { + cudaFree(deviceCov_Fix); + } + } + else { + free(covar); + if (NCOV_FIX > 0) + free(hostCov_Fix); + free(hostIdx); + free(deviceIdx); + free(hostIdxSC); + free(deviceIdxSC); + for (int i=0;i<2;i++) { + free(INDX[i].hostVox); + free(INDX[i].hostNBRS); + } + } + free(INDX); + + return 0; +} + + +void cnt_nbrs(int idx,int iidx,int *cnt,unsigned char *msk) +{ + int tmp; + + 
INDX[idx].hostVox[*cnt] = iidx; + + tmp = iidx-1; + if (msk[tmp]) + INDX[idx].hostNBRS[*cnt]++; + + tmp = iidx+1; + if (msk[tmp]) + INDX[idx].hostNBRS[*cnt]++; + + tmp = iidx - (NROW+2); + if (msk[tmp]) + INDX[idx].hostNBRS[*cnt]++; + + tmp = iidx + (NROW+2); + if (msk[tmp]) + INDX[idx].hostNBRS[*cnt]++; + + tmp = iidx - (NROW+2)*(NCOL+2); + if (msk[tmp]) + INDX[idx].hostNBRS[*cnt]++; + + tmp = iidx + (NROW+2)*(NCOL+2); + if (msk[tmp]) + INDX[idx].hostNBRS[*cnt]++; + + (*cnt)++; +} diff --git a/mcmc.cpp b/mcmc.cpp new file mode 100644 index 0000000..284ef5f --- /dev/null +++ b/mcmc.cpp @@ -0,0 +1,1312 @@ +/* + * mcmc.cpp + * + * Created by Timothy Johnson on 4/14/12. + * Copyright 2012 University of Michigan. All rights reserved. + * + */ + +#include +#include +#include +#include +#include +#include +#include +//#include +#include +#include "binCAR.h" +#include "randgen.h" +#include "cholesky.h" +#include + +typedef double MY_DATATYPE; +extern float logit_factor; +extern float t_df; +extern double *grp_prior; + +int iter; +extern int MODEL; +extern int GPU; +extern int MAXITER; +extern int BURNIN; +extern int NSUBS; +extern int NROW; +extern int NCOL; +extern int TOTVOX; +extern int TOTVOXp; +extern int NDEPTH; +extern int NSUBTYPES; +extern int NCOVAR; +extern int NCOV_FIX; +extern int RESTART; +extern int *hostIdx; +extern int *deviceIdx; +extern int *hostIdxSC; +extern int *deviceIdxSC; +float *deviceSpatCoef; +extern float *deviceCovar; +extern float *deviceCov_Fix; +extern float *XXprime; +unsigned char *deviceData; +unsigned int *deviceResid; +extern char **SS; + +//short *deviceData; +float *deviceWM; +float *deviceAlphaMean; +float *deviceFixMean; +float *deviceZ; +float *devicePhi; +float *devicePrb; +float *devicePredict; +float sqrt2pi = 2.506628274631; +float NCNT; +int MIX = 0; + +int NSIZE; + +#define idx(i,j,k) (i + (NROW+2)*j + (NROW+2)*(NCOL+2)*k) +#define idx2(i,j,k) (i + (NROW)*j + (NROW)*(NCOL)*k) + +#define CUDA_CALL(x) {const cudaError_t a = (x); if (a != cudaSuccess) {printf("\nCUDA Error: %s (err_num=%d) \n",cudaGetErrorString(a),a);cudaDeviceReset();assert(0);}} + +void mcmc(float *covar,float *covar_fix,unsigned char *data,float *WM,unsigned char *msk,unsigned long *seed) +{ + int TOTSAV=0,IDX,PRNtITER,SAVE_ITER,FIRST; + float *Z,*Phi,*alpha,*prb,*predict; + double *Mprb,*MCov,*SCov,*MWM,**Qhat,ED=0; + float beta=0,prior_mean_beta=0,prior_prec_beta=0.000001f; + float *SpatCoefPrec,*SpatCoefMean,*spatCoef; + + float *alphaMean,*alphaPrec;//scarPrec=1; + float *fixMean,*fixPrec;//scarPrec=1; + float prior_alphaPrec_A=3.0f; + float prior_alphaPrec_B=2.0f; +// float prior_carPrec_A=3.0,prior_carPrec_B=2.0; + float Prec = 0.000001f; + SUB *subj; + char *S; + +// float *hostWM; + FILE *out,*out2,*fWM,*fcoef,*fcoefsd,*fBF,*fresid; + + void itoa(int,char *); +// unsigned char *get_neighbors(); + void initializeAlpha(unsigned char *,float *,float *,float *,unsigned long *); + void initializeZ(float *,unsigned char *,unsigned char *,unsigned long *); + void initializePhi(float *,unsigned char *,unsigned char *,float,unsigned long *); + void updateZGPU(unsigned char *,float,unsigned long *); + void updateZ(float *,float *,unsigned char *,unsigned char *,float, float *,float *,float *,unsigned long *); + void updatePhiGPU(float,float *); + void updatePhi(unsigned char *data,float *Z,float *Phi,unsigned char *msk,float beta,float *WM,float *spatCoef,float *covar,unsigned long *seed); + void updateAlpha(float *,float ,float ,float *,unsigned char *,float,float *,float *,float 
*,int,unsigned long *); + void updateAlphaGPU(float *,float *,float *,float *,unsigned char *,float,float *,float *,float *,unsigned long *); + void updateAlphaPrecGPU(float *,float *,float,float,float,unsigned long *); + void updateAlphaPrec(float *,float *,float,float,float,unsigned char *,unsigned long *); + void updateAlphaMeanGPU(float *,float,float *,float *,float *,float,unsigned long *); + void updatefixMeanGPU(float *,float,float *,float *,float *,float,unsigned long *); + void updateAlphaMean(float *,float *,float *,float,float *,float *,float,unsigned char *,unsigned long *); + void compute_prbGPU(float); + void compute_prb(float *,float *,float *,float,float *,unsigned char *); + void updateBeta(float *,float *,float *,float *,unsigned char *,float,float,float *,float *,unsigned long *); + void updateBetaGPU(float *,float *,float *,float *,float *,float,float,float *,float *,unsigned long *); + void updateSpatCoefGPU(float *,float *,float *,float *,float *,float *,float *,unsigned char *,float,float *,unsigned long *); + void updateSpatCoef(float *,float *,float *,float *,float *,float *,unsigned char *,float,float *,unsigned long *); + void updateSpatCoefMean(float *,float *,float,float,unsigned char *,float **,float *,float *,float *,float,int,unsigned long *); + void updateSpatCoefPrec(float *,float *,unsigned char *,unsigned long *); + void updateSpatCoefPrecGPU(float *,unsigned long *); + void updateSpatCoefPrecGPU_Laplace(float *,unsigned long *); + void save_iter(float,float *,float *,float *,float *,float *,float *,float *,float *,float *); + void restart_iter(float *,float *,float *,float *,float *,float *,float *,float *,float *,float*); + int write_nifti_file(int NROW,int NCOL,int NDEPTH,int NTIME,char *hdr_file,char *data_file,MY_DATATYPE *data); + double compute_predictGPU(float,unsigned char *,unsigned char *msk,float *,float *,double **Qhat,int); + double compute_prb_DIC(double *,double *,float *,unsigned char *,unsigned char *,int); + +// void tmp(float *,float *); + fresid = fopen("resid.dat","w"); + + NSIZE = (NROW+2)*(NCOL+2)*(NDEPTH+2); + int RSIZE = NROW*NCOL*NDEPTH; +// printf("%d %d \n",TOTVOX,NSIZE); + + if (GPU) { + CUDA_CALL( cudaMalloc((void **)&deviceData,NSUBS*TOTVOX*sizeof(unsigned char)) ); + // unsigned char *data2; + // data2 = (unsigned char *)calloc(NSUBS*TOTVOX,sizeof(unsigned char)); + // for (int i=0;i 0) + fixMean = (float *)calloc(NCOV_FIX,sizeof(float)); + + alphaPrec = (float *)calloc(NSUBTYPES,sizeof(float)); + for (int i = 0;i 0) { + CUDA_CALL( cudaMalloc((void **)&deviceFixMean,NCOV_FIX*sizeof(float)) ); + CUDA_CALL( cudaMemset(deviceFixMean,0,NCOV_FIX*sizeof(float)) ); + } + } +// else +// alpha = (float *)calloc(TOTVOX*NSUBTYPES,sizeof(float)); + +/* if (!RESTART) { + initializeAlpha(msk,alphaMean,alphaPrec,alpha,seed); + if (GPU) + cudaMemcpy(deviceAlpha,alpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyHostToDevice); + }*/ + +// Malpha = (double *)calloc(NSUBTYPES*RSIZE,sizeof(double)); + MWM = (double *)calloc(RSIZE,sizeof(double)); + Mprb = (double *)calloc(NSUBTYPES*RSIZE,sizeof(double)); + MCov = (double *)calloc(NCOVAR*RSIZE,sizeof(double)); + SCov = (double *)calloc(NCOVAR*RSIZE,sizeof(double)); + + Qhat = (double **)calloc(NSUBS,sizeof(double *)); + for (int i=0;i 0) + updatefixMeanGPU(fixMean,Prec,deviceCovar,deviceZ,deviceSpatCoef,beta,seed); + // updateAlphaMeanGPU(alphaMean,Prec,deviceCovar,deviceZ,deviceSpatCoef,beta,seed); + } + else { + if (!(iter%PRNtITER)) + CUDA_CALL( cudaEventRecord(start, 0) ); + + // for (int 
tj=0;tj<10;tj++) + updateZ(Z,alpha,data,msk,beta,WM,spatCoef,covar,seed); + if (!(iter%PRNtITER)) { + CUDA_CALL( cudaEventRecord(stop, 0) ); + CUDA_CALL( cudaEventSynchronize(stop) ); + CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + printf ("Time for the updateZCPU kernel: %f ms\n", time); + } + // CUDA_CALL( cudaEventRecord(start, 0) ); + // for (int tj=0;tj<10;tj++) + updateBeta(&beta,Z,alpha,WM,msk,prior_mean_beta,prior_prec_beta,spatCoef,covar,seed); + // CUDA_CALL( cudaEventRecord(stop, 0) ); + // CUDA_CALL( cudaEventSynchronize(stop) ); + // CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + // printf ("Time for the updateBetaCPU kernel: %f ms\n", time); + + if (!(iter%PRNtITER)) + CUDA_CALL( cudaEventRecord(start, 0) ); + // for (int tj=0;tj<10;tj++) + updateSpatCoef(covar,spatCoef,SpatCoefPrec,alpha,alphaMean,Z,msk,beta,WM,seed); + if (!(iter%PRNtITER)) { + CUDA_CALL( cudaEventRecord(stop, 0) ); + CUDA_CALL( cudaEventSynchronize(stop) ); + CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + printf ("Time for the updateSpatCoefCPU kernel: %f ms\n", time); + } + if (!(iter%PRNtITER)) + CUDA_CALL( cudaEventRecord(start, 0) ); + // for (int tj=0;tj<10;tj++) + updateSpatCoefPrec(SpatCoefPrec,spatCoef,msk,seed); + if (!(iter%PRNtITER)) { + CUDA_CALL( cudaEventRecord(stop, 0) ); + CUDA_CALL( cudaEventSynchronize(stop) ); + CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + printf ("Time for the updateSpatCoefPrecCPU kernel: %f ms\n\n", time); + } + + // CUDA_CALL( cudaEventRecord(start, 0) ); + // for (int tj=0;tj<10;tj++) + // compute_prb(prb,alpha,spatCoef,beta,WM,msk); + // CUDA_CALL( cudaEventRecord(stop, 0) ); + // CUDA_CALL( cudaEventSynchronize(stop) ); + /// CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + // printf ("Time for the compute_prbCPU kernel: %f ms\n\n", time); + } + +/* if (!(iter%PRNtITER) && iter > 0) { + CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); + int i = 56;//57; + int j = 77;//46; + int k = 19;//72; + IDX = idx(i,j,k); + + for (int ii=NSUBTYPES;ii (double)(spatCoef[4*TOTVOXp+hostIdxSC[IDX]]/logit_factor)) ? 
+ max:(double)(spatCoef[4*TOTVOXp+hostIdxSC[IDX]]/logit_factor); + } + } + } + printf("max = %lf\n",max); */ + + double *V = (double *)calloc(NCOVAR*NCOVAR,sizeof(double)); + for (int ii=0;ii BURNIN) && (!(iter%SAVE_ITER))) { +// if ((iter > 0)) { + // compute probability of lesion + + if (GPU) { + if (TOTSAV == 0) + FIRST = 1; + else + FIRST = 0; + if (!(iter%PRNtITER)) + cudaEventRecord(start, 0); + ED += compute_predictGPU(beta,data,msk,covar,predict,Qhat,FIRST); + if (!(iter%PRNtITER)) { + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); + printf ("Time to compute_predictGPU: %f ms\n", time); + } + + if (!(iter%PRNtITER)) + cudaEventRecord(start, 0); + compute_prbGPU(beta); + CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); + CUDA_CALL( cudaMemcpy(prb,devicePrb,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); + if (!(iter%PRNtITER)) { + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); + printf ("Time to compute_prbGPU: %f ms\n\n", time); + } + } + else { + if (!(iter%PRNtITER)) + cudaEventRecord(start, 0); + compute_prb(prb,alpha,spatCoef,beta,WM,msk); + if (!(iter%PRNtITER)) { + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); + printf ("Time to compute_prbGPU: %f ms\n\n", time); + } + } + + for (int k=1;k 1) { + if (GPU) { + CUDA_CALL( cudaMemcpy(Z,deviceZ,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); + CUDA_CALL( cudaMemcpy(Phi,devicePhi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); +// cudaMemcpy(alpha,deviceAlpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost); + CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); + } + save_iter(beta,fixMean,alphaMean,alphaPrec,SpatCoefMean,SpatCoefPrec,spatCoef,alpha,Z,Phi); + }*/ + } +// CUDA_CALL( cudaEventRecord(stop, 0) ); +// CUDA_CALL( cudaEventSynchronize(stop) ); +// CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); +// printf ("Time for the updateZGPU kernel: %f sec.\n", time/1000.0); + + fclose(fBF); + +// unsigned int *Resid; +// double *ResidMap; +// Resid = (unsigned int *)calloc(NSUBS*TOTVOX,sizeof(unsigned int)); +// ResidMap = (double *)calloc(RSIZE,sizeof(double)); + if (GPU) { + CUDA_CALL( cudaMemcpy(Z,deviceZ,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); +// CUDA_CALL( cudaMemcpy(Phi,devicePhi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); +// cudaMemcpy(alpha,deviceAlpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost); + CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); + +// CUDA_CALL( cudaMemcpy(Resid,deviceResid,NSUBS*TOTVOX*sizeof(unsigned int),cudaMemcpyDeviceToHost) ); + } +// save_iter(beta,fixMean,alphaMean,alphaPrec,SpatCoefMean,SpatCoefPrec,spatCoef,alpha,Z,Phi); +/* for (int isub=0;isub 0.05) { + fprintf(fresid,"%d %d %d %d %lf\n",isub,i,j,k,tmp); + ResidMap[IDX2]++; + } + } + } + } + } + } + free(Resid); + fclose(out); + fclose(out2); + fclose(fresid);*/ + + char *RR = (char *)calloc(500,sizeof(char)); + S = (char *)calloc(100,sizeof(char)); +// S = strcpy(S,"ResidMap.nii"); +// int rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,ResidMap); +// free(ResidMap); + int rtn; +// RR = strcpy(RR,"gzip -f "); +// RR = strcat(RR,(const char *)S); +// rtn = system(RR); + + printf("TOTSAV = %d\n",TOTSAV); + + out = fopen("Qhat.dat","w"); + for (int isub=0;isub0) + 
standCoef[IDX] = MCov[ii*RSIZE+IDX]/sqrt(tmp); + } + } + } + + S = strcpy(S,"spatCoef_"); + S = strcat(S,SS[ii]); + S = strcat(S,".nii"); + int rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,&MCov[ii*RSIZE]); + + RR = strcpy(RR,"gzip -f "); + RR = strcat(RR,(const char *)S); + rtn = system(RR); + + S = strcpy(S,"spatCoef_"); + S = strcat(S,SS[ii]); + S = strcat(S,".Var.nii"); + rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,&SCov[ii*RSIZE]); + + RR = strcpy(RR,"gzip -f "); + RR = strcat(RR,(const char *)S); + rtn = system(RR); + +/* for (int k=0;k Date: Tue, 20 Jan 2015 12:12:37 +0000 Subject: [PATCH 04/13] fix bug in SAVE_ITER (must never be zero) --- bsglmm/mcmc.cpp | 222 ++++++++++++++++++++++++------------------------ 1 file changed, 113 insertions(+), 109 deletions(-) diff --git a/bsglmm/mcmc.cpp b/bsglmm/mcmc.cpp index 3475a44..4ebbd9b 100644 --- a/bsglmm/mcmc.cpp +++ b/bsglmm/mcmc.cpp @@ -87,10 +87,10 @@ void mcmc(float *covar,float *covar_fix,unsigned char *data,float *WM,unsigned c float Prec = 0.000001f; SUB *subj; char *S; - + // float *hostWM; FILE *out,*out2,*fWM,*fcoef,*fcoefsd,*fBF,*fresid; - + void itoa(int,char *); // unsigned char *get_neighbors(); void initializeAlpha(unsigned char *,float *,float *,float *,unsigned long *); @@ -99,7 +99,7 @@ void mcmc(float *covar,float *covar_fix,unsigned char *data,float *WM,unsigned c void updateZGPU(unsigned char *,float,unsigned long *); void updateZ(float *,float *,unsigned char *,unsigned char *,float, float *,float *,float *,unsigned long *); void updatePhiGPU(float,float *); - void updatePhi(unsigned char *data,float *Z,float *Phi,unsigned char *msk,float beta,float *WM,float *spatCoef,float *covar,unsigned long *seed); + void updatePhi(unsigned char *data,float *Z,float *Phi,unsigned char *msk,float beta,float *WM,float *spatCoef,float *covar,unsigned long *seed); void updateAlpha(float *,float ,float ,float *,unsigned char *,float,float *,float *,float *,int,unsigned long *); void updateAlphaGPU(float *,float *,float *,float *,unsigned char *,float,float *,float *,float *,unsigned long *); void updateAlphaPrecGPU(float *,float *,float,float,float,unsigned long *); @@ -125,7 +125,7 @@ void mcmc(float *covar,float *covar_fix,unsigned char *data,float *WM,unsigned c // void tmp(float *,float *); fresid = fopen("resid.dat","w"); - + NSIZE = (NROW+2)*(NCOL+2)*(NDEPTH+2); int RSIZE = NROW*NCOL*NDEPTH; // printf("%d %d \n",TOTVOX,NSIZE); @@ -138,11 +138,11 @@ void mcmc(float *covar,float *covar_fix,unsigned char *data,float *WM,unsigned c // if (data[i] == 1) // data2[i] = 2; // } - // CUDA_CALL( cudaMemcpy(deviceData,data2,NSUBS*TOTVOX*sizeof(unsigned char),cudaMemcpyHostToDevice) ); - CUDA_CALL( cudaMemcpy(deviceData,data,NSUBS*TOTVOX*sizeof(unsigned char),cudaMemcpyHostToDevice) ); + // CUDA_CALL( cudaMemcpy(deviceData,data2,NSUBS*TOTVOX*sizeof(unsigned char),cudaMemcpyHostToDevice) ); + CUDA_CALL( cudaMemcpy(deviceData,data,NSUBS*TOTVOX*sizeof(unsigned char),cudaMemcpyHostToDevice) ); // free(data2); } - + SpatCoefMean = (float *)calloc(NCOVAR,sizeof(float)); SpatCoefPrec = (float *)calloc(NCOVAR*NCOVAR,sizeof(float)); for (int i=0;i (double)(spatCoef[4*TOTVOXp+hostIdxSC[IDX]]/logit_factor)) ? 
max:(double)(spatCoef[4*TOTVOXp+hostIdxSC[IDX]]/logit_factor); } } } - printf("max = %lf\n",max); */ - + printf("max = %lf\n",max); */ + double *V = (double *)calloc(NCOVAR*NCOVAR,sizeof(double)); for (int ii=0;ii 1) { if (GPU) { CUDA_CALL( cudaMemcpy(Z,deviceZ,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); @@ -646,17 +650,17 @@ void mcmc(float *covar,float *covar_fix,unsigned char *data,float *WM,unsigned c // printf ("Time for the updateZGPU kernel: %f sec.\n", time/1000.0); fclose(fBF); - + // unsigned int *Resid; // double *ResidMap; -// Resid = (unsigned int *)calloc(NSUBS*TOTVOX,sizeof(unsigned int)); -// ResidMap = (double *)calloc(RSIZE,sizeof(double)); +// Resid = (unsigned int *)calloc(NSUBS*TOTVOX,sizeof(unsigned int)); +// ResidMap = (double *)calloc(RSIZE,sizeof(double)); if (GPU) { CUDA_CALL( cudaMemcpy(Z,deviceZ,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); // CUDA_CALL( cudaMemcpy(Phi,devicePhi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); // cudaMemcpy(alpha,deviceAlpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost); CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); - + // CUDA_CALL( cudaMemcpy(Resid,deviceResid,NSUBS*TOTVOX*sizeof(unsigned int),cudaMemcpyDeviceToHost) ); } // save_iter(beta,fixMean,alphaMean,alphaPrec,SpatCoefMean,SpatCoefPrec,spatCoef,alpha,Z,Phi); @@ -697,7 +701,7 @@ void mcmc(float *covar,float *covar_fix,unsigned char *data,float *WM,unsigned c printf("TOTSAV = %d\n",TOTSAV); - out = fopen("Qhat.dat","w"); + out = fopen("Qhat.dat","w"); for (int isub=0;isub Date: Tue, 20 Jan 2015 12:17:08 +0000 Subject: [PATCH 05/13] synchronise with changes I made on goldfinch --- bsglmm/mcmc.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bsglmm/mcmc.cpp b/bsglmm/mcmc.cpp index 4ebbd9b..42cc507 100644 --- a/bsglmm/mcmc.cpp +++ b/bsglmm/mcmc.cpp @@ -295,10 +295,10 @@ void mcmc(float *covar,float *covar_fix,unsigned char *data,float *WM,unsigned c CUDA_CALL( cudaEventCreate(&start) ); CUDA_CALL( cudaEventCreate(&stop) ); //cudaEventRecord(start,0); - - SAVE_ITER = (MAXITER - BURNIN)/10000; - if (SAVE_ITER==0) - SAVE_ITER = 100; + + SAVE_ITER = (MAXITER - BURNIN)/10000; + if (SAVE_ITER==0) + SAVE_ITER = 100; PRNtITER = 100; From 3b77a383744d612b5822d530c68c813f7f08a862 Mon Sep 17 00:00:00 2001 From: Bernd Taschler Date: Tue, 20 Jan 2015 12:27:38 +0000 Subject: [PATCH 06/13] incorporate changes made to main.cu (more verbose error massages) --- bsglmm/main.cu | 46 +++++++++++++++++++++++++++++++++++----------- bsglmm/mcmc.cpp | 4 ++-- 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/bsglmm/main.cu b/bsglmm/main.cu index 1d65db8..acf7041 100644 --- a/bsglmm/main.cu +++ b/bsglmm/main.cu @@ -27,8 +27,8 @@ float *covar; int NSUBTYPES; int NCOVAR; int NCOV_FIX = 0; -int MAXITER = 1000000; -int BURNIN = 500000; +int MAXITER = 100000; +int BURNIN = 50000; int RESTART; float *deviceCovar; float *hostCovar; @@ -69,12 +69,31 @@ int main (int argc, char * const argv[]) { void write_empir_prb(unsigned char *,float *,unsigned char *,int *); unsigned char *get_WM_mask(float *,unsigned char *); - if (argc !=5) - exit(0); + if (argc !=5 && argc !=7) { + printf("%s: Usage\n",argv[0]); + printf("%s NTypes NCov GPU Design [MaxIter BurnIn] \n",argv[0]); + printf(" NTypes - Number of groups\n",argv[0]); + printf(" NCov - Number of covariates (count must include groups)\n",argv[0]); + printf(" GPU - 1 use GPU; 0 use CPU\n",argv[0]); + printf(" Design - Text file, tab 
or space separated data file\n",argv[0]); + printf(" MaxIter - Number of iterations (defaults to 1,000,000)\n",argv[0]); + printf(" BurnIn - Number of burn-in iterations (defaults to 500,000)\n",argv[0]); + printf("For documentation see: http://warwick.ac.uk/tenichols/BSGLMM\n",argv[0]); + exit(1); + } NSUBTYPES = atoi(argv[1]); NCOVAR = atoi(argv[2]); GPU = atoi(argv[3]); + if (argc >5) { + MAXITER = atoi(argv[5]); + BURNIN = atoi(argv[6]); + if (MAXITER<1000) + printf("WARNING: Silly-small number of iterations used; recommend abort and use more\n"); + if (BURNIN>MAXITER) + printf("WARNING: Burn-in exceeds MAXITER, no results will be saved\n"); + } + RESTART = 0; int deviceCount = 0; @@ -113,6 +132,18 @@ int main (int argc, char * const argv[]) { seed = (unsigned long *)calloc(3,sizeof(unsigned long)); fseed = fopen("seed.dat","r"); + if (fseed == NULL){ + printf("No seed.dat found; using [ 1 2 3 ]\n"); + seed[0]=1L;seed[1]=2L;seed[2]=2L; + } else { + rtn = fscanf(fseed,"%lu %lu %lu\n",&(seed[0]),&(seed[1]),&(seed[2])); + if (rtn==0) + exit(1); + rtn = fclose(fseed); + if (rtn != 0) + exit(1); + } + logit_factor = 1.0f; t_df = 1000.0f; @@ -124,13 +155,6 @@ int main (int argc, char * const argv[]) { t_df = 3.0f; - rtn = fscanf(fseed,"%lu %lu %lu\n",&(seed[0]),&(seed[1]),&(seed[2])); - if (rtn==0) - exit(0); - rtn = fclose(fseed); - if (rtn != 0) - exit(0); - // msk = read_nifti1_mask("./images/avg152T1_highres_brain.hdr", // "./images/avg152T1_highres_brain.img"); diff --git a/bsglmm/mcmc.cpp b/bsglmm/mcmc.cpp index 42cc507..0e63450 100644 --- a/bsglmm/mcmc.cpp +++ b/bsglmm/mcmc.cpp @@ -296,9 +296,9 @@ void mcmc(float *covar,float *covar_fix,unsigned char *data,float *WM,unsigned c CUDA_CALL( cudaEventCreate(&stop) ); //cudaEventRecord(start,0); - SAVE_ITER = (MAXITER - BURNIN)/10000; + SAVE_ITER = (MAXITER - BURNIN)/10000; if (SAVE_ITER==0) - SAVE_ITER = 100; + SAVE_ITER = 100; PRNtITER = 100; From 734c34dd5371927f3cbaf57ed213f5790e9e8381 Mon Sep 17 00:00:00 2001 From: nicholst Date: Tue, 20 Jan 2015 13:34:52 +0000 Subject: [PATCH 07/13] Getting rid of carriage returns (and maybe updating a fix for the low-iter bug) --- bsglmm/mcmc.cpp | 2620 +++++++++++++++++++++++------------------------ 1 file changed, 1310 insertions(+), 1310 deletions(-) diff --git a/bsglmm/mcmc.cpp b/bsglmm/mcmc.cpp index 0e63450..807b24f 100644 --- a/bsglmm/mcmc.cpp +++ b/bsglmm/mcmc.cpp @@ -1,1316 +1,1316 @@ -/* - * mcmc.cpp - * - * Created by Timothy Johnson on 4/14/12. - * Copyright 2012 University of Michigan. All rights reserved. 
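/* (Aside on the SAVE_ITER fix in PATCH 04/05 above, not part of this hunk:
 * the thinning interval is used as a modulus every iteration, so it must
 * never be zero.  A minimal standalone sketch of the guarded logic, reusing
 * the variable names from mcmc.cpp:) */
SAVE_ITER = (MAXITER - BURNIN)/10000;   /* aim for roughly 10,000 saved draws  */
if (SAVE_ITER == 0)                     /* short runs would otherwise yield 0  */
    SAVE_ITER = 100;                    /* and iter % 0 is undefined behaviour */
/* ... later, inside the sampling loop: */
if ((iter > BURNIN) && !(iter % SAVE_ITER)) {
    /* accumulate posterior summaries for this draw */
}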
- * - */ - -#include -#include -#include -#include -#include -#include -#include -//#include -#include -#include "binCAR.h" -#include "randgen.h" -#include "cholesky.h" -#include - -typedef double MY_DATATYPE; -extern float logit_factor; -extern float t_df; -extern double *grp_prior; - -int iter; -extern int MODEL; -extern int GPU; -extern int MAXITER; -extern int BURNIN; -extern int NSUBS; -extern int NROW; -extern int NCOL; -extern int TOTVOX; -extern int TOTVOXp; -extern int NDEPTH; -extern int NSUBTYPES; -extern int NCOVAR; -extern int NCOV_FIX; -extern int RESTART; -extern int *hostIdx; -extern int *deviceIdx; -extern int *hostIdxSC; -extern int *deviceIdxSC; -float *deviceSpatCoef; -extern float *deviceCovar; -extern float *deviceCov_Fix; -extern float *XXprime; -unsigned char *deviceData; -unsigned int *deviceResid; -extern char **SS; - -//short *deviceData; -float *deviceWM; -float *deviceAlphaMean; -float *deviceFixMean; -float *deviceZ; -float *devicePhi; -float *devicePrb; -float *devicePredict; -float sqrt2pi = 2.506628274631; -float NCNT; -int MIX = 0; - -int NSIZE; - -#define idx(i,j,k) (i + (NROW+2)*j + (NROW+2)*(NCOL+2)*k) -#define idx2(i,j,k) (i + (NROW)*j + (NROW)*(NCOL)*k) - -#define CUDA_CALL(x) {const cudaError_t a = (x); if (a != cudaSuccess) {printf("\nCUDA Error: %s (err_num=%d) \n",cudaGetErrorString(a),a);cudaDeviceReset();assert(0);}} - -void mcmc(float *covar,float *covar_fix,unsigned char *data,float *WM,unsigned char *msk,unsigned long *seed) -{ - int TOTSAV=0,IDX,PRNtITER,SAVE_ITER,FIRST; - float *Z,*Phi,*alpha,*prb,*predict; - double *Mprb,*MCov,*SCov,*MWM,**Qhat,ED=0; - float beta=0,prior_mean_beta=0,prior_prec_beta=0.000001f; - float *SpatCoefPrec,*SpatCoefMean,*spatCoef; - - float *alphaMean,*alphaPrec;//scarPrec=1; - float *fixMean,*fixPrec;//scarPrec=1; - float prior_alphaPrec_A=3.0f; - float prior_alphaPrec_B=2.0f; -// float prior_carPrec_A=3.0,prior_carPrec_B=2.0; - float Prec = 0.000001f; - SUB *subj; - char *S; - -// float *hostWM; - FILE *out,*out2,*fWM,*fcoef,*fcoefsd,*fBF,*fresid; - - void itoa(int,char *); -// unsigned char *get_neighbors(); - void initializeAlpha(unsigned char *,float *,float *,float *,unsigned long *); - void initializeZ(float *,unsigned char *,unsigned char *,unsigned long *); - void initializePhi(float *,unsigned char *,unsigned char *,float,unsigned long *); - void updateZGPU(unsigned char *,float,unsigned long *); - void updateZ(float *,float *,unsigned char *,unsigned char *,float, float *,float *,float *,unsigned long *); - void updatePhiGPU(float,float *); - void updatePhi(unsigned char *data,float *Z,float *Phi,unsigned char *msk,float beta,float *WM,float *spatCoef,float *covar,unsigned long *seed); - void updateAlpha(float *,float ,float ,float *,unsigned char *,float,float *,float *,float *,int,unsigned long *); - void updateAlphaGPU(float *,float *,float *,float *,unsigned char *,float,float *,float *,float *,unsigned long *); - void updateAlphaPrecGPU(float *,float *,float,float,float,unsigned long *); - void updateAlphaPrec(float *,float *,float,float,float,unsigned char *,unsigned long *); - void updateAlphaMeanGPU(float *,float,float *,float *,float *,float,unsigned long *); - void updatefixMeanGPU(float *,float,float *,float *,float *,float,unsigned long *); - void updateAlphaMean(float *,float *,float *,float,float *,float *,float,unsigned char *,unsigned long *); - void compute_prbGPU(float); - void compute_prb(float *,float *,float *,float,float *,unsigned char *); - void updateBeta(float *,float *,float 
*,float *,unsigned char *,float,float,float *,float *,unsigned long *); - void updateBetaGPU(float *,float *,float *,float *,float *,float,float,float *,float *,unsigned long *); - void updateSpatCoefGPU(float *,float *,float *,float *,float *,float *,float *,unsigned char *,float,float *,unsigned long *); - void updateSpatCoef(float *,float *,float *,float *,float *,float *,unsigned char *,float,float *,unsigned long *); - void updateSpatCoefMean(float *,float *,float,float,unsigned char *,float **,float *,float *,float *,float,int,unsigned long *); - void updateSpatCoefPrec(float *,float *,unsigned char *,unsigned long *); - void updateSpatCoefPrecGPU(float *,unsigned long *); - void updateSpatCoefPrecGPU_Laplace(float *,unsigned long *); - void save_iter(float,float *,float *,float *,float *,float *,float *,float *,float *,float *); - void restart_iter(float *,float *,float *,float *,float *,float *,float *,float *,float *,float*); - int write_nifti_file(int NROW,int NCOL,int NDEPTH,int NTIME,char *hdr_file,char *data_file,MY_DATATYPE *data); - double compute_predictGPU(float,unsigned char *,unsigned char *msk,float *,float *,double **Qhat,int); - double compute_prb_DIC(double *,double *,float *,unsigned char *,unsigned char *,int); - -// void tmp(float *,float *); - fresid = fopen("resid.dat","w"); - - NSIZE = (NROW+2)*(NCOL+2)*(NDEPTH+2); - int RSIZE = NROW*NCOL*NDEPTH; -// printf("%d %d \n",TOTVOX,NSIZE); - - if (GPU) { - CUDA_CALL( cudaMalloc((void **)&deviceData,NSUBS*TOTVOX*sizeof(unsigned char)) ); - // unsigned char *data2; - // data2 = (unsigned char *)calloc(NSUBS*TOTVOX,sizeof(unsigned char)); - // for (int i=0;i 0) - fixMean = (float *)calloc(NCOV_FIX,sizeof(float)); - - alphaPrec = (float *)calloc(NSUBTYPES,sizeof(float)); - for (int i = 0;i 0) { - CUDA_CALL( cudaMalloc((void **)&deviceFixMean,NCOV_FIX*sizeof(float)) ); - CUDA_CALL( cudaMemset(deviceFixMean,0,NCOV_FIX*sizeof(float)) ); - } - } -// else -// alpha = (float *)calloc(TOTVOX*NSUBTYPES,sizeof(float)); - -/* if (!RESTART) { - initializeAlpha(msk,alphaMean,alphaPrec,alpha,seed); - if (GPU) - cudaMemcpy(deviceAlpha,alpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyHostToDevice); - }*/ - -// Malpha = (double *)calloc(NSUBTYPES*RSIZE,sizeof(double)); - MWM = (double *)calloc(RSIZE,sizeof(double)); - Mprb = (double *)calloc(NSUBTYPES*RSIZE,sizeof(double)); - MCov = (double *)calloc(NCOVAR*RSIZE,sizeof(double)); - SCov = (double *)calloc(NCOVAR*RSIZE,sizeof(double)); - - Qhat = (double **)calloc(NSUBS,sizeof(double *)); - for (int i=0;i +#include +#include +#include +#include +#include +#include +//#include +#include +#include "binCAR.h" +#include "randgen.h" +#include "cholesky.h" +#include + +typedef double MY_DATATYPE; +extern float logit_factor; +extern float t_df; +extern double *grp_prior; + +int iter; +extern int MODEL; +extern int GPU; +extern int MAXITER; +extern int BURNIN; +extern int NSUBS; +extern int NROW; +extern int NCOL; +extern int TOTVOX; +extern int TOTVOXp; +extern int NDEPTH; +extern int NSUBTYPES; +extern int NCOVAR; +extern int NCOV_FIX; +extern int RESTART; +extern int *hostIdx; +extern int *deviceIdx; +extern int *hostIdxSC; +extern int *deviceIdxSC; +float *deviceSpatCoef; +extern float *deviceCovar; +extern float *deviceCov_Fix; +extern float *XXprime; +unsigned char *deviceData; +unsigned int *deviceResid; +extern char **SS; + +//short *deviceData; +float *deviceWM; +float *deviceAlphaMean; +float *deviceFixMean; +float *deviceZ; +float *devicePhi; +float *devicePrb; +float *devicePredict; 
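/* (Aside, not part of the patch: logit_factor and t_df, declared extern
 * above, are set in main.cu according to MODEL (0 = logistic, 1 = probit,
 * 2 = t).  The probit link keeps logit_factor = 1; the logistic link is
 * handled through a scaled-t latent variable with t_df = 8 and
 * logit_factor = 0.634 (the usual t approximation to the logistic); the
 * robust t link uses t_df = 3.  This is presumably why the spatial
 * coefficients are divided by logit_factor before being reported.) */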
+float sqrt2pi = 2.506628274631; +float NCNT; +int MIX = 0; + +int NSIZE; + +#define idx(i,j,k) (i + (NROW+2)*j + (NROW+2)*(NCOL+2)*k) +#define idx2(i,j,k) (i + (NROW)*j + (NROW)*(NCOL)*k) + +#define CUDA_CALL(x) {const cudaError_t a = (x); if (a != cudaSuccess) {printf("\nCUDA Error: %s (err_num=%d) \n",cudaGetErrorString(a),a);cudaDeviceReset();assert(0);}} + +void mcmc(float *covar,float *covar_fix,unsigned char *data,float *WM,unsigned char *msk,unsigned long *seed) +{ + int TOTSAV=0,IDX,PRNtITER,SAVE_ITER,FIRST; + float *Z,*Phi,*alpha,*prb,*predict; + double *Mprb,*MCov,*SCov,*MWM,**Qhat,ED=0; + float beta=0,prior_mean_beta=0,prior_prec_beta=0.000001f; + float *SpatCoefPrec,*SpatCoefMean,*spatCoef; + + float *alphaMean,*alphaPrec;//scarPrec=1; + float *fixMean,*fixPrec;//scarPrec=1; + float prior_alphaPrec_A=3.0f; + float prior_alphaPrec_B=2.0f; +// float prior_carPrec_A=3.0,prior_carPrec_B=2.0; + float Prec = 0.000001f; + SUB *subj; + char *S; + +// float *hostWM; + FILE *out,*out2,*fWM,*fcoef,*fcoefsd,*fBF,*fresid; + + void itoa(int,char *); +// unsigned char *get_neighbors(); + void initializeAlpha(unsigned char *,float *,float *,float *,unsigned long *); + void initializeZ(float *,unsigned char *,unsigned char *,unsigned long *); + void initializePhi(float *,unsigned char *,unsigned char *,float,unsigned long *); + void updateZGPU(unsigned char *,float,unsigned long *); + void updateZ(float *,float *,unsigned char *,unsigned char *,float, float *,float *,float *,unsigned long *); + void updatePhiGPU(float,float *); + void updatePhi(unsigned char *data,float *Z,float *Phi,unsigned char *msk,float beta,float *WM,float *spatCoef,float *covar,unsigned long *seed); + void updateAlpha(float *,float ,float ,float *,unsigned char *,float,float *,float *,float *,int,unsigned long *); + void updateAlphaGPU(float *,float *,float *,float *,unsigned char *,float,float *,float *,float *,unsigned long *); + void updateAlphaPrecGPU(float *,float *,float,float,float,unsigned long *); + void updateAlphaPrec(float *,float *,float,float,float,unsigned char *,unsigned long *); + void updateAlphaMeanGPU(float *,float,float *,float *,float *,float,unsigned long *); + void updatefixMeanGPU(float *,float,float *,float *,float *,float,unsigned long *); + void updateAlphaMean(float *,float *,float *,float,float *,float *,float,unsigned char *,unsigned long *); + void compute_prbGPU(float); + void compute_prb(float *,float *,float *,float,float *,unsigned char *); + void updateBeta(float *,float *,float *,float *,unsigned char *,float,float,float *,float *,unsigned long *); + void updateBetaGPU(float *,float *,float *,float *,float *,float,float,float *,float *,unsigned long *); + void updateSpatCoefGPU(float *,float *,float *,float *,float *,float *,float *,unsigned char *,float,float *,unsigned long *); + void updateSpatCoef(float *,float *,float *,float *,float *,float *,unsigned char *,float,float *,unsigned long *); + void updateSpatCoefMean(float *,float *,float,float,unsigned char *,float **,float *,float *,float *,float,int,unsigned long *); + void updateSpatCoefPrec(float *,float *,unsigned char *,unsigned long *); + void updateSpatCoefPrecGPU(float *,unsigned long *); + void updateSpatCoefPrecGPU_Laplace(float *,unsigned long *); + void save_iter(float,float *,float *,float *,float *,float *,float *,float *,float *,float *); + void restart_iter(float *,float *,float *,float *,float *,float *,float *,float *,float *,float*); + int write_nifti_file(int NROW,int NCOL,int NDEPTH,int NTIME,char 
*hdr_file,char *data_file,MY_DATATYPE *data); + double compute_predictGPU(float,unsigned char *,unsigned char *msk,float *,float *,double **Qhat,int); + double compute_prb_DIC(double *,double *,float *,unsigned char *,unsigned char *,int); + +// void tmp(float *,float *); + fresid = fopen("resid.dat","w"); + + NSIZE = (NROW+2)*(NCOL+2)*(NDEPTH+2); + int RSIZE = NROW*NCOL*NDEPTH; +// printf("%d %d \n",TOTVOX,NSIZE); + + if (GPU) { + CUDA_CALL( cudaMalloc((void **)&deviceData,NSUBS*TOTVOX*sizeof(unsigned char)) ); + // unsigned char *data2; + // data2 = (unsigned char *)calloc(NSUBS*TOTVOX,sizeof(unsigned char)); + // for (int i=0;i 0) + fixMean = (float *)calloc(NCOV_FIX,sizeof(float)); + + alphaPrec = (float *)calloc(NSUBTYPES,sizeof(float)); + for (int i = 0;i 0) { + CUDA_CALL( cudaMalloc((void **)&deviceFixMean,NCOV_FIX*sizeof(float)) ); + CUDA_CALL( cudaMemset(deviceFixMean,0,NCOV_FIX*sizeof(float)) ); + } + } +// else +// alpha = (float *)calloc(TOTVOX*NSUBTYPES,sizeof(float)); + +/* if (!RESTART) { + initializeAlpha(msk,alphaMean,alphaPrec,alpha,seed); + if (GPU) + cudaMemcpy(deviceAlpha,alpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyHostToDevice); + }*/ + +// Malpha = (double *)calloc(NSUBTYPES*RSIZE,sizeof(double)); + MWM = (double *)calloc(RSIZE,sizeof(double)); + Mprb = (double *)calloc(NSUBTYPES*RSIZE,sizeof(double)); + MCov = (double *)calloc(NCOVAR*RSIZE,sizeof(double)); + SCov = (double *)calloc(NCOVAR*RSIZE,sizeof(double)); + + Qhat = (double **)calloc(NSUBS,sizeof(double *)); + for (int i=0;i 0) - updatefixMeanGPU(fixMean,Prec,deviceCovar,deviceZ,deviceSpatCoef,beta,seed); - // updateAlphaMeanGPU(alphaMean,Prec,deviceCovar,deviceZ,deviceSpatCoef,beta,seed); - } - else { - if (!(iter%PRNtITER)) - CUDA_CALL( cudaEventRecord(start, 0) ); - - // for (int tj=0;tj<10;tj++) - updateZ(Z,alpha,data,msk,beta,WM,spatCoef,covar,seed); - if (!(iter%PRNtITER)) { - CUDA_CALL( cudaEventRecord(stop, 0) ); - CUDA_CALL( cudaEventSynchronize(stop) ); - CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); - printf ("Time for the updateZCPU kernel: %f ms\n", time); - } - // CUDA_CALL( cudaEventRecord(start, 0) ); - // for (int tj=0;tj<10;tj++) - updateBeta(&beta,Z,alpha,WM,msk,prior_mean_beta,prior_prec_beta,spatCoef,covar,seed); - // CUDA_CALL( cudaEventRecord(stop, 0) ); - // CUDA_CALL( cudaEventSynchronize(stop) ); - // CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); - // printf ("Time for the updateBetaCPU kernel: %f ms\n", time); - - if (!(iter%PRNtITER)) - CUDA_CALL( cudaEventRecord(start, 0) ); - // for (int tj=0;tj<10;tj++) - updateSpatCoef(covar,spatCoef,SpatCoefPrec,alpha,alphaMean,Z,msk,beta,WM,seed); - if (!(iter%PRNtITER)) { - CUDA_CALL( cudaEventRecord(stop, 0) ); - CUDA_CALL( cudaEventSynchronize(stop) ); - CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); - printf ("Time for the updateSpatCoefCPU kernel: %f ms\n", time); - } - if (!(iter%PRNtITER)) - CUDA_CALL( cudaEventRecord(start, 0) ); - // for (int tj=0;tj<10;tj++) - updateSpatCoefPrec(SpatCoefPrec,spatCoef,msk,seed); - if (!(iter%PRNtITER)) { - CUDA_CALL( cudaEventRecord(stop, 0) ); - CUDA_CALL( cudaEventSynchronize(stop) ); - CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); - printf ("Time for the updateSpatCoefPrecCPU kernel: %f ms\n\n", time); - } - - // CUDA_CALL( cudaEventRecord(start, 0) ); - // for (int tj=0;tj<10;tj++) - // compute_prb(prb,alpha,spatCoef,beta,WM,msk); - // CUDA_CALL( cudaEventRecord(stop, 0) ); - // CUDA_CALL( cudaEventSynchronize(stop) ); - /// CUDA_CALL( 
cudaEventElapsedTime(&time, start, stop) ); - // printf ("Time for the compute_prbCPU kernel: %f ms\n\n", time); - } - -/* if (!(iter%PRNtITER) && iter > 0) { - CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); - int i = 56;//57; - int j = 77;//46; - int k = 19;//72; - IDX = idx(i,j,k); - - for (int ii=NSUBTYPES;ii (double)(spatCoef[4*TOTVOXp+hostIdxSC[IDX]]/logit_factor)) ? - max:(double)(spatCoef[4*TOTVOXp+hostIdxSC[IDX]]/logit_factor); - } - } - } - printf("max = %lf\n",max); */ - - double *V = (double *)calloc(NCOVAR*NCOVAR,sizeof(double)); - for (int ii=0;ii BURNIN) && (!(iter%SAVE_ITER))) { -// if ((iter > 0)) { - // compute probability of lesion - - if (GPU) { - if (TOTSAV == 0) - FIRST = 1; - else - FIRST = 0; - if (!(iter%PRNtITER)) - cudaEventRecord(start, 0); - ED += compute_predictGPU(beta,data,msk,covar,predict,Qhat,FIRST); - if (!(iter%PRNtITER)) { - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&time, start, stop); - printf ("Time to compute_predictGPU: %f ms\n", time); - } - - if (!(iter%PRNtITER)) - cudaEventRecord(start, 0); - compute_prbGPU(beta); - CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); - CUDA_CALL( cudaMemcpy(prb,devicePrb,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); - if (!(iter%PRNtITER)) { - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&time, start, stop); - printf ("Time to compute_prbGPU: %f ms\n\n", time); - } - } - else { - if (!(iter%PRNtITER)) - cudaEventRecord(start, 0); - compute_prb(prb,alpha,spatCoef,beta,WM,msk); - if (!(iter%PRNtITER)) { - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&time, start, stop); - printf ("Time to compute_prbGPU: %f ms\n\n", time); - } - } - - for (int k=1;k 1) { - if (GPU) { - CUDA_CALL( cudaMemcpy(Z,deviceZ,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); - CUDA_CALL( cudaMemcpy(Phi,devicePhi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); -// cudaMemcpy(alpha,deviceAlpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost); - CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); - } - save_iter(beta,fixMean,alphaMean,alphaPrec,SpatCoefMean,SpatCoefPrec,spatCoef,alpha,Z,Phi); - }*/ - } -// CUDA_CALL( cudaEventRecord(stop, 0) ); -// CUDA_CALL( cudaEventSynchronize(stop) ); -// CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); -// printf ("Time for the updateZGPU kernel: %f sec.\n", time/1000.0); - - fclose(fBF); - -// unsigned int *Resid; -// double *ResidMap; -// Resid = (unsigned int *)calloc(NSUBS*TOTVOX,sizeof(unsigned int)); -// ResidMap = (double *)calloc(RSIZE,sizeof(double)); - if (GPU) { - CUDA_CALL( cudaMemcpy(Z,deviceZ,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); -// CUDA_CALL( cudaMemcpy(Phi,devicePhi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); -// cudaMemcpy(alpha,deviceAlpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost); - CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); - -// CUDA_CALL( cudaMemcpy(Resid,deviceResid,NSUBS*TOTVOX*sizeof(unsigned int),cudaMemcpyDeviceToHost) ); - } -// save_iter(beta,fixMean,alphaMean,alphaPrec,SpatCoefMean,SpatCoefPrec,spatCoef,alpha,Z,Phi); -/* for (int isub=0;isub 0.05) { - fprintf(fresid,"%d %d %d %d %lf\n",isub,i,j,k,tmp); - ResidMap[IDX2]++; - } - } - } - } - } - } - free(Resid); - fclose(out); - 
fclose(out2); - fclose(fresid);*/ - - char *RR = (char *)calloc(500,sizeof(char)); - S = (char *)calloc(100,sizeof(char)); -// S = strcpy(S,"ResidMap.nii"); -// int rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,ResidMap); -// free(ResidMap); - int rtn; -// RR = strcpy(RR,"gzip -f "); -// RR = strcat(RR,(const char *)S); -// rtn = system(RR); - - printf("TOTSAV = %d\n",TOTSAV); - - out = fopen("Qhat.dat","w"); - for (int isub=0;isub0) - standCoef[IDX] = MCov[ii*RSIZE+IDX]/sqrt(tmp); - } - } - } - - S = strcpy(S,"spatCoef_"); - S = strcat(S,SS[ii]); - S = strcat(S,".nii"); - int rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,&MCov[ii*RSIZE]); - - RR = strcpy(RR,"gzip -f "); - RR = strcat(RR,(const char *)S); - rtn = system(RR); - - S = strcpy(S,"spatCoef_"); - S = strcat(S,SS[ii]); - S = strcat(S,".Var.nii"); - rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,&SCov[ii*RSIZE]); - - RR = strcpy(RR,"gzip -f "); - RR = strcat(RR,(const char *)S); - rtn = system(RR); - -/* for (int k=0;k 0) + updatefixMeanGPU(fixMean,Prec,deviceCovar,deviceZ,deviceSpatCoef,beta,seed); + // updateAlphaMeanGPU(alphaMean,Prec,deviceCovar,deviceZ,deviceSpatCoef,beta,seed); + } + else { + if (!(iter%PRNtITER)) + CUDA_CALL( cudaEventRecord(start, 0) ); + + // for (int tj=0;tj<10;tj++) + updateZ(Z,alpha,data,msk,beta,WM,spatCoef,covar,seed); + if (!(iter%PRNtITER)) { + CUDA_CALL( cudaEventRecord(stop, 0) ); + CUDA_CALL( cudaEventSynchronize(stop) ); + CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + printf ("Time for the updateZCPU kernel: %f ms\n", time); + } + // CUDA_CALL( cudaEventRecord(start, 0) ); + // for (int tj=0;tj<10;tj++) + updateBeta(&beta,Z,alpha,WM,msk,prior_mean_beta,prior_prec_beta,spatCoef,covar,seed); + // CUDA_CALL( cudaEventRecord(stop, 0) ); + // CUDA_CALL( cudaEventSynchronize(stop) ); + // CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + // printf ("Time for the updateBetaCPU kernel: %f ms\n", time); + + if (!(iter%PRNtITER)) + CUDA_CALL( cudaEventRecord(start, 0) ); + // for (int tj=0;tj<10;tj++) + updateSpatCoef(covar,spatCoef,SpatCoefPrec,alpha,alphaMean,Z,msk,beta,WM,seed); + if (!(iter%PRNtITER)) { + CUDA_CALL( cudaEventRecord(stop, 0) ); + CUDA_CALL( cudaEventSynchronize(stop) ); + CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + printf ("Time for the updateSpatCoefCPU kernel: %f ms\n", time); + } + if (!(iter%PRNtITER)) + CUDA_CALL( cudaEventRecord(start, 0) ); + // for (int tj=0;tj<10;tj++) + updateSpatCoefPrec(SpatCoefPrec,spatCoef,msk,seed); + if (!(iter%PRNtITER)) { + CUDA_CALL( cudaEventRecord(stop, 0) ); + CUDA_CALL( cudaEventSynchronize(stop) ); + CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + printf ("Time for the updateSpatCoefPrecCPU kernel: %f ms\n\n", time); + } + + // CUDA_CALL( cudaEventRecord(start, 0) ); + // for (int tj=0;tj<10;tj++) + // compute_prb(prb,alpha,spatCoef,beta,WM,msk); + // CUDA_CALL( cudaEventRecord(stop, 0) ); + // CUDA_CALL( cudaEventSynchronize(stop) ); + /// CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); + // printf ("Time for the compute_prbCPU kernel: %f ms\n\n", time); + } + +/* if (!(iter%PRNtITER) && iter > 0) { + CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); + int i = 56;//57; + int j = 77;//46; + int k = 19;//72; + IDX = idx(i,j,k); + + for (int ii=NSUBTYPES;ii (double)(spatCoef[4*TOTVOXp+hostIdxSC[IDX]]/logit_factor)) ? 
+ max:(double)(spatCoef[4*TOTVOXp+hostIdxSC[IDX]]/logit_factor); + } + } + } + printf("max = %lf\n",max); */ + + double *V = (double *)calloc(NCOVAR*NCOVAR,sizeof(double)); + for (int ii=0;ii BURNIN) && (!(iter%SAVE_ITER))) { +// if ((iter > 0)) { + // compute probability of lesion + + if (GPU) { + if (TOTSAV == 0) + FIRST = 1; + else + FIRST = 0; + if (!(iter%PRNtITER)) + cudaEventRecord(start, 0); + ED += compute_predictGPU(beta,data,msk,covar,predict,Qhat,FIRST); + if (!(iter%PRNtITER)) { + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); + printf ("Time to compute_predictGPU: %f ms\n", time); + } + + if (!(iter%PRNtITER)) + cudaEventRecord(start, 0); + compute_prbGPU(beta); + CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); + CUDA_CALL( cudaMemcpy(prb,devicePrb,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); + if (!(iter%PRNtITER)) { + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); + printf ("Time to compute_prbGPU: %f ms\n\n", time); + } + } + else { + if (!(iter%PRNtITER)) + cudaEventRecord(start, 0); + compute_prb(prb,alpha,spatCoef,beta,WM,msk); + if (!(iter%PRNtITER)) { + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&time, start, stop); + printf ("Time to compute_prbGPU: %f ms\n\n", time); + } + } + + for (int k=1;k 1) { + if (GPU) { + CUDA_CALL( cudaMemcpy(Z,deviceZ,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); + CUDA_CALL( cudaMemcpy(Phi,devicePhi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); +// cudaMemcpy(alpha,deviceAlpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost); + CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); + } + save_iter(beta,fixMean,alphaMean,alphaPrec,SpatCoefMean,SpatCoefPrec,spatCoef,alpha,Z,Phi); + }*/ + } +// CUDA_CALL( cudaEventRecord(stop, 0) ); +// CUDA_CALL( cudaEventSynchronize(stop) ); +// CUDA_CALL( cudaEventElapsedTime(&time, start, stop) ); +// printf ("Time for the updateZGPU kernel: %f sec.\n", time/1000.0); + + fclose(fBF); + +// unsigned int *Resid; +// double *ResidMap; +// Resid = (unsigned int *)calloc(NSUBS*TOTVOX,sizeof(unsigned int)); +// ResidMap = (double *)calloc(RSIZE,sizeof(double)); + if (GPU) { + CUDA_CALL( cudaMemcpy(Z,deviceZ,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); +// CUDA_CALL( cudaMemcpy(Phi,devicePhi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost) ); +// cudaMemcpy(alpha,deviceAlpha,NSUBTYPES*TOTVOX*sizeof(float),cudaMemcpyDeviceToHost); + CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,NCOVAR*TOTVOXp*sizeof(float),cudaMemcpyDeviceToHost) ); + +// CUDA_CALL( cudaMemcpy(Resid,deviceResid,NSUBS*TOTVOX*sizeof(unsigned int),cudaMemcpyDeviceToHost) ); + } +// save_iter(beta,fixMean,alphaMean,alphaPrec,SpatCoefMean,SpatCoefPrec,spatCoef,alpha,Z,Phi); +/* for (int isub=0;isub 0.05) { + fprintf(fresid,"%d %d %d %d %lf\n",isub,i,j,k,tmp); + ResidMap[IDX2]++; + } + } + } + } + } + } + free(Resid); + fclose(out); + fclose(out2); + fclose(fresid);*/ + + char *RR = (char *)calloc(500,sizeof(char)); + S = (char *)calloc(100,sizeof(char)); +// S = strcpy(S,"ResidMap.nii"); +// int rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,ResidMap); +// free(ResidMap); + int rtn; +// RR = strcpy(RR,"gzip -f "); +// RR = strcat(RR,(const char *)S); +// rtn = system(RR); + + printf("TOTSAV = %d\n",TOTSAV); + + out = fopen("Qhat.dat","w"); + for (int isub=0;isub0) + 
standCoef[IDX] = MCov[ii*RSIZE+IDX]/sqrt(tmp); + } + } + } + + S = strcpy(S,"spatCoef_"); + S = strcat(S,SS[ii]); + S = strcat(S,".nii"); + int rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,&MCov[ii*RSIZE]); + + RR = strcpy(RR,"gzip -f "); + RR = strcat(RR,(const char *)S); + rtn = system(RR); + + S = strcpy(S,"spatCoef_"); + S = strcat(S,SS[ii]); + S = strcat(S,".Var.nii"); + rtn = write_nifti_file(NROW,NCOL,NDEPTH,1,S,S,&SCov[ii*RSIZE]); + + RR = strcpy(RR,"gzip -f "); + RR = strcat(RR,(const char *)S); + rtn = system(RR); + +/* for (int k=0;k Date: Tue, 20 Jan 2015 13:40:55 +0000 Subject: [PATCH 08/13] Added .gitattributes... hopefully to solve LF/CR nightmares --- .gitattributes | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..f96575f --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.txt -crlf From 637d02ae6bc12de54699d5abff1110ad869de4ef Mon Sep 17 00:00:00 2001 From: nicholst Date: Tue, 20 Jan 2015 13:52:23 +0000 Subject: [PATCH 09/13] Edits to main to improve usability --- .gitignore | 1 + bsglmm/main.cu | 6 ------ 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index a75452f..f5d15ff 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .DS_Store .*~ __MACOSX +#* diff --git a/bsglmm/main.cu b/bsglmm/main.cu index ef5dfd6..acf7041 100644 --- a/bsglmm/main.cu +++ b/bsglmm/main.cu @@ -133,11 +133,6 @@ int main (int argc, char * const argv[]) { seed = (unsigned long *)calloc(3,sizeof(unsigned long)); fseed = fopen("seed.dat","r"); if (fseed == NULL){ -<<<<<<< HEAD - printf("seed.dat (with 3 integers) does not exist\n"); - exit(1); - } -======= printf("No seed.dat found; using [ 1 2 3 ]\n"); seed[0]=1L;seed[1]=2L;seed[2]=2L; } else { @@ -149,7 +144,6 @@ int main (int argc, char * const argv[]) { exit(1); } ->>>>>>> 3b77a383744d612b5822d530c68c813f7f08a862 logit_factor = 1.0f; t_df = 1000.0f; From 9c720bf21c777b24917709ad242b0201d5c70010 Mon Sep 17 00:00:00 2001 From: nicholst Date: Tue, 20 Jan 2015 14:02:32 +0000 Subject: [PATCH 10/13] More CRLF nightmares --- .gitattributes | 1 - bsglmm/covarGPU.cu | 3918 ++++++++++++++++++++++---------------------- 2 files changed, 1959 insertions(+), 1960 deletions(-) diff --git a/.gitattributes b/.gitattributes index f96575f..e69de29 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +0,0 @@ -*.txt -crlf diff --git a/bsglmm/covarGPU.cu b/bsglmm/covarGPU.cu index 0b7a52a..9bc4a44 100644 --- a/bsglmm/covarGPU.cu +++ b/bsglmm/covarGPU.cu @@ -1,1959 +1,1959 @@ -/* - * covarGPU.cu - * BinCAR - * - * Created by Timothy Johnson on 5/22/12. - * Copyright 2012 University of Michigan. All rights reserved. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include "binCAR.h" -#include "randgen.h" -#include "cholesky.h" -#include - -cudaError_t err; - -extern float logit_factor; -extern float t_df; -extern int MODEL; -extern int NSUBS; -extern int NROW; -extern int NCOL; -extern int NDEPTH; -extern int TOTVOX; -extern int TOTVOXp; -extern int NCOVAR; -extern int NCOV_FIX; -extern int NSUBTYPES; -//extern int NSIZE; -extern int iter; -extern INDEX *INDX; -extern float *deviceCovar; -extern float *hostCovar; -extern float *deviceCov_Fix; -extern float *hostCov_Fix; -extern float *deviceAlphaMean; -extern float *deviceFixMean; -extern float *deviceZ; -extern float *devicePhi; -extern float *deviceWM; -extern unsigned int *deviceResid; -extern int *deviceIdx; -extern int *deviceIdxSC; -extern float *devicePrb; -extern float *devicePredict; -extern float *XXprime; -extern float *XXprime_Fix; -extern int *hostIdx; -extern int *hostIdxSC; -//extern float *deviceChiSqHist; -//extern int ChiSqHist_N; - -const int NN=256; -const int BPG=32; -extern curandState *devStates; -extern unsigned char *deviceData; -//extern short *deviceData; -extern float *deviceSpatCoef; -__constant__ float dPrec[20*20]; -__constant__ float dtmpvar[20*20]; -__constant__ float dalphaMean[20]; -__constant__ float dalphaPrec[20]; - -const int TPB=192; -const int TPB2=96; - -#define idx(i,j,k) (i + (NROW+2)*j + (NROW+2)*(NCOL+2)*k) - -#define CUDA_CALL(x) {const cudaError_t a = (x); if (a != cudaSuccess) {printf("\nCUDA Error: %s (err_num=%d) \n",cudaGetErrorString(a),a);cudaDeviceReset();assert(0);}} - -__global__ void setup_kernel ( curandState * state, unsigned long long devseed, const int N ) -{ -/* Each thread gets same seed , a different sequence number, no offset */ - - int idx = threadIdx.x + blockIdx.x * blockDim.x; - - while (idx < N) { - curand_init (devseed , idx , 0 , & state [ idx ]) ; - idx += blockDim.x *gridDim.x; - } -} - - -__global__ void reduce(float *in,float *out,const int N) -{ - __shared__ float cache[NN]; - int cacheIndex = threadIdx.x; - int ivox = threadIdx.x + blockIdx.x * blockDim.x; - - float c = 0; - float y,t; - float temp = 0; - while (ivox < N) { - y = in[ivox] - c; - t = temp + y; - c = (t - temp) - y; - temp = t; - ivox += blockDim.x * gridDim.x; - } - - cache[cacheIndex] = temp; - - __syncthreads(); - - int i = (blockDim.x >> 1); - while (i != 0) { - if (cacheIndex < i) - cache[cacheIndex] += cache[cacheIndex + i]; - __syncthreads(); - i = i >> 1; - } - - if (cacheIndex == 0) - out[blockIdx.x] = cache[0]; -} - - -__device__ int cholesky_decompGPU(float *A, int num_col) -{ - int k,i,j; - - for (k=0;k=0;i--) { - temp = 0.0; - for (j=i+1;j NSUBS) - localsize = NSUBS - isub; - - if ((isub+localid) < NSUBS) { - for (int j=0;j NSUBS) - localsize = NSUBS - isub; - - if ((isub+localid) < NSUBS) { - for (int j=0;j NSUBS) - localsize = NSUBS - isub; - - if ((isub+localid) < NSUBS) { - for (int j=0;j>>(deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean,deviceZ, - devicePhi,deviceWM,deviceSpatCoef,beta,INDX[i].deviceNBRS,INDX[i].deviceVox,NSUBS,NROW,NCOL,NSUBTYPES,NCOVAR,NCOV_FIX,INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC); - if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef");exit(0);} - } - else { - 
Spat_Coef_Probit<<>>(deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean,deviceZ,deviceWM,deviceSpatCoef,beta,INDX[i].deviceNBRS,INDX[i].deviceVox,NSUBS,NROW,NCOL,NSUBTYPES,NCOVAR,NCOV_FIX,INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC); - if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef_Probit");exit(0);} - } - - } - - - for (this_one=0;this_one>>(&deviceSpatCoef[this_one*TOTVOXp],d_sum,TOTVOXp); - - h_sum = (float *)calloc(BPG,sizeof(float)); - - cudaMemcpy(h_sum,d_sum,BPG*sizeof(float),cudaMemcpyDeviceToHost); - - cmean = 0; - for (int j=0;j>>(&(deviceSpatCoef[this_one*TOTVOXp]),INDX[i].hostN,INDX[i].deviceVox,cmean,deviceIdxSC); - alphaMean[this_one] = cmean/logit_factor; - } -} - - - -__global__ void betaMGPU(float *dcovar,float *dcovF,float *dZ,float *dspatCoef,float *fix,float *dalpha,float *dWM,const int N,const int TOTVOXp,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int TOTVOX,int * vox,float *dmean,int *dIdx,int *dIdxSC) -{ - __shared__ float cache[NN]; - int cacheIndex = threadIdx.x; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x*gridDim.x; - - float c = 0; - float y,t; - float temp = 0; - float SC[20]; - float mean=0; - int isub,ii,voxel,IDX,IDXSC; - while (idx < N) { - voxel = vox[idx]; - IDX = dIdx[voxel]; - IDXSC = dIdxSC[voxel]; - - for (ii=0;ii> 1); - while (i != 0) { - if (cacheIndex < i) - cache[cacheIndex] += cache[cacheIndex + i]; - - __syncthreads(); - i = i >> 1; - } - - if (cacheIndex == 0) - dmean[blockIdx.x] = cache[0]; -} - - -__global__ void betaVGPU(float *dWM,const int N,int *vox,float *dvar,int *dIdx) -{ - __shared__ float cache[NN]; - int cacheIndex = threadIdx.x; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x*gridDim.x; - - float c = 0; - float y,t; - float temp = 0; - - int voxel,IDX; - while (idx < N) { - voxel = vox[idx]; - IDX = dIdx[voxel]; - y = dWM[IDX]*dWM[IDX] - c; - t = temp + y; - c = (t - temp) - y; - temp = t; - - idx += stride; - } - - cache[cacheIndex] = temp; - - __syncthreads(); - - int i = (blockDim.x >> 1); - while (i != 0) { - if (cacheIndex < i) { - cache[cacheIndex] += cache[cacheIndex + i]; - } - __syncthreads(); - i = i >> 1; - } - - if (cacheIndex == 0) { - dvar[blockIdx.x] = cache[0]; - } - -} - -void updateBetaGPU(float *beta,float *dZ,float *dPhi,float *dalpha,float *dWM,float prior_mean_beta,float prior_prec_beta,float *dspatCoef,float *dcovar,unsigned long *seed) -{ - float *dmean,*dvar,*hmean,*hvar; - float mean,var,tmp; - - hmean = (float *)calloc(BPG,sizeof(float)); - hvar = (float *)calloc(BPG,sizeof(float)); - cudaMalloc((void **)&dmean,BPG*sizeof(float)) ; - cudaMalloc((void **)&dvar,BPG*sizeof(float)) ; - - mean = var = 0; - for (int i=0;i<2;i++) { - betaMGPU<<>>(dcovar,deviceCov_Fix,dZ,dspatCoef,deviceFixMean,deviceAlphaMean,deviceWM,INDX[i].hostN,TOTVOXp,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX,TOTVOX,INDX[i].deviceVox,dmean,deviceIdx,deviceIdxSC); - cudaMemcpy(hmean,dmean,BPG*sizeof(float),cudaMemcpyDeviceToHost); - - for (int j=0;j>>(deviceWM,INDX[i].hostN,INDX[i].deviceVox,dvar,deviceIdx); - cudaMemcpy(hvar,dvar,BPG*sizeof(float),cudaMemcpyDeviceToHost); - - for (int j=0;j> 1); - while (i != 0) { - if (cacheIndex < i) - cache[cacheIndex] += cache[cacheIndex + i]; - - __syncthreads(); - i = i >> 1; - } - - if (cacheIndex == 0) - dmean[blockIdx.x] = cache[0]; -} - -void updateAlphaMeanGPU(float *alphaMean,float Prec,float *dcovar,float *dZ,float 
*dspatCoef,float beta,unsigned long *seed) -{ - double *mean,*V,*tmpmean; - float *hmean,*dmean; - - mean = (double *)calloc(NCOVAR,sizeof(double)); - tmpmean = (double *)calloc(NCOVAR,sizeof(double)); - V = (double *)calloc(NCOVAR*NCOVAR,sizeof(double)); - - for (int i=0;i>>(dcovar,dZ,dspatCoef,deviceWM,beta,INDX[i].hostN,TOTVOXp,NSUBS,NSUBTYPES,NCOVAR,TOTVOX,INDX[i].deviceVox, - dmean,deviceIdx,deviceIdxSC,icovar); - cudaMemcpy(hmean,dmean,BPG*sizeof(float),cudaMemcpyDeviceToHost); - - for (int j=0;j> 1); - while (i != 0) { - if (cacheIndex < i) - cache[cacheIndex] += cache[cacheIndex + i]; - - __syncthreads(); - i = i >> 1; - } - - if (cacheIndex == 0) - dmean[blockIdx.x] = cache[0]; -} - -void updatefixMeanGPU(float *fixMean,float Prec,float *dcovar,float *dZ,float *dspatCoef,float beta,unsigned long *seed) -{ - double *mean,*V,*tmpmean; - float *hmean,*dmean; - - mean = (double *)calloc(NCOV_FIX,sizeof(double)); - tmpmean = (double *)calloc(NCOV_FIX,sizeof(double)); - V = (double *)calloc(NCOV_FIX*NCOV_FIX,sizeof(double)); - - for (int i=0;i>>(deviceCov_Fix,dcovar,dZ,dspatCoef,deviceWM,beta,INDX[i].hostN,TOTVOXp,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX,TOTVOX,INDX[i].deviceVox, - dmean,deviceIdx,deviceIdxSC,icovar); - cudaMemcpy(hmean,dmean,BPG*sizeof(float),cudaMemcpyDeviceToHost); - - for (int j=0;j> 1); - while (i != 0) { - if (cacheIndex < i) - cache[cacheIndex] += cache[cacheIndex + i]; - __syncthreads(); - i = i >> 1; - } - - if (cacheIndex == 0) - dbeta[blockIdx.x] = cache[0]; -} - -void updateSpatCoefPrecGPU_Laplace(float *SpatCoefPrec,unsigned long *seed) -{ - float *hbeta; - double *var; - float *dbeta; - double betaSqr = 2.0;//0.2; - double tau; - - var = (double *)calloc(NCOVAR*NCOVAR,sizeof(double)); - hbeta = (float*)calloc(BPG,sizeof(float)); - cudaMalloc((void **)&dbeta,BPG*sizeof(float)); - cudaMemset(dbeta,0,BPG*sizeof(float)); - - for (int ii=0;ii>>(&(deviceSpatCoef[ii*TOTVOXp]),&(deviceSpatCoef[ii*TOTVOXp]),INDX[i].hostN,NROW,NCOL,INDX[i].deviceVox,dbeta,deviceIdxSC); - if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef_Prec");exit(0);} - CUDA_CALL( cudaMemcpy(hbeta,dbeta,BPG*sizeof(float),cudaMemcpyDeviceToHost) ); - for (int j=0;j>>(&(deviceSpatCoef[ii*TOTVOXp]),&(deviceSpatCoef[jj*TOTVOXp]),INDX[i].hostN,NROW,NCOL,INDX[i].deviceVox,dbeta,deviceIdxSC); - if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef_Prec");exit(0);} - CUDA_CALL( cudaMemcpy(hbeta,dbeta,BPG*sizeof(float),cudaMemcpyDeviceToHost) ); - for (int j=0;ja */ - float x,a_star; - int stop; -// double rnorm(double,double,unsigned long *); -// double kiss(unsigned long *); - - - if (a<0.0f) - { - stop=0; - while (stop==0) - { - x = curand_normal(state); - if( x>a ) stop=1; - else stop=0; - } - } - else - { - a_star=0.5f*(a+sqrtf(a*a+4.0f)); - stop=0; - while (stop==0) - { - x=a-__logf(curand_uniform(state))/a_star; - if( __logf(curand_uniform(state))a+3*__expf(-a*a-1/(a*a))/(a+sqrtf(a*a+4.0f))) - { - stop=0; - while (stop==0) - { - x=truncNormLeft(a,state); - if( x < b ) stop=1; - else stop = 0; - } - } - else - { - boo=0.0f; - if (b<0.0f) - boo=b*b; - if (a>0.0f) - boo=a*a; - - - stop=0; - while (stop==0) - { - x=(b-a)*curand_uniform(state)+a; - boun=boo-x*x; - if( 2.0f*__logf(curand_uniform(state))0.0f && num<0.0f ) || ( sign<0.0f && num>0.0f ) ) - return -num; - else return num; -} - -__device__ float sgammaGPU(float a, curandState *state) { - const float q1 = 0.0416666664f; - const float q2 = 
0.0208333723f; - const float q3 = 0.0079849875f; - const float q4 = 0.0015746717f; - const float q5 = -0.0003349403f; - const float q6 = 0.0003340332f; - const float q7 = 0.0006053049f; - const float q8 = -0.0004701849f; - const float q9 = 0.0001710320f; - const float a1 = 0.333333333f; - const float a2 = -0.249999949f; - const float a3 = 0.199999867f; - const float a4 = -0.166677482f; - const float a5 = 0.142873973f; - const float a6 = -0.124385581f; - const float a7 = 0.110368310f; - const float a8 = 0.112750886f; - const float a9 = 0.104089866f; - const float e1 = 1.000000000f; - const float e2 = 0.499999994f; - const float e3 = 0.166666848f; - const float e4 = 0.041664508f; - const float e5 = 0.008345522f; - const float e6 = 0.001353826f; - const float e7 = 0.000247453f; - float aa = 0.0f; - float aaa = 0.0f; - const float sqrt32 = 5.65685424949238f; - float sgamma,s2,s,d,t,x,u,r,q0,b,si,c,v,q,e,w,p; - - if(a == aa) - goto S2; - if(a < 1.0f) - goto S13; - -//S1: // STEP 1: RECALCULATIONS OF S2,S,D IF A HAS CHANGED - aa = a; - s2 = a-0.5f; - s = sqrtf(s2); - d = sqrt32-12.0f*s; - -S2: // STEP 2: T=STANDARD NORMAL DEVIATE, X=(S,1/2)-NORMAL DEVIATE. IMMEDIATE ACCEPTANCE (I) - t = curand_normal(state); - x = s+0.5f*t; - sgamma = x*x; - if(t >= 0.0f) - return sgamma; - -//S3: // STEP 3: U= 0,1 -UNIFORM SAMPLE. SQUEEZE ACCEPTANCE (S) - u = curand_uniform(state); - if(d*u <= t*t*t) - return sgamma; - -//S4: // STEP 4: RECALCULATIONS OF Q0,B,SI,C IF NECESSARY - if (a != aaa) { - aaa = a; - r = 1.0f/ a; - q0 = r*(q1+r*(q2+r*(q3+r*(q4+r*(q5+r*(q6+r*(q7+r*(q8+r*q9)))))))); - if (a <= 3.686f) { - b = 0.463f+s+0.178f*s2; - si = 1.235f; - c = 0.195f/s-0.079f+0.16f*s; - } - else if (a <= 13.022f) { - b = 1.654f+0.0076f*s2; - si = 1.68f/s+0.275f; - c = .062/s+0.024f; - } - else { - b = 1.77f; - si = 0.75f; - c = 0.1515f/s; - } - } - -//S5: // NO QUOTIENT TEST IF X NOT POSITIVE - if(x <= 0.0f) - goto S8; - -//S6: // CALCULATION OF V AND QUOTIENT Q - v = t/(s+s); - if(fabsf(v) > 0.25f) - q = q0-s*t+0.25f*t*t+(s2+s2)*log1pf(v); - else - q = q0+0.5f*t*t*v*(a1+v*(a2+v*(a3+v*(a4+v*(a5+v*(a6+v*(a7+v*(a8+v*a9)))))))); - -//S7: // QUOTIENT ACCEPTANCE (Q) - if(log1pf(-u) <= q) return sgamma; - -S8: // E=STANDARD EXPONENTIAL DEVIATE U= 0,1 -UNIFORM DEVIATE T=(B,SI)-DOUBLE EXPONENTIAL (LAPLACE) SAMPLE - e = rexpGPU(1.0f,state); - u = curand_uniform(state); - u += (u-1.0f); - t = b+fsignfGPU(si*e,u); - -//S9: // REJECTION IF T .LT. 
TAU(1) = -.71874483771719 - if(t <= -0.71874483771719f) - goto S8; - -//S10: // CALCULATION OF V AND QUOTIENT Q - v = t/(s+s); - if(fabsf(v) > 0.25f) - q = q0-s*t+0.25f*t*t+(s2+s2)*log1pf(v); - else - q = q0+0.5f*t*t*v*(a1+v*(a2+v*(a3+v*(a4+v*(a5+v*(a6+v*(a7+v*(a8+v*a9)))))))); - -//S11: // HAT ACCEPTANCE (H) (IF Q NOT POSITIVE GO TO STEP 8) - if (q <= 0.5f) - w = q*(e1+q*(e2+q*(e3+q*(e4+q*(e5+q*(e6+q*e7)))))); - else - w = expf(q)-1.0f; - if(q <= 0.0f || c*fabs(u) > w*expf(e-0.5f*t*t)) - goto S8; -//S12: - x = s+0.5f*t; - sgamma = x*x; - return sgamma; - -S13: // ALTERNATE METHOD FOR PARAMETERS A BELOW 1 (.3678794=EXP(-1.)) - aa = 0.0f; - b = 1.0f+0.3678794f*a; - -S14: - p = b*curand_uniform(state); - if(p >= 1.0f) - goto S15; - sgamma = expf(logf(p)/ a); - if(rexpGPU(1.0f,state) < sgamma) - goto S14; - return sgamma; - -S15: - sgamma = -logf((b-p)/ a); - if(rexpGPU(1.0f,state) < (1.0f-a)*logf(sgamma)) - goto S14; - return sgamma; - -} - - -__global__ void Z_GPU1(float *dZ,unsigned char *data,float *covar,float *covarF,float *fix,float *alpha,float *WM,float *SpatCoef,const float beta,int *vox,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int N,const int TOTVOXp,const int TOTVOX,curandState *state,int *dIdx,int *dIdxSC,const float sqrt2,unsigned int *dR) -{ - extern __shared__ float shared[]; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int localsize = blockDim.x; - int localid = threadIdx.x; - - float mean[2000]; - float tmp=0; - float SC[20]; - int voxel=0,IDX=0,IDXSC=0; - curandState localState; - - if (idx < N) { - voxel = vox[idx]; - IDX = dIdx[voxel]; - IDXSC = dIdxSC[voxel]; - - tmp = beta*WM[IDX]; - - for (int ii=0;ii NSUBS) - localsize = NSUBS - isub; - - if ((isub+localid) < NSUBS) { - for (int j=0;j 4.0f); - } - state[idx] = localState; - } -} - -__global__ void Z_GPU2(float *dZ,unsigned char *data,float *covar,float *covarF,float *fix,float *alpha,float *WM,float *SpatCoef,const float beta,int *vox,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int N,const int TOTVOXp,const int TOTVOX,curandState *state,int *dIdx,int *dIdxSC,const float sqrt2,float *dPhi,unsigned int *dR,const float alp,const float df) -{ - extern __shared__ float shared[]; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int localsize = blockDim.x; - int localid = threadIdx.x; - - float a = alp; - float mean[2000]; - float tmp=0; - float SC[20]; - int voxel=0,IDX=0,IDXSC=0; - curandState localState; - - if (idx < N) { - voxel = vox[idx]; - IDX = dIdx[voxel]; - IDXSC = dIdxSC[voxel]; - - tmp = beta*WM[IDX]; - - for (int ii=0;ii NSUBS) - localsize = NSUBS - isub; - - if ((isub+localid) < NSUBS) { - for (int j=0;j 4.0f); - - b = Z - M; - b = (df + b*b)/2.0f; - P = sgammaGPU(a,&localState)/b; - - dZ[isub*TOTVOX+IDX] = Z; - dPhi[isub*TOTVOX+IDX] = P; - } - state[idx] = localState; - } -} - - -__global__ void Phi_GPU(float *dZ,unsigned char *data,float *covar,float *covarF,float *fix,float *alpha,float *WM,float *SpatCoef,const float beta,int *vox,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int N,const int TOTVOXp,const int TOTVOX,curandState *state,int *dIdx,int *dIdxSC,const float sqrt2,float *dPhi,const float alp,const float df) -{ - extern __shared__ float shared[]; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int localsize = blockDim.x; - int localid = threadIdx.x; - - const float a = alp; - float beta2[2000]; - float tmp=0; - float SC[20]; - int voxel=0,IDX=0,IDXSC=0; - curandState localState; 
- - if (idx < N) { - voxel = vox[idx]; - IDX = dIdx[voxel]; - IDXSC = dIdxSC[voxel]; - - tmp = beta*WM[IDX]; - - for (int ii=0;ii NSUBS) - localsize = NSUBS - isub; - - if ((isub+localid) < NSUBS) { - for (int j=0;j N) ? INDX[i].hostN:N; - - if (MODEL != 1) { - - for (int i=0;i<2;i++) { - int thr_per_block = TPB2; - int shared_memory = (NCOVAR * thr_per_block) * sizeof(float); - int blck_per_grid = (INDX[i].hostN+thr_per_block-1)/thr_per_block; - - Phi_GPU<<>>(deviceZ,deviceData,deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean, - deviceWM,deviceSpatCoef,beta,INDX[i].deviceVox,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX, - INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC,sqrt2,devicePhi,alpha,t_df); - if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Phi_GPU");exit(0);} - } - - } -} - -void updatePhi(unsigned char *data,float *Z,float *Phi,unsigned char *msk,float beta,float *WM,float *spatCoef,float *covar,unsigned long *seed) -{ - double alpha = ((double)t_df + 1)/2; - unsigned char d; - double beta2,mx= -10,mZ=-10,mtmp,mZ2=-10; - - CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,TOTVOXp*NCOVAR*sizeof(float),cudaMemcpyDeviceToHost) ); - CUDA_CALL( cudaMemcpy(Z,deviceZ,TOTVOX*NSUBS*sizeof(float),cudaMemcpyDeviceToHost) ); - for (int i=0;i<2;i++) { - for (int j=0;jfabs(Z[isub*TOTVOX+IDX])) ? mZ2:fabs(Z[isub*TOTVOX+IDX]); - beta2 = ((double)t_df + beta2*beta2)/2.0; - Phi[isub*TOTVOX+IDX] = (float)rgamma(alpha,beta2,seed); - // Phi[isub*TOTVOX+IDX] = (float)rgamma(0.5*(double)t_df,0.5*(double)t_df,seed); - // printf("%lf\n",Phi[isub*TOTVOX+IDX]); - } - } - } - CUDA_CALL( cudaMemcpy(devicePhi,Phi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyHostToDevice) ); - printf("beta2_max = %lf %lf %lf\t %lf %d\n",mx,mZ,mtmp,mZ2,(int)d);fflush(NULL); -} - -void updateZGPU(unsigned char *data,float beta,unsigned long *seed) -{ - float alpha = (t_df + 1)/2; - float sqrt2 = sqrtf(2); -// float *rchisq,*drchisq; -// float dfdiv2 = t_df/2; - int N=0; - for (int i=0;i<2;i++) - N = (INDX[i].hostN > N) ? 
INDX[i].hostN:N; - - if (MODEL != 1) { -// rchisq = (float *)malloc(N*sizeof(float)); -// cudaMalloc((void **)&drchisq,N*sizeof(float)); - - for (int i=0;i<2;i++) { -// for (int j=0;j>>(deviceZ,deviceData,deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean, - deviceWM,deviceSpatCoef,beta,INDX[i].deviceVox,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX, - INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC,sqrt2,devicePhi,deviceResid,alpha,t_df); - if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Z_GPU2");exit(0);} - } -// free(rchisq); -// cudaFree(drchisq); - } - else { - - for (int i=0;i<2;i++) { - int thr_per_block = TPB2; - int shared_memory = (NCOVAR * thr_per_block) * sizeof(float); - int blck_per_grid = (INDX[i].hostN+thr_per_block-1)/thr_per_block; - - Z_GPU1<<>>(deviceZ,deviceData,deviceCovar,deviceCov_Fix,deviceFixMean, - deviceAlphaMean,deviceWM,deviceSpatCoef,beta,INDX[i].deviceVox,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX, - INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC,sqrt2,deviceResid); - if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Z_GPU1");exit(0);} - - } - } -} - - -__device__ inline float dProbSDNormGPU(const float x) -//Calculate the cumulative probability of standard normal distribution; -{ - const float b1 = 0.319381530f; - const float b2 = -0.356563782f; - const float b3 = 1.781477937f; - const float b4 = -1.821255978f; - const float b5 = 1.330274429f; - const float p = 0.2316419f; - const float c = 0.39894228f; - float t,sgnx,out,y; - - sgnx = copysignf(1.0f,x); - y = sgnx*x; - - t = 1.0f / ( 1.0f + p * y ); - - out = ((x >= 0.0f) - sgnx*c * expf( -y * y / 2.0f ) * t * - ( t *( t * ( t * ( t * b5 + b4 ) + b3 ) + b2 ) + b1 )); - - return out; -} - -__global__ void ProbLogitGPU(float *prb,float *alpha,float *SpatCoef,float beta,float *WM,int *dIdx,int *dIdxSC,int *vox,const int TOTVOX,const int TOTVOXp,const int NSUBTYPES,const int N,const float logit_factor) -{ - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x*gridDim.x; - - float x; - float bwhtmat=0; - int voxel,ii,IDX,IDXSC; - - while (idx < N) { - voxel = vox[idx]; - IDX = dIdx[voxel]; - IDXSC = dIdxSC[voxel]; - - bwhtmat = beta*WM[IDX]; - - for (ii=0;ii>>(devicePrb,deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp,NSUBTYPES,INDX[i].hostN,logit_factor); - } - else { - for (int i=0;i<2;i++) - ProbSDNormGPU<<<(INDX[i].hostN+511)/512, 512 >>>(devicePrb,deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp,NSUBTYPES,INDX[i].hostN); - } -} - -__global__ void reduce_for_Mpredict(float *in,float *out,unsigned char *data,const int N) -{ - __shared__ float cache[NN]; - int cacheIndex = threadIdx.x; - int ivox = threadIdx.x + blockIdx.x * blockDim.x; - float c = 0; - float y,t; - float temp = 0; - - while (ivox < N) { - int tmp = (int)data[ivox]; - float inv = in[ivox]; - y = tmp*logf(inv + 1E-35f) + (1 - tmp)*logf(1.0f - inv + 1E-35f) - c; - t = temp + y; - c = (t - temp) - y; - temp = t; - ivox += blockDim.x * gridDim.x; - } - - cache[cacheIndex] = temp; - - __syncthreads(); - - int i = (blockDim.x >> 1); - while (i != 0) { - if (cacheIndex < i) - cache[cacheIndex] += cache[cacheIndex + i]; - __syncthreads(); - i = i >> 1; - } - - if (cacheIndex == 0) - out[blockIdx.x] = cache[0]; -} - -double compute_predictGPU(float beta,unsigned char *data,unsigned char *msk,float *covar,float 
*predict,double **Qhat,int first) -{ - double *Mpredict,DIC; - float *hf_sum,*df_sum; - - Mpredict = (double *)calloc(NSUBTYPES,sizeof(double)); - - CUDA_CALL( cudaMalloc((void **)&df_sum,BPG*sizeof(float)) ); - hf_sum = (float *)calloc(BPG,sizeof(float)); - - DIC = 0; - for (int isub=0;isub>>(deviceCovar,deviceCov_Fix,devicePredict,deviceFixMean, - deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp, - NSUBTYPES,NCOVAR,NSUBS,NCOV_FIX,isub,INDX[i].hostN,logit_factor); - } - } - else { - for (int i=0;i<2;i++) { - ProbSDNormGPU_prediction<<<(INDX[i].hostN+511)/512, 512 >>>(deviceCovar,deviceCov_Fix,devicePredict,deviceFixMean, - deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp, - NSUBTYPES,NCOVAR,NSUBS,NCOV_FIX,isub,INDX[i].hostN); - } - } - - // Compute Bayes Factor -/* reduce<<>>(devicePredict,df_sum,NSUBTYPES*TOTVOX); - - cudaMemcpy(hf_sum,df_sum,BPG*sizeof(float),cudaMemcpyDeviceToHost); - - for (int j=0;j>>(&devicePredict[ii*TOTVOX],df_sum,&deviceData[isub*TOTVOX],TOTVOX); - CUDA_CALL( cudaMemcpy(hf_sum,df_sum,BPG*sizeof(float),cudaMemcpyDeviceToHost) ); - - for (int j=0;j Qhat[isub][i]) ? (Mpredict[i] - Mpredict[subtype]) : Qhat[isub][i]; - Qhat[isub][i] = log(exp(Qhat[isub][i] - max) + exp((Mpredict[i] - Mpredict[subtype]) - max)) + max; - } - } - else { - double max = -DBL_MAX + 0.5; - for (int i=0;i Qhat[isub][i]) ? (Mpredict[i] - Mpredict[subtype]) : Qhat[isub][i]; - Qhat[isub][i] = log(exp(Qhat[isub][i] - max) + exp((Mpredict[i] - Mpredict[subtype]) - max)) + max; - } - } - } - - free(Mpredict); - CUDA_CALL( cudaFree(df_sum) ); - free(hf_sum); - - return(DIC); -} - - +/* + * covarGPU.cu + * BinCAR + * + * Created by Timothy Johnson on 5/22/12. + * Copyright 2012 University of Michigan. All rights reserved. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include "binCAR.h" +#include "randgen.h" +#include "cholesky.h" +#include + +cudaError_t err; + +extern float logit_factor; +extern float t_df; +extern int MODEL; +extern int NSUBS; +extern int NROW; +extern int NCOL; +extern int NDEPTH; +extern int TOTVOX; +extern int TOTVOXp; +extern int NCOVAR; +extern int NCOV_FIX; +extern int NSUBTYPES; +//extern int NSIZE; +extern int iter; +extern INDEX *INDX; +extern float *deviceCovar; +extern float *hostCovar; +extern float *deviceCov_Fix; +extern float *hostCov_Fix; +extern float *deviceAlphaMean; +extern float *deviceFixMean; +extern float *deviceZ; +extern float *devicePhi; +extern float *deviceWM; +extern unsigned int *deviceResid; +extern int *deviceIdx; +extern int *deviceIdxSC; +extern float *devicePrb; +extern float *devicePredict; +extern float *XXprime; +extern float *XXprime_Fix; +extern int *hostIdx; +extern int *hostIdxSC; +//extern float *deviceChiSqHist; +//extern int ChiSqHist_N; + +const int NN=256; +const int BPG=32; +extern curandState *devStates; +extern unsigned char *deviceData; +//extern short *deviceData; +extern float *deviceSpatCoef; +__constant__ float dPrec[20*20]; +__constant__ float dtmpvar[20*20]; +__constant__ float dalphaMean[20]; +__constant__ float dalphaPrec[20]; + +const int TPB=192; +const int TPB2=96; + +#define idx(i,j,k) (i + (NROW+2)*j + (NROW+2)*(NCOL+2)*k) + +#define CUDA_CALL(x) {const cudaError_t a = (x); if (a != cudaSuccess) {printf("\nCUDA Error: %s (err_num=%d) \n",cudaGetErrorString(a),a);cudaDeviceReset();assert(0);}} + +__global__ void setup_kernel ( curandState * state, unsigned long long devseed, const int N ) +{ +/* Each thread gets same seed , a different sequence number, no offset */ + + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + while (idx < N) { + curand_init (devseed , idx , 0 , & state [ idx ]) ; + idx += blockDim.x *gridDim.x; + } +} + + +__global__ void reduce(float *in,float *out,const int N) +{ + __shared__ float cache[NN]; + int cacheIndex = threadIdx.x; + int ivox = threadIdx.x + blockIdx.x * blockDim.x; + + float c = 0; + float y,t; + float temp = 0; + while (ivox < N) { + y = in[ivox] - c; + t = temp + y; + c = (t - temp) - y; + temp = t; + ivox += blockDim.x * gridDim.x; + } + + cache[cacheIndex] = temp; + + __syncthreads(); + + int i = (blockDim.x >> 1); + while (i != 0) { + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + __syncthreads(); + i = i >> 1; + } + + if (cacheIndex == 0) + out[blockIdx.x] = cache[0]; +} + + +__device__ int cholesky_decompGPU(float *A, int num_col) +{ + int k,i,j; + + for (k=0;k=0;i--) { + temp = 0.0; + for (j=i+1;j NSUBS) + localsize = NSUBS - isub; + + if ((isub+localid) < NSUBS) { + for (int j=0;j NSUBS) + localsize = NSUBS - isub; + + if ((isub+localid) < NSUBS) { + for (int j=0;j NSUBS) + localsize = NSUBS - isub; + + if ((isub+localid) < NSUBS) { + for (int j=0;j>>(deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean,deviceZ, + devicePhi,deviceWM,deviceSpatCoef,beta,INDX[i].deviceNBRS,INDX[i].deviceVox,NSUBS,NROW,NCOL,NSUBTYPES,NCOVAR,NCOV_FIX,INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC); + if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef");exit(0);} + } + else { + 
Spat_Coef_Probit<<>>(deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean,deviceZ,deviceWM,deviceSpatCoef,beta,INDX[i].deviceNBRS,INDX[i].deviceVox,NSUBS,NROW,NCOL,NSUBTYPES,NCOVAR,NCOV_FIX,INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC); + if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef_Probit");exit(0);} + } + + } + + + for (this_one=0;this_one>>(&deviceSpatCoef[this_one*TOTVOXp],d_sum,TOTVOXp); + + h_sum = (float *)calloc(BPG,sizeof(float)); + + cudaMemcpy(h_sum,d_sum,BPG*sizeof(float),cudaMemcpyDeviceToHost); + + cmean = 0; + for (int j=0;j>>(&(deviceSpatCoef[this_one*TOTVOXp]),INDX[i].hostN,INDX[i].deviceVox,cmean,deviceIdxSC); + alphaMean[this_one] = cmean/logit_factor; + } +} + + + +__global__ void betaMGPU(float *dcovar,float *dcovF,float *dZ,float *dspatCoef,float *fix,float *dalpha,float *dWM,const int N,const int TOTVOXp,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int TOTVOX,int * vox,float *dmean,int *dIdx,int *dIdxSC) +{ + __shared__ float cache[NN]; + int cacheIndex = threadIdx.x; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x*gridDim.x; + + float c = 0; + float y,t; + float temp = 0; + float SC[20]; + float mean=0; + int isub,ii,voxel,IDX,IDXSC; + while (idx < N) { + voxel = vox[idx]; + IDX = dIdx[voxel]; + IDXSC = dIdxSC[voxel]; + + for (ii=0;ii> 1); + while (i != 0) { + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + + __syncthreads(); + i = i >> 1; + } + + if (cacheIndex == 0) + dmean[blockIdx.x] = cache[0]; +} + + +__global__ void betaVGPU(float *dWM,const int N,int *vox,float *dvar,int *dIdx) +{ + __shared__ float cache[NN]; + int cacheIndex = threadIdx.x; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x*gridDim.x; + + float c = 0; + float y,t; + float temp = 0; + + int voxel,IDX; + while (idx < N) { + voxel = vox[idx]; + IDX = dIdx[voxel]; + y = dWM[IDX]*dWM[IDX] - c; + t = temp + y; + c = (t - temp) - y; + temp = t; + + idx += stride; + } + + cache[cacheIndex] = temp; + + __syncthreads(); + + int i = (blockDim.x >> 1); + while (i != 0) { + if (cacheIndex < i) { + cache[cacheIndex] += cache[cacheIndex + i]; + } + __syncthreads(); + i = i >> 1; + } + + if (cacheIndex == 0) { + dvar[blockIdx.x] = cache[0]; + } + +} + +void updateBetaGPU(float *beta,float *dZ,float *dPhi,float *dalpha,float *dWM,float prior_mean_beta,float prior_prec_beta,float *dspatCoef,float *dcovar,unsigned long *seed) +{ + float *dmean,*dvar,*hmean,*hvar; + float mean,var,tmp; + + hmean = (float *)calloc(BPG,sizeof(float)); + hvar = (float *)calloc(BPG,sizeof(float)); + cudaMalloc((void **)&dmean,BPG*sizeof(float)) ; + cudaMalloc((void **)&dvar,BPG*sizeof(float)) ; + + mean = var = 0; + for (int i=0;i<2;i++) { + betaMGPU<<>>(dcovar,deviceCov_Fix,dZ,dspatCoef,deviceFixMean,deviceAlphaMean,deviceWM,INDX[i].hostN,TOTVOXp,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX,TOTVOX,INDX[i].deviceVox,dmean,deviceIdx,deviceIdxSC); + cudaMemcpy(hmean,dmean,BPG*sizeof(float),cudaMemcpyDeviceToHost); + + for (int j=0;j>>(deviceWM,INDX[i].hostN,INDX[i].deviceVox,dvar,deviceIdx); + cudaMemcpy(hvar,dvar,BPG*sizeof(float),cudaMemcpyDeviceToHost); + + for (int j=0;j> 1); + while (i != 0) { + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + + __syncthreads(); + i = i >> 1; + } + + if (cacheIndex == 0) + dmean[blockIdx.x] = cache[0]; +} + +void updateAlphaMeanGPU(float *alphaMean,float Prec,float *dcovar,float *dZ,float 
*dspatCoef,float beta,unsigned long *seed) +{ + double *mean,*V,*tmpmean; + float *hmean,*dmean; + + mean = (double *)calloc(NCOVAR,sizeof(double)); + tmpmean = (double *)calloc(NCOVAR,sizeof(double)); + V = (double *)calloc(NCOVAR*NCOVAR,sizeof(double)); + + for (int i=0;i>>(dcovar,dZ,dspatCoef,deviceWM,beta,INDX[i].hostN,TOTVOXp,NSUBS,NSUBTYPES,NCOVAR,TOTVOX,INDX[i].deviceVox, + dmean,deviceIdx,deviceIdxSC,icovar); + cudaMemcpy(hmean,dmean,BPG*sizeof(float),cudaMemcpyDeviceToHost); + + for (int j=0;j> 1); + while (i != 0) { + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + + __syncthreads(); + i = i >> 1; + } + + if (cacheIndex == 0) + dmean[blockIdx.x] = cache[0]; +} + +void updatefixMeanGPU(float *fixMean,float Prec,float *dcovar,float *dZ,float *dspatCoef,float beta,unsigned long *seed) +{ + double *mean,*V,*tmpmean; + float *hmean,*dmean; + + mean = (double *)calloc(NCOV_FIX,sizeof(double)); + tmpmean = (double *)calloc(NCOV_FIX,sizeof(double)); + V = (double *)calloc(NCOV_FIX*NCOV_FIX,sizeof(double)); + + for (int i=0;i>>(deviceCov_Fix,dcovar,dZ,dspatCoef,deviceWM,beta,INDX[i].hostN,TOTVOXp,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX,TOTVOX,INDX[i].deviceVox, + dmean,deviceIdx,deviceIdxSC,icovar); + cudaMemcpy(hmean,dmean,BPG*sizeof(float),cudaMemcpyDeviceToHost); + + for (int j=0;j> 1); + while (i != 0) { + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + __syncthreads(); + i = i >> 1; + } + + if (cacheIndex == 0) + dbeta[blockIdx.x] = cache[0]; +} + +void updateSpatCoefPrecGPU_Laplace(float *SpatCoefPrec,unsigned long *seed) +{ + float *hbeta; + double *var; + float *dbeta; + double betaSqr = 2.0;//0.2; + double tau; + + var = (double *)calloc(NCOVAR*NCOVAR,sizeof(double)); + hbeta = (float*)calloc(BPG,sizeof(float)); + cudaMalloc((void **)&dbeta,BPG*sizeof(float)); + cudaMemset(dbeta,0,BPG*sizeof(float)); + + for (int ii=0;ii>>(&(deviceSpatCoef[ii*TOTVOXp]),&(deviceSpatCoef[ii*TOTVOXp]),INDX[i].hostN,NROW,NCOL,INDX[i].deviceVox,dbeta,deviceIdxSC); + if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef_Prec");exit(0);} + CUDA_CALL( cudaMemcpy(hbeta,dbeta,BPG*sizeof(float),cudaMemcpyDeviceToHost) ); + for (int j=0;j>>(&(deviceSpatCoef[ii*TOTVOXp]),&(deviceSpatCoef[jj*TOTVOXp]),INDX[i].hostN,NROW,NCOL,INDX[i].deviceVox,dbeta,deviceIdxSC); + if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Spat_Coef_Prec");exit(0);} + CUDA_CALL( cudaMemcpy(hbeta,dbeta,BPG*sizeof(float),cudaMemcpyDeviceToHost) ); + for (int j=0;ja */ + float x,a_star; + int stop; +// double rnorm(double,double,unsigned long *); +// double kiss(unsigned long *); + + + if (a<0.0f) + { + stop=0; + while (stop==0) + { + x = curand_normal(state); + if( x>a ) stop=1; + else stop=0; + } + } + else + { + a_star=0.5f*(a+sqrtf(a*a+4.0f)); + stop=0; + while (stop==0) + { + x=a-__logf(curand_uniform(state))/a_star; + if( __logf(curand_uniform(state))a+3*__expf(-a*a-1/(a*a))/(a+sqrtf(a*a+4.0f))) + { + stop=0; + while (stop==0) + { + x=truncNormLeft(a,state); + if( x < b ) stop=1; + else stop = 0; + } + } + else + { + boo=0.0f; + if (b<0.0f) + boo=b*b; + if (a>0.0f) + boo=a*a; + + + stop=0; + while (stop==0) + { + x=(b-a)*curand_uniform(state)+a; + boun=boo-x*x; + if( 2.0f*__logf(curand_uniform(state))0.0f && num<0.0f ) || ( sign<0.0f && num>0.0f ) ) + return -num; + else return num; +} + +__device__ float sgammaGPU(float a, curandState *state) { + const float q1 = 0.0416666664f; + const float q2 = 
0.0208333723f; + const float q3 = 0.0079849875f; + const float q4 = 0.0015746717f; + const float q5 = -0.0003349403f; + const float q6 = 0.0003340332f; + const float q7 = 0.0006053049f; + const float q8 = -0.0004701849f; + const float q9 = 0.0001710320f; + const float a1 = 0.333333333f; + const float a2 = -0.249999949f; + const float a3 = 0.199999867f; + const float a4 = -0.166677482f; + const float a5 = 0.142873973f; + const float a6 = -0.124385581f; + const float a7 = 0.110368310f; + const float a8 = 0.112750886f; + const float a9 = 0.104089866f; + const float e1 = 1.000000000f; + const float e2 = 0.499999994f; + const float e3 = 0.166666848f; + const float e4 = 0.041664508f; + const float e5 = 0.008345522f; + const float e6 = 0.001353826f; + const float e7 = 0.000247453f; + float aa = 0.0f; + float aaa = 0.0f; + const float sqrt32 = 5.65685424949238f; + float sgamma,s2,s,d,t,x,u,r,q0,b,si,c,v,q,e,w,p; + + if(a == aa) + goto S2; + if(a < 1.0f) + goto S13; + +//S1: // STEP 1: RECALCULATIONS OF S2,S,D IF A HAS CHANGED + aa = a; + s2 = a-0.5f; + s = sqrtf(s2); + d = sqrt32-12.0f*s; + +S2: // STEP 2: T=STANDARD NORMAL DEVIATE, X=(S,1/2)-NORMAL DEVIATE. IMMEDIATE ACCEPTANCE (I) + t = curand_normal(state); + x = s+0.5f*t; + sgamma = x*x; + if(t >= 0.0f) + return sgamma; + +//S3: // STEP 3: U= 0,1 -UNIFORM SAMPLE. SQUEEZE ACCEPTANCE (S) + u = curand_uniform(state); + if(d*u <= t*t*t) + return sgamma; + +//S4: // STEP 4: RECALCULATIONS OF Q0,B,SI,C IF NECESSARY + if (a != aaa) { + aaa = a; + r = 1.0f/ a; + q0 = r*(q1+r*(q2+r*(q3+r*(q4+r*(q5+r*(q6+r*(q7+r*(q8+r*q9)))))))); + if (a <= 3.686f) { + b = 0.463f+s+0.178f*s2; + si = 1.235f; + c = 0.195f/s-0.079f+0.16f*s; + } + else if (a <= 13.022f) { + b = 1.654f+0.0076f*s2; + si = 1.68f/s+0.275f; + c = .062/s+0.024f; + } + else { + b = 1.77f; + si = 0.75f; + c = 0.1515f/s; + } + } + +//S5: // NO QUOTIENT TEST IF X NOT POSITIVE + if(x <= 0.0f) + goto S8; + +//S6: // CALCULATION OF V AND QUOTIENT Q + v = t/(s+s); + if(fabsf(v) > 0.25f) + q = q0-s*t+0.25f*t*t+(s2+s2)*log1pf(v); + else + q = q0+0.5f*t*t*v*(a1+v*(a2+v*(a3+v*(a4+v*(a5+v*(a6+v*(a7+v*(a8+v*a9)))))))); + +//S7: // QUOTIENT ACCEPTANCE (Q) + if(log1pf(-u) <= q) return sgamma; + +S8: // E=STANDARD EXPONENTIAL DEVIATE U= 0,1 -UNIFORM DEVIATE T=(B,SI)-DOUBLE EXPONENTIAL (LAPLACE) SAMPLE + e = rexpGPU(1.0f,state); + u = curand_uniform(state); + u += (u-1.0f); + t = b+fsignfGPU(si*e,u); + +//S9: // REJECTION IF T .LT. 
TAU(1) = -.71874483771719 + if(t <= -0.71874483771719f) + goto S8; + +//S10: // CALCULATION OF V AND QUOTIENT Q + v = t/(s+s); + if(fabsf(v) > 0.25f) + q = q0-s*t+0.25f*t*t+(s2+s2)*log1pf(v); + else + q = q0+0.5f*t*t*v*(a1+v*(a2+v*(a3+v*(a4+v*(a5+v*(a6+v*(a7+v*(a8+v*a9)))))))); + +//S11: // HAT ACCEPTANCE (H) (IF Q NOT POSITIVE GO TO STEP 8) + if (q <= 0.5f) + w = q*(e1+q*(e2+q*(e3+q*(e4+q*(e5+q*(e6+q*e7)))))); + else + w = expf(q)-1.0f; + if(q <= 0.0f || c*fabs(u) > w*expf(e-0.5f*t*t)) + goto S8; +//S12: + x = s+0.5f*t; + sgamma = x*x; + return sgamma; + +S13: // ALTERNATE METHOD FOR PARAMETERS A BELOW 1 (.3678794=EXP(-1.)) + aa = 0.0f; + b = 1.0f+0.3678794f*a; + +S14: + p = b*curand_uniform(state); + if(p >= 1.0f) + goto S15; + sgamma = expf(logf(p)/ a); + if(rexpGPU(1.0f,state) < sgamma) + goto S14; + return sgamma; + +S15: + sgamma = -logf((b-p)/ a); + if(rexpGPU(1.0f,state) < (1.0f-a)*logf(sgamma)) + goto S14; + return sgamma; + +} + + +__global__ void Z_GPU1(float *dZ,unsigned char *data,float *covar,float *covarF,float *fix,float *alpha,float *WM,float *SpatCoef,const float beta,int *vox,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int N,const int TOTVOXp,const int TOTVOX,curandState *state,int *dIdx,int *dIdxSC,const float sqrt2,unsigned int *dR) +{ + extern __shared__ float shared[]; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int localsize = blockDim.x; + int localid = threadIdx.x; + + float mean[2000]; + float tmp=0; + float SC[20]; + int voxel=0,IDX=0,IDXSC=0; + curandState localState; + + if (idx < N) { + voxel = vox[idx]; + IDX = dIdx[voxel]; + IDXSC = dIdxSC[voxel]; + + tmp = beta*WM[IDX]; + + for (int ii=0;ii NSUBS) + localsize = NSUBS - isub; + + if ((isub+localid) < NSUBS) { + for (int j=0;j 4.0f); + } + state[idx] = localState; + } +} + +__global__ void Z_GPU2(float *dZ,unsigned char *data,float *covar,float *covarF,float *fix,float *alpha,float *WM,float *SpatCoef,const float beta,int *vox,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int N,const int TOTVOXp,const int TOTVOX,curandState *state,int *dIdx,int *dIdxSC,const float sqrt2,float *dPhi,unsigned int *dR,const float alp,const float df) +{ + extern __shared__ float shared[]; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int localsize = blockDim.x; + int localid = threadIdx.x; + + float a = alp; + float mean[2000]; + float tmp=0; + float SC[20]; + int voxel=0,IDX=0,IDXSC=0; + curandState localState; + + if (idx < N) { + voxel = vox[idx]; + IDX = dIdx[voxel]; + IDXSC = dIdxSC[voxel]; + + tmp = beta*WM[IDX]; + + for (int ii=0;ii NSUBS) + localsize = NSUBS - isub; + + if ((isub+localid) < NSUBS) { + for (int j=0;j 4.0f); + + b = Z - M; + b = (df + b*b)/2.0f; + P = sgammaGPU(a,&localState)/b; + + dZ[isub*TOTVOX+IDX] = Z; + dPhi[isub*TOTVOX+IDX] = P; + } + state[idx] = localState; + } +} + + +__global__ void Phi_GPU(float *dZ,unsigned char *data,float *covar,float *covarF,float *fix,float *alpha,float *WM,float *SpatCoef,const float beta,int *vox,const int NSUBS,const int NSUBTYPES,const int NCOVAR,const int NCOV_FIX,const int N,const int TOTVOXp,const int TOTVOX,curandState *state,int *dIdx,int *dIdxSC,const float sqrt2,float *dPhi,const float alp,const float df) +{ + extern __shared__ float shared[]; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int localsize = blockDim.x; + int localid = threadIdx.x; + + const float a = alp; + float beta2[2000]; + float tmp=0; + float SC[20]; + int voxel=0,IDX=0,IDXSC=0; + curandState localState; 
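/*
 * Illustrative sketch (not from the patch): the full conditional that Phi_GPU
 * and Z_GPU2 draw Phi from. Writing the t-link latent variable as a scale
 * mixture of normals, Z | phi ~ N(mean, 1/phi) with phi ~ Gamma(df/2, df/2),
 * gives phi | Z ~ Gamma((df+1)/2, (df + (Z - mean)^2)/2). sgammaGPU returns a
 * standard Gamma(shape, 1) variate, so dividing by the rate yields the draw.
 * The helper name below is hypothetical, not a function in the source.
 */
__device__ float draw_phi_conditional(float Z, float mean, float df, curandState *state)
{
	float shape = 0.5f*(df + 1.0f);                    /* alpha = (t_df + 1)/2    */
	float rate  = 0.5f*(df + (Z - mean)*(Z - mean));   /* b = (df + (Z - M)^2)/2  */
	return sgammaGPU(shape, state)/rate;               /* Gamma(shape, rate) draw */
}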
+ + if (idx < N) { + voxel = vox[idx]; + IDX = dIdx[voxel]; + IDXSC = dIdxSC[voxel]; + + tmp = beta*WM[IDX]; + + for (int ii=0;ii NSUBS) + localsize = NSUBS - isub; + + if ((isub+localid) < NSUBS) { + for (int j=0;j N) ? INDX[i].hostN:N; + + if (MODEL != 1) { + + for (int i=0;i<2;i++) { + int thr_per_block = TPB2; + int shared_memory = (NCOVAR * thr_per_block) * sizeof(float); + int blck_per_grid = (INDX[i].hostN+thr_per_block-1)/thr_per_block; + + Phi_GPU<<>>(deviceZ,deviceData,deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean, + deviceWM,deviceSpatCoef,beta,INDX[i].deviceVox,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX, + INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC,sqrt2,devicePhi,alpha,t_df); + if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Phi_GPU");exit(0);} + } + + } +} + +void updatePhi(unsigned char *data,float *Z,float *Phi,unsigned char *msk,float beta,float *WM,float *spatCoef,float *covar,unsigned long *seed) +{ + double alpha = ((double)t_df + 1)/2; + unsigned char d; + double beta2,mx= -10,mZ=-10,mtmp,mZ2=-10; + + CUDA_CALL( cudaMemcpy(spatCoef,deviceSpatCoef,TOTVOXp*NCOVAR*sizeof(float),cudaMemcpyDeviceToHost) ); + CUDA_CALL( cudaMemcpy(Z,deviceZ,TOTVOX*NSUBS*sizeof(float),cudaMemcpyDeviceToHost) ); + for (int i=0;i<2;i++) { + for (int j=0;jfabs(Z[isub*TOTVOX+IDX])) ? mZ2:fabs(Z[isub*TOTVOX+IDX]); + beta2 = ((double)t_df + beta2*beta2)/2.0; + Phi[isub*TOTVOX+IDX] = (float)rgamma(alpha,beta2,seed); + // Phi[isub*TOTVOX+IDX] = (float)rgamma(0.5*(double)t_df,0.5*(double)t_df,seed); + // printf("%lf\n",Phi[isub*TOTVOX+IDX]); + } + } + } + CUDA_CALL( cudaMemcpy(devicePhi,Phi,NSUBS*TOTVOX*sizeof(float),cudaMemcpyHostToDevice) ); + printf("beta2_max = %lf %lf %lf\t %lf %d\n",mx,mZ,mtmp,mZ2,(int)d);fflush(NULL); +} + +void updateZGPU(unsigned char *data,float beta,unsigned long *seed) +{ + float alpha = (t_df + 1)/2; + float sqrt2 = sqrtf(2); +// float *rchisq,*drchisq; +// float dfdiv2 = t_df/2; + int N=0; + for (int i=0;i<2;i++) + N = (INDX[i].hostN > N) ? 
INDX[i].hostN:N; + + if (MODEL != 1) { +// rchisq = (float *)malloc(N*sizeof(float)); +// cudaMalloc((void **)&drchisq,N*sizeof(float)); + + for (int i=0;i<2;i++) { +// for (int j=0;j>>(deviceZ,deviceData,deviceCovar,deviceCov_Fix,deviceFixMean,deviceAlphaMean, + deviceWM,deviceSpatCoef,beta,INDX[i].deviceVox,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX, + INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC,sqrt2,devicePhi,deviceResid,alpha,t_df); + if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Z_GPU2");exit(0);} + } +// free(rchisq); +// cudaFree(drchisq); + } + else { + + for (int i=0;i<2;i++) { + int thr_per_block = TPB2; + int shared_memory = (NCOVAR * thr_per_block) * sizeof(float); + int blck_per_grid = (INDX[i].hostN+thr_per_block-1)/thr_per_block; + + Z_GPU1<<>>(deviceZ,deviceData,deviceCovar,deviceCov_Fix,deviceFixMean, + deviceAlphaMean,deviceWM,deviceSpatCoef,beta,INDX[i].deviceVox,NSUBS,NSUBTYPES,NCOVAR,NCOV_FIX, + INDX[i].hostN,TOTVOXp,TOTVOX,devStates,deviceIdx,deviceIdxSC,sqrt2,deviceResid); + if ((err = cudaGetLastError()) != cudaSuccess) {printf("Error %s %s\n",cudaGetErrorString(err)," in Z_GPU1");exit(0);} + + } + } +} + + +__device__ inline float dProbSDNormGPU(const float x) +//Calculate the cumulative probability of standard normal distribution; +{ + const float b1 = 0.319381530f; + const float b2 = -0.356563782f; + const float b3 = 1.781477937f; + const float b4 = -1.821255978f; + const float b5 = 1.330274429f; + const float p = 0.2316419f; + const float c = 0.39894228f; + float t,sgnx,out,y; + + sgnx = copysignf(1.0f,x); + y = sgnx*x; + + t = 1.0f / ( 1.0f + p * y ); + + out = ((x >= 0.0f) - sgnx*c * expf( -y * y / 2.0f ) * t * + ( t *( t * ( t * ( t * b5 + b4 ) + b3 ) + b2 ) + b1 )); + + return out; +} + +__global__ void ProbLogitGPU(float *prb,float *alpha,float *SpatCoef,float beta,float *WM,int *dIdx,int *dIdxSC,int *vox,const int TOTVOX,const int TOTVOXp,const int NSUBTYPES,const int N,const float logit_factor) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x*gridDim.x; + + float x; + float bwhtmat=0; + int voxel,ii,IDX,IDXSC; + + while (idx < N) { + voxel = vox[idx]; + IDX = dIdx[voxel]; + IDXSC = dIdxSC[voxel]; + + bwhtmat = beta*WM[IDX]; + + for (ii=0;ii>>(devicePrb,deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp,NSUBTYPES,INDX[i].hostN,logit_factor); + } + else { + for (int i=0;i<2;i++) + ProbSDNormGPU<<<(INDX[i].hostN+511)/512, 512 >>>(devicePrb,deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp,NSUBTYPES,INDX[i].hostN); + } +} + +__global__ void reduce_for_Mpredict(float *in,float *out,unsigned char *data,const int N) +{ + __shared__ float cache[NN]; + int cacheIndex = threadIdx.x; + int ivox = threadIdx.x + blockIdx.x * blockDim.x; + float c = 0; + float y,t; + float temp = 0; + + while (ivox < N) { + int tmp = (int)data[ivox]; + float inv = in[ivox]; + y = tmp*logf(inv + 1E-35f) + (1 - tmp)*logf(1.0f - inv + 1E-35f) - c; + t = temp + y; + c = (t - temp) - y; + temp = t; + ivox += blockDim.x * gridDim.x; + } + + cache[cacheIndex] = temp; + + __syncthreads(); + + int i = (blockDim.x >> 1); + while (i != 0) { + if (cacheIndex < i) + cache[cacheIndex] += cache[cacheIndex + i]; + __syncthreads(); + i = i >> 1; + } + + if (cacheIndex == 0) + out[blockIdx.x] = cache[0]; +} + +double compute_predictGPU(float beta,unsigned char *data,unsigned char *msk,float *covar,float 
*predict,double **Qhat,int first) +{ + double *Mpredict,DIC; + float *hf_sum,*df_sum; + + Mpredict = (double *)calloc(NSUBTYPES,sizeof(double)); + + CUDA_CALL( cudaMalloc((void **)&df_sum,BPG*sizeof(float)) ); + hf_sum = (float *)calloc(BPG,sizeof(float)); + + DIC = 0; + for (int isub=0;isub>>(deviceCovar,deviceCov_Fix,devicePredict,deviceFixMean, + deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp, + NSUBTYPES,NCOVAR,NSUBS,NCOV_FIX,isub,INDX[i].hostN,logit_factor); + } + } + else { + for (int i=0;i<2;i++) { + ProbSDNormGPU_prediction<<<(INDX[i].hostN+511)/512, 512 >>>(deviceCovar,deviceCov_Fix,devicePredict,deviceFixMean, + deviceAlphaMean,deviceSpatCoef,beta,deviceWM,deviceIdx,deviceIdxSC,INDX[i].deviceVox,TOTVOX,TOTVOXp, + NSUBTYPES,NCOVAR,NSUBS,NCOV_FIX,isub,INDX[i].hostN); + } + } + + // Compute Bayes Factor +/* reduce<<>>(devicePredict,df_sum,NSUBTYPES*TOTVOX); + + cudaMemcpy(hf_sum,df_sum,BPG*sizeof(float),cudaMemcpyDeviceToHost); + + for (int j=0;j>>(&devicePredict[ii*TOTVOX],df_sum,&deviceData[isub*TOTVOX],TOTVOX); + CUDA_CALL( cudaMemcpy(hf_sum,df_sum,BPG*sizeof(float),cudaMemcpyDeviceToHost) ); + + for (int j=0;j Qhat[isub][i]) ? (Mpredict[i] - Mpredict[subtype]) : Qhat[isub][i]; + Qhat[isub][i] = log(exp(Qhat[isub][i] - max) + exp((Mpredict[i] - Mpredict[subtype]) - max)) + max; + } + } + else { + double max = -DBL_MAX + 0.5; + for (int i=0;i Qhat[isub][i]) ? (Mpredict[i] - Mpredict[subtype]) : Qhat[isub][i]; + Qhat[isub][i] = log(exp(Qhat[isub][i] - max) + exp((Mpredict[i] - Mpredict[subtype]) - max)) + max; + } + } + } + + free(Mpredict); + CUDA_CALL( cudaFree(df_sum) ); + free(hf_sum); + + return(DIC); +} + + From 667f850ff42130084c0f4dbc12473b2ff25578e3 Mon Sep 17 00:00:00 2001 From: nicholst Date: Tue, 20 Jan 2015 14:05:32 +0000 Subject: [PATCH 11/13] Re-doing the fix for low iters (oddly, with code found in the master!) --- bsglmm/mcmc.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/bsglmm/mcmc.cpp b/bsglmm/mcmc.cpp index 807b24f..bf8e614 100644 --- a/bsglmm/mcmc.cpp +++ b/bsglmm/mcmc.cpp @@ -296,10 +296,12 @@ void mcmc(float *covar,float *covar_fix,unsigned char *data,float *WM,unsigned c CUDA_CALL( cudaEventCreate(&stop) ); //cudaEventRecord(start,0); - SAVE_ITER = (MAXITER - BURNIN)/10000; - if (SAVE_ITER==0) - SAVE_ITER = 100; - + if ((MAXITER-BURNIN)<10000){ + SAVE_ITER = MAXITER-BURNIN; + } + else { + SAVE_ITER = (MAXITER - BURNIN)/10000; + } PRNtITER = 100; for (iter=0;iter<=MAXITER;iter++) {//printf("iter = %d\n",iter);fflush(NULL); From bd62a53d31aa088db0ea188c5a1fb02e912a0b0f Mon Sep 17 00:00:00 2001 From: nicholst Date: Tue, 20 Jan 2015 14:08:50 +0000 Subject: [PATCH 12/13] Added a warning not to trust the CPU code --- bsglmm/main.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bsglmm/main.cu b/bsglmm/main.cu index acf7041..6d0d7d9 100644 --- a/bsglmm/main.cu +++ b/bsglmm/main.cu @@ -274,6 +274,9 @@ int main (int argc, char * const argv[]) { } else { + fprintf(stderr,"WARNING!!!\n") + fprintf(stderr,"WARNING!!! CPU code not tested! Results might not be right!\n") + fprintf(stderr,"WARNING!!!\n") for (int i=0;i<2;i++) { INDX[i].hostVox = (int *)calloc(INDX[i].hostN,sizeof(int)); } From 59f42bb3d8a43b5e4b1bc64736f111e79f2d5df1 Mon Sep 17 00:00:00 2001 From: nicholst Date: Tue, 20 Jan 2015 16:15:39 +0000 Subject: [PATCH 13/13] Missing semicolons! 
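For reference, compute_predictGPU above accumulates Qhat with a running log-sum-exp so that terms like exp(Mpredict[i] - Mpredict[subtype]) never overflow: the larger exponent is factored out before exponentiating. Below is a minimal host-side sketch of that update; logsumexp_update is an illustrative name, not a function in the source.

#include <math.h>
#include <stdio.h>

/* Running log-sum-exp: given q = log(current sum) and a new log-term d,
 * return log(exp(q) + exp(d)) by factoring out the larger exponent,
 * mirroring the Qhat update in compute_predictGPU. */
static double logsumexp_update(double q, double d)
{
	double m = (d > q) ? d : q;
	return log(exp(q - m) + exp(d - m)) + m;
}

int main(void)
{
	double q = 1000.0;                 /* log-scale accumulator            */
	q = logsumexp_update(q, 1001.0);   /* naive exp(1000.0) would overflow */
	printf("%f\n", q);                 /* about 1001.3133                  */
	return 0;
}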
---
 bsglmm/main.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bsglmm/main.cu b/bsglmm/main.cu
index 6d0d7d9..285a5bf 100644
--- a/bsglmm/main.cu
+++ b/bsglmm/main.cu
@@ -274,9 +274,9 @@ int main (int argc, char * const argv[]) {
 	}
 	else {
-		fprintf(stderr,"WARNING!!!\n")
-		fprintf(stderr,"WARNING!!! CPU code not tested! Results might not be right!\n")
-		fprintf(stderr,"WARNING!!!\n")
+		fprintf(stderr,"WARNING!!!\n");
+		fprintf(stderr,"WARNING!!! CPU code not tested! Results might not be right!\n");
+		fprintf(stderr,"WARNING!!!\n");
 		for (int i=0;i<2;i++) {
 			INDX[i].hostVox = (int *)calloc(INDX[i].hostN,sizeof(int));
 		}
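A standalone check of the save-interval rule reinstated in the mcmc.cpp change above: when the post-burn-in run is shorter than 10000 iterations, SAVE_ITER becomes the full post-burn-in length (the plain integer division would give zero); otherwise it is (MAXITER - BURNIN)/10000. The driver below is a sketch; save_interval is an illustrative name, not a function in the source.

#include <stdio.h>

/* Sketch of the SAVE_ITER rule from mcmc.cpp: never zero for a positive
 * post-burn-in length. */
static int save_interval(int maxiter, int burnin)
{
	int kept = maxiter - burnin;
	if (kept < 10000)
		return kept;          /* short runs: use the full post-burn-in length */
	return kept / 10000;      /* long runs: (MAXITER - BURNIN)/10000          */
}

int main(void)
{
	printf("%d\n", save_interval(600, 100));       /* 500 */
	printf("%d\n", save_interval(150000, 50000));  /* 10  */
	return 0;
}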