From c6ecb4572724bc5fd97d6107b8a259abfb768a12 Mon Sep 17 00:00:00 2001
From: Piotr Jarosik <jarosik.piotr@gmail.com>
Date: Mon, 14 Feb 2022 19:01:19 +0100
Subject: [PATCH] Implemented ReconstructLri3D operator, refs #ARRUS-126

Note: the implementation of the operator is specific for the Vermon matrix probe.
---
 api/python/arrus/utils/imaging.py         | 206 ++++++++++++++++++++++
 api/python/arrus/utils/iq_raw_2_lri_3d.cu | 150 ++++++++++++++++
 2 files changed, 356 insertions(+)
 create mode 100644 api/python/arrus/utils/iq_raw_2_lri_3d.cu

diff --git a/api/python/arrus/utils/imaging.py b/api/python/arrus/utils/imaging.py
index 9e79f3162..29b3a82d2 100644
--- a/api/python/arrus/utils/imaging.py
+++ b/api/python/arrus/utils/imaging.py
@@ -2152,3 +2152,209 @@ def process(self, data):
             .reshape(self._n_frames, -1) \
             .copy()
 
+
+class ReconstructLri3D(Operation):
+    """
+    Rx beamforming for synthetic aperture imaging for matrix array.
+
+    tx_foc, tx_ang_zx, tx_ang_zy: arrays
+
+    Expected input data shape: batch_size, n_emissions, n_rx_x, n_rx_y, n_samples
+    :param x_grid: output image grid points (OX coordinates)
+    :param y_grid: output image grid points (OY coordinates)
+    :param z_grid: output image grid points  (OZ coordinates)
+    :param rx_tang_limits: RX apodization angle limits (given as the tangent of the angle), \
+      a pair of values (min, max). If not provided or None, [-0.5, 0.5] range will be used
+    """
+
+    def __init__(self, x_grid, y_grid, z_grid, tx_foc, tx_ang_zx, tx_ang_zy,
+                 tx_ap_cent_x, tx_ap_cent_y,
+                 speed_of_sound, rx_tang_limits=None):
+        self.tx_ap_cent_y = tx_ap_cent_y
+        self.tx_ap_cent_x = tx_ap_cent_x
+        self.tx_ang_zy = tx_ang_zy
+        self.tx_ang_zx = tx_ang_zx
+        self.tx_foc = tx_foc
+        self.x_grid = x_grid
+        self.y_grid = y_grid
+        self.z_grid = z_grid
+        self.speed_of_sound = speed_of_sound
+        import cupy as cp
+        self.num_pkg = cp
+        self.rx_tang_limits = rx_tang_limits
+
+    def set_pkgs(self, num_pkg, **kwargs):
+        if num_pkg is np:
+            raise ValueError("ReconstructLri3D operation is implemented for GPU only.")
+
+    def prepare(self, const_metadata):
+        import cupy as cp
+
+        current_dir = os.path.dirname(os.path.join(os.path.abspath(__file__)))
+        _kernel_source = Path(os.path.join(current_dir, "iq_raw_2_lri_3d.cu")).read_text()
+        self._kernel_module = self.num_pkg.RawModule(code=_kernel_source)
+        self._kernel_module.compile()
+        self._kernel = self._kernel_module.get_function("iqRaw2Lri3D")
+
+        # INPUT PARAMETERS.
+        # Input data shape.
+        self.n_seq, self.n_tx, self.n_rx_y, self.n_rx_x, self.n_samples = const_metadata.input_shape
+
+        seq = const_metadata.context.raw_sequence
+        # TODO note: we assume here that a single TX/RX has the below properties
+        # the same for each TX/RX. Validation is missing here.
+        ref_tx_rx = seq.ops[0]
+        ref_tx = ref_tx_rx.tx
+        ref_rx = ref_tx_rx.rx
+        probe_model = const_metadata.context.device.probe.model
+        acq_fs = (const_metadata.context.device.sampling_frequency / ref_rx.downsampling_factor)
+        start_sample = ref_rx.sample_range[0]
+
+        self.y_size = len(self.y_grid)
+        self.x_size = len(self.x_grid)
+        self.z_size = len(self.z_grid)
+        output_shape = (self.n_seq, self.n_tx, self.y_size, self.x_size, self.z_size)
+        self.output_buffer = self.num_pkg.zeros(output_shape, dtype=self.num_pkg.complex64)
+        x_block_size = min(self.z_size, 8)
+        y_block_size = min(self.x_size, 8)
+        z_block_size = min(self.y_size, 8)
+        self.block_size = (x_block_size, y_block_size, z_block_size)
+        self.grid_size = (int((self.z_size-1)//z_block_size + 1),
+                          int((self.x_size-1)//x_block_size + 1),
+                          int((self.y_size-1)//y_block_size + 1))
+
+        self.y_pix = self.num_pkg.asarray(self.y_grid, dtype=self.num_pkg.float32)
+        self.x_pix = self.num_pkg.asarray(self.x_grid, dtype=self.num_pkg.float32)
+        self.z_pix = self.num_pkg.asarray(self.z_grid, dtype=self.num_pkg.float32)
+
+        # System and transmit properties.
+        self.sos = self.num_pkg.float32(self.speed_of_sound)
+        self.fs = self.num_pkg.float32(const_metadata.data_description.sampling_frequency)
+        self.fn = self.num_pkg.float32(ref_tx.excitation.center_frequency)
+        self.tx_foc = self.num_pkg.asarray(self.tx_foc).astype(np.float32)
+        self.tx_ang_zx = self.num_pkg.asarray(self.tx_ang_zx).astype(np.float32)
+        self.tx_ang_zy = self.num_pkg.asarray(self.tx_ang_zy).astype(np.float32)
+
+        # Could be determined automatically based on
+        self.tx_ap_cent_x = self.num_pkg.asarray(self.tx_ap_cent_x).astype(np.float32)
+        self.tx_ap_cent_y = self.num_pkg.asarray(self.tx_ap_cent_y).astype(np.float32)
+
+        # Probe description
+        # TODO specific for Vermon mat-3d probe.
+        probe_model = const_metadata.context.device.probe.model
+        pitch = 0.3e-3 # probe_model.pitch
+        self.n_elements = 32
+        # General regular position of elements
+        element_pos = np.arange(-(self.n_elements - 1)/2, self.n_elements/2)
+        element_pos_x = element_pos*pitch
+        element_pos_y = np.zeros_like(element_pos)
+        # E.g. rows
+        # 0:8 -> 0:8
+        # 8:16 -> 9:17 (+ 1 offset)
+        # 16:24 -> 18:26 (+2 offset)
+        # 24:32 -> 27:35 (+ 3 offset)
+        for i in range(4):
+            element_pos_y[i*8:(i+1)*8] = (element_pos[i*8:(i+1)*8]+i)*pitch
+
+        element_pos_x = element_pos_x.astype(np.float32)
+        element_pos_y = element_pos_y.astype(np.float32)
+        # Put the data into GPU constant memory.
+        device_props = cp.cuda.runtime.getDeviceProperties(0)
+        if device_props["totalConstMem"] < 256*2*4:  # 2 float32 arrays, 256 elements max
+            raise ValueError("There is not enough constant memory available!")
+        x_elem = np.asarray(element_pos_x, dtype=self.num_pkg.float32)
+        self._x_elem_const = _get_const_memory_array(
+            self._kernel_module, name="xElemConst", input_array=x_elem)
+        y_elem = np.asarray(element_pos_y, dtype=self.num_pkg.float32)
+        self._y_elem_const = _get_const_memory_array(
+            self._kernel_module, name="yElemConst", input_array=y_elem)
+
+        def get_min_max_x_y(aperture):
+            cords = np.argwhere(aperture)
+            y, x = zip(*cords)
+            return np.min(x), np.max(x), np.min(y), np.max(y)
+
+        # TODO assumption, that probe has the same number elements in both dimensions
+        apertures = (tx_rx.tx.aperture.reshape((self.n_elements, self.n_elements)) for tx_rx in seq.ops)
+        min_max_x_y = (get_min_max_x_y(aperture) for aperture in apertures)
+        min_x, max_x, min_y, max_y = zip(*min_max_x_y)
+        min_x, max_x = np.atleast_1d(min_x), np.atleast_1d(max_x)
+        min_y, max_y = np.atleast_1d(min_y), np.atleast_1d(max_y)
+        self.tx_ap_first_elem_x = self.num_pkg.asarray(min_x, dtype=self.num_pkg.int32)
+        self.tx_ap_last_elem_x = self.num_pkg.asarray(max_x, dtype=self.num_pkg.int32)
+        self.tx_ap_first_elem_y = self.num_pkg.asarray(min_y, dtype=self.num_pkg.int32)
+        self.tx_ap_last_elem_y = self.num_pkg.asarray(max_y, dtype=self.num_pkg.int32)
+
+        # Find the tx_center_delay
+        # 1. Find the position of the center.
+        ap_center_x = (element_pos_x[min_x] + element_pos_x[max_x])/2
+        ap_center_y = (element_pos_y[min_y] + element_pos_y[max_y])/2
+        ap_center_elem_x = np.interp(ap_center_x, element_pos_x, np.arange(len(element_pos_x)))
+        ap_center_elem_y = np.interp(ap_center_y, element_pos_y, np.arange(len(element_pos_y)))
+        ap_center_elem_x = np.round(ap_center_elem_x).astype(np.int32)
+        ap_center_elem_y = np.round(ap_center_elem_y).astype(np.int32)
+        # Make sure, that for all TX/RXs:
+        # The center element is in the aperture
+        # All TX/RX have (almost) the same delay in the aperture's center
+        tx_center_delay = None
+        for i, tx_rx in enumerate(seq.ops):
+            tx = tx_rx.tx
+            tx_center_x = ap_center_elem_x[i]
+            tx_center_y = ap_center_elem_y[i]
+            aperture = tx.aperture.reshape((self.n_elements, self.n_elements))
+            # TODO the below will work only for full aperture transmits
+            delays = tx.delays.reshape((self.n_elements, self.n_elements))
+            if not aperture[tx_center_y, tx_center_x]:
+                # The aperture's center should transmit signal
+                raise ValueError("TX aperture center should be turned on.")
+            if tx_center_delay is None:
+                tx_center_delay = delays[tx_center_y, tx_center_x]
+            else:
+                # Make sure that the center' delays is the same for all TX/RXs
+                current_center_delay = delays[tx_center_y, tx_center_x]
+                if not np.isclose(tx_center_delay, current_center_delay):
+                    raise ValueError(f"TX/RX {i}: center delay is not close "
+                                     f"to the center delay of other TX/RXs."
+                                     f"Assumed that the center element is:"
+                                     f"({tx_center_y, tx_center_x}). "
+                                     f"Center delays should be equalized "
+                                     f"for all TX/RXs. ")
+
+        # Min/max tang
+        if self.rx_tang_limits is not None:
+            self.min_tang, self.max_tang = self.rx_tang_limits
+        else:
+            # Default:
+            self.min_tang, self.max_tang = -0.5, 0.5
+        self.min_tang = self.num_pkg.float32(self.min_tang)
+        self.max_tang = self.num_pkg.float32(self.max_tang)
+        burst_factor = ref_tx.excitation.n_periods / (2*self.fn)
+        self.initial_delay = -start_sample/65e6+burst_factor+tx_center_delay
+        self.initial_delay = self.num_pkg.float32(self.initial_delay)
+
+        self.rx_apod = scipy.signal.windows.hamming(20).astype(np.float32)
+        self.rx_apod = self.num_pkg.asarray(self.rx_apod)
+        self.n_rx_apod = self.num_pkg.int32(len(self.rx_apod))
+
+        return const_metadata.copy(input_shape=output_shape)
+
+    def process(self, data):
+        data = self.num_pkg.ascontiguousarray(data)
+        params = (
+            self.output_buffer, data,
+            self.n_seq, self.n_tx, self.n_rx_y, self.n_rx_x, self.n_samples,
+            self.z_pix, self.z_size,
+            self.x_pix, self.x_size,
+            self.y_pix, self.y_size,
+            self.sos, self.fs, self.fn,
+            self.tx_foc, self.tx_ang_zx, self.tx_ang_zy,
+            self.tx_ap_cent_x, self.tx_ap_cent_y,
+            self.tx_ap_first_elem_x, self.tx_ap_last_elem_x,
+            self.tx_ap_first_elem_y, self.tx_ap_last_elem_y,
+            self.min_tang, self.max_tang,
+            self.min_tang, self.max_tang,
+            self.initial_delay,
+            self.rx_apod, self.n_rx_apod
+        )
+        self._kernel(self.grid_size, self.block_size, params)
+        return self.output_buffer
\ No newline at end of file
diff --git a/api/python/arrus/utils/iq_raw_2_lri_3d.cu b/api/python/arrus/utils/iq_raw_2_lri_3d.cu
new file mode 100644
index 000000000..742c7a699
--- /dev/null
+++ b/api/python/arrus/utils/iq_raw_2_lri_3d.cu
@@ -0,0 +1,150 @@
+#include <cupy/complex.cuh>
+#define M_PI 3.14159265358979
+
+__constant__ float xElemConst[256];
+__constant__ float yElemConst[256];
+
+__forceinline__ __device__ float ownHypotf(float z, float x)
+{
+    return sqrtf(z*z + x*x);
+}
+
+__forceinline__ __device__ float ownHypotf(float z, float x, float y)
+{
+    return sqrtf(z*z + x*x + y*y);
+}
+
+__forceinline__ __device__  complex<float> interpLinear(const complex<float> *input, float sample) {
+    float interpWgh = modff(sample, &sample);
+    int intSample = int(sample);
+    return input[intSample]*(1 - interpWgh) + input[intSample+1]*interpWgh;
+}
+
+__forceinline__ __device__  float interpLinearNormalized(const float *input, int inputSize, float sample) {
+    // Normalized => sample == 0 means input[0], sample == 1 means input[inputSize-1]
+    sample = sample*(inputSize-1);
+    float interpWgh = modff(sample, &sample);
+    int intSample = int(sample);
+    return input[intSample]*(1-interpWgh) + input[intSample+1]*interpWgh;
+}
+
+extern "C"
+__global__ void iqRaw2Lri3D(complex<float> *iqLri, const complex<float> *input,
+                            const int nSeq, const int nTx, const int nElemY, const int nElemX, const int nSamp,
+                            const float *zPix, const int nZPix,
+                            const float *xPix, const int nXPix,
+                            const float *yPix, const int nYPix,
+                            const float sos, const float fs, const float fn,
+                            const float *txFoc, const float *txAngZX, const float *txAngZY,
+                            const float *txApCentX, const float *txApCentY,
+                            const int *txApFstElemX, const int *txApLstElemX,
+                            const int *txApFstElemY, const int *txApLstElemY,
+                            const float minRxTangZX, const float maxRxTangZX,
+                            const float minRxTangZY, const float maxRxTangZY,
+                            const float initDel,
+                            const float *rxApod, const int nRxApod) {
+
+    int z = blockIdx.x * blockDim.x + threadIdx.x;
+    int x = blockIdx.y * blockDim.y + threadIdx.y;
+    int y = blockIdx.z * blockDim.z + threadIdx.z;
+    if (z >= nZPix || x >= nXPix || y >= nYPix) {
+        return;
+    }
+    unsigned txOffset, offset;
+    float txAngZn, txAngAz;
+    float txDist, rxDist, rxTang, txApod, rxApodX, rxApodY, time, iSamp;
+    float modSin, modCos, pixWgh;
+    complex<float> samp, pix;
+    complex<float> modFactor;
+    const float omega = 2 * M_PI * fn;
+    const float sosInv = 1 / sos;
+    const float zDistInv = 1 / zPix[z];
+    const float rngRxTangZXInv = 1 / (maxRxTangZX - minRxTangZX); // inverted zx-tangent range TODO parameter?
+    const float rngRxTangZYInv = 1 / (maxRxTangZY - minRxTangZY); // inverted zy-tangent range TODO parameter?
+    iqLri[z + x*nZPix + y*nZPix*nXPix] = 0;
+
+    for (int iTx=0; iTx < nTx; ++iTx) {
+        // Plane inclinations -> spherical coordinates
+        txAngZn = atanf(ownHypotf(tanf(txAngZX[iTx]), tanf(txAngZY[iTx]))); // Zenith
+        txAngAz = atan2f(tanf(txAngZY[iTx]), tanf(txAngZX[iTx])); // Azimuth
+        txOffset = iTx*nSamp*nElemX*nElemY; // TODO number of sequences in transmit?
+
+        if (!isinf(txFoc[iTx])) {
+            /* STA */
+            float zFoc = txFoc[iTx] * cosf(txAngZn);
+            float xFoc = txFoc[iTx] * sinf(txAngZn) * cosf(txAngAz) + txApCentX[iTx];
+            float yFoc = txFoc[iTx] * sinf(txAngZn) * sinf(txAngAz) + txApCentY[iTx];
+
+            float pixFocArrang;
+
+            if (txFoc[iTx] <= 0.f) {
+                /* Virtual Point Source BEHIND probe surface */
+                // Valid pixels are assumed to be always in front of the focal point (VSP)
+                pixFocArrang = 1.f;
+            }
+            else {
+                /* Virtual Point Source IN FRONT OF probe surface */
+                // Projection of the Foc-Pix vector on the ApCent-Foc vector (dot product) ...
+                // to determine if the pixel is behind (-) or in front of (+) the focal point (VSP).
+                pixFocArrang = (((zPix[z]-zFoc)* zFoc + (xPix[x]-xFoc)*(xFoc-txApCentX[iTx]) + (yPix[x]-yFoc)*(yFoc-txApCentY[iTx])) >= 0.f) ? 1.f : -1.f;
+            }
+            txDist = ownHypotf(zPix[z]-zFoc, xPix[x]-xFoc, yPix[y]-yFoc);
+            txDist *= pixFocArrang; // Compensation for the Pix-Foc arrangement
+            // txFoc is an artificial time, that actually does not happen
+            txDist += txFoc[iTx]; // Compensation for the reference time being the moment when txApCent fires.
+
+            // Projections of Foc-Pix vector on the rotated Foc-ApEdge vectors (dot products) ...
+            // to determine if the pixel is in the sonified area (dot product >= 0).
+            // Foc-ApEdgeFst vector is rotated left, Foc-ApEdgeLst vector is rotated right.
+            txApod = (
+                    ((-(zPix[z] - zFoc)*(xElemConst[txApFstElemX[iTx]] - xFoc) - (xPix[x] - xFoc)*zFoc) * pixFocArrang >= 0.f) &&
+                    ( ((zPix[z] - zFoc)*(xElemConst[txApLstElemX[iTx]] - xFoc) + (xPix[x] - xFoc)*zFoc) * pixFocArrang >= 0.f) &&
+                    ((-(zPix[z] - zFoc)*(yElemConst[txApFstElemY[iTx]] - yFoc) - (yPix[y] - yFoc)*zFoc) * pixFocArrang >= 0.f) &&
+                    (( (zPix[z] - zFoc)*(yElemConst[txApLstElemY[iTx]] - yFoc) + (yPix[y] - yFoc)*zFoc) * pixFocArrang >= 0.f)
+                    ) ? 1.f : 0.f;
+        }
+        else {
+            /* PWI */
+            txDist = zPix[z] * cosf(txAngZn) + ((xPix[x] - txApCentX[iTx]) * cosf(txAngAz) + (yPix[y] - txApCentY[iTx]) * sinf(txAngAz)) * sinf(txAngZn);
+
+            // Projections of ApEdge-Pix vector on the rotated unit vector of tx direction (dot products) ...
+            // to determine if the pixel is in the sonified area (dot product >= 0).
+            // For ApEdgeFst, the vector is rotated left, for ApEdgeLst the vector is rotated right.
+            txApod = (((-sinf(txAngZX[iTx])*zPix[z] + cosf(txAngZX[iTx]) * (xPix[x]-xElemConst[txApFstElemX[iTx]])) >= 0.f) &&
+                    (   (sinf(txAngZX[iTx])*zPix[z] - cosf(txAngZX[iTx]) * (xPix[x]-xElemConst[txApLstElemX[iTx]])) >= 0.f) &&
+                    (  (-sinf(txAngZY[iTx])*zPix[z] + cosf(txAngZY[iTx]) * (yPix[x]-yElemConst[txApFstElemY[iTx]])) >= 0.f) &&
+                    (   (sinf(txAngZY[iTx])*zPix[z] - cosf(txAngZY[iTx]) * (yPix[x]-yElemConst[txApLstElemY[iTx]])) >= 0.f )) ? 1.f : 0.f;
+        }
+
+        pix.real(0.0f);
+        pix.imag(0.0f);
+        pixWgh = 0.0f;
+
+        if (txApod != 0.f) {
+            for (int iElemY = 0; iElemY < nElemY; ++iElemY) {
+                rxTang = (yPix[y]-yElemConst[iElemY])*zDistInv;
+                if (rxTang < minRxTangZY || rxTang > maxRxTangZY) continue;
+                rxApodY = interpLinearNormalized(rxApod, nRxApod, (rxTang-minRxTangZY)*rngRxTangZYInv);
+                for (int iElemX = 0; iElemX < nElemX; ++iElemX) {
+                    offset = iTx*nElemY*nElemX*nSamp + iElemY*nElemX*nSamp + iElemX*nSamp;
+                    rxTang = (xPix[x] - xElemConst[iElemX])*zDistInv;
+                    if (rxTang < minRxTangZX || rxTang > maxRxTangZX) continue;
+                    rxApodX = interpLinearNormalized(rxApod, nRxApod, (rxTang-minRxTangZX)*rngRxTangZXInv);
+                    rxDist = ownHypotf(zPix[z], xPix[x] - xElemConst[iElemX], yPix[y] - yElemConst[iElemY]);
+                    time = (txDist+rxDist)*sosInv + initDel;
+                    iSamp = time*fs;
+                    if (iSamp < 0 || iSamp > static_cast<float>(nSamp-1)) continue;
+                    __sincosf(omega*time, &modSin, &modCos);
+                    modFactor = complex<float>(modCos, modSin);
+                    samp = interpLinear(input+offset, iSamp);
+                    pix += samp*modFactor*rxApodX*rxApodY;
+                    pixWgh += rxApodX*rxApodY;
+                }
+            }
+        }
+        if (pixWgh != 0.0f) {
+            iqLri[z + x*nZPix + y*nZPix*nXPix] += pix * txApod / pixWgh;
+        }
+    }
+}
+