Fix for #720 (by @lightsighter) #721

manopapad · 2022-11-30T20:25:39Z

No description provided.

manopapad · 2022-12-15T21:46:50Z

@lightsighter The fix is breaking CI. Here's a minimal reproducer:

import cunumeric as num
a = num.ones((10, 10))
v = num.ones((3, 5))
num.convolve(v, a, mode="same")

which fails at entry to launch_small_tile_kernel with:

root_rect = <0,0>..<2,4>
subrect = <0,0>..<2,4>
filter_rect = <0,0>..<9,9>
properties.sharedMemPerMultiprocessor = 98304
extents = 1010
centers = 55
tile = <10,10>
smem_size = 3200
max_smem_size = 98304
legion_python: /gpfs/fs1/mpapadakis/new/cunumeric/src/cunumeric/convolution/convolve.cu:676: void cunumeric::launch_small_tile_kernel(legate::AccessorWO<T, DIM>, legate::AccessorRO<T, DIM>, legate::AccessorRO<T, DIM>, Legion::Rect<DIM>&, Legion::Rect<DIM>&, Legion::Rect<DIM>&, const cudaDeviceProp&, const unsigned int*, const unsigned int*, Legion::Point<DIM>&, unsigned int, size_t) [with VAL = double; int DIM = 2; legate::AccessorWO<T, DIM> = Legion::FieldAccessor<LEGION_WRITE_DISCARD, double, 2, long long int, Realm::AffineAccessor<double, 2, long long int>, false>; legate::AccessorRO<T, DIM> = Legion::FieldAccessor<LEGION_READ_PRIV, double, 2, long long int, Realm::AffineAccessor<double, 2, long long int>, false>; Legion::Rect<DIM> = Realm::Rect<2, long long int>; Legion::Point<DIM> = Realm::Point<2, long long int>; size_t = long unsigned int]: Assertion `(input_pitch * sizeof(VAL)) == smem_size' failed.

lightsighter · 2022-12-30T10:51:12Z

@manopapad Another minor tweak:

template <typename VAL, int DIM>
static unsigned roundup_tile(Point<DIM>& tile,
                             const Point<DIM>& bounds,
                             const Point<DIM>& padding,
                             const unsigned max_size)
{
if (DIM == 1) {
    // In this single case we can just solve for this directly
    unsigned elements = max_size / sizeof(VAL);
    assert(elements > padding[0]);
    if (tile[0] < (elements - padding[0])) {
      tile[0] = elements - padding[0];
      if (bounds[0] < tile[0]) tile[0] = bounds[0];
    }
    return (tile[0] + padding[0]) * sizeof(VAL);
  } else {
    // Compute the initial size
    // Shrink the tile to the bounds if necessary
    unsigned result = sizeof(VAL);
    for (int d = 0; d < DIM; d++) {
      if (bounds[d] < tile[d]) tile[d] = bounds[d];
      result *= (tile[d] + padding[d]);
    }
    // Find the two smallest dimensions and increase one of them
    // until we hit the second smallest one or exceed max_smem_size
    unsigned skipdims = 0;
    bool all_same     = true;
    while (true) {
      int d1 = DIM - 1, d2 = -1;
      int t1 = tile[d1], t2 = 0;
      while (t1 == bounds[d1]) {
        skipdims |= (1 << d1);
        if (--d1 < 0) return result; // all dims at their bounds so we're done
        t1 = tile[d1];
      }
      for (int d = d1 - 1; d >= 0; d--) {
        if (skipdims & (1 << d)) continue;
        // Skip any dimension that is at its bound
        if (tile[d] == bounds[d]) {
          skipdims |= (1 << d);
          continue;
        }
        if (tile[d] < t1) {
          d2 = d1;
          t2 = t1;
          d1 = d;
          t1 = tile[d];
        } else if ((d2 < 0) || (tile[d] < t2)) {
          d2 = d;
          t2 = tile[d];
        }
      }
      if (d2 == -1) {
        // All the other dimensions are at their bounds, check that
        // the last dimension is also at its bound if not solve
        unsigned pitch = sizeof(VAL);
        for (int d = 0; d < DIM; d++) 
          if (d != d1)
            pitch *= (tile[d] + padding[d]);
        // Make sure the last dimension is as large as it can go too
        if (tile[d1] < bounds[d1]) {
          unsigned elements = max_size / pitch;
          assert(elements > padding[d1]);
          assert(tile[d1] < (elements - padding[d1]));
          tile[d1] = elements - padding[d1];
          if (bounds[d1] < tile[d1]) tile[d1] = bounds[d1];
        }
        return pitch * (tile[d1] + padding[d1]);
      }
       // If we ever get two dimensions of the same size then see what dimension
      // has the next largest value. If we can't find one that is larger then
      // we know that there is no smallest dimension so we can march all the
      // dimensions together at this point
      if (t1 == t2) {
        d2 = -1;
        for (int d = 0; d < DIM; d++) {
          if (d == d1) continue;
          if (tile[d] <= tile[d1]) continue;
          if ((d2 == -1) || (tile[d] < tile[d2])) {
            d2 = d;
            t2 = tile[d];
          }
        }
        if (d2 == -1) break;
      }
      // Solve for the max we can walk
      unsigned pitch = sizeof(VAL);
      for (int d = 0; d < DIM; d++)
        if (d != d1) pitch *= (tile[d] + padding[d]);
      unsigned elements = max_size / pitch;
      if ((elements <= padding[d1]) || (t1 >= (elements - padding[d1]))) {
        skipdims |= (1 << d1);
        continue;
      }
      unsigned bound = elements - padding[d1];
      if (bounds[d1] < bound) {
        tile[d1] = bounds[d1];
        result   = pitch * (tile[d1] + padding[d1]);
      } else if (bound < t2) {
        tile[d1] = bound;
        result   = pitch * (bound + padding[d1]);
        all_same = false;
        break;
      } else {
        tile[d1] = t2;
        result   = pitch * (t2 + padding[d1]);
      }
    }
    if (all_same) {
      // Step all the dimensions together until we hit
      // the shared memory upper bound we're targetting
      // This algorithm is in theory slow, but the max
      // memory sizes of caches are "small" and the amount
      // of memory will grow polynomially in the number
      // of dimensions so it should converge quickly
      while (true) {
        unsigned next_size = sizeof(VAL);
        for (int d = 0; d < DIM; d++)
          if (skipdims & (1 << d))
            next_size *= (tile[d] + padding[d]);
          else if (tile[d] == bounds[d]) {
            next_size *= (tile[d] + padding[d]);
            skipdims |= (1 << d);
          } else
            next_size *= (tile[d] + 1 + padding[d]);
        if ((next_size > max_size) || (next_size == result)) break;
        result = next_size;
        for (int d = 0; d < DIM; d++) {
          if (skipdims && (1 << d)) continue;
          tile[d]++;
        }
      }
    }
    return result;
  }
}

manopapad · 2023-01-03T22:58:32Z

With the latest fix we no longer hit the assertion, and the original infinite loop issue is also fixed.

However, now that the code completes, we are producing the wrong results on the 3d testcase with 8 GPUs:

import numpy as np
import cunumeric as cn
import scipy.signal as sig
a = cn.random.randint(0, 5, (10, 10, 10))
v = cn.random.randint(0, 5, (3, 5, 3))
anp = a.__array__()
vnp = v.__array__()
out = cn.convolve(a, v, mode="same")
out_np = sig.convolve(anp, vnp, mode="same")
assert np.array_equal(out, out_np)

where:

out[0,:,:] =
[[ 54  68  75  67  83  93  79  24  16   0]
 [ 70 106 120 110 139 119 113  55  18   0]
 [102 127 146 149 172 153 161  70  33   0]
 [100 146 158 132 152 132 139  69  34   0]
 [116 160 156 144 148 139 142  78  33   0]
 [111 170 145 154 136 124 137  87  48   0]
 [113 156 151 145 141 117  91  60  46   0]
 [115 139 149 125 144 163 104  76  25   0]
 [ 76 121 130 101 120 109 110  74  36   0]
 [ 57  81 101  86  76  77  58  35  24   0]]

and

out_np[0,:,:] =
[[ 54  68  75  67  83  93  79  58  55  27]
 [ 70 106 120 110 139 119 113  91  69  64]
 [102 127 146 149 172 153 161 121  98  58]
 [100 146 158 132 152 132 139 119 115  81]
 [116 160 156 144 148 139 142 127 117  63]
 [111 170 145 154 136 124 137 153 140  77]
 [113 156 151 145 141 117  91 112 148  88]
 [115 139 149 125 144 163 104 130 114  77]
 [ 76 121 130 101 120 109 110 106  90  77]
 [ 57  81 101  86  76  77  58  62  57  61]]

Fix for nv-legate#720 (by @lightsighter)

09c6981

manopapad added the category:bug-fix PR is a bug fix and will be classified as such in release notes label Nov 30, 2022

Merge branch 'branch-22.12' into bug/720

887d270

manopapad added 2 commits December 30, 2022 10:24

Merge branch 'branch-22.12' into bug/720

2352003

Another fix

9d2e66d

marcinz changed the base branch from branch-22.12 to branch-23.03 January 26, 2023 00:58

Merge branch 'branch-23.03' into bug/720

97a495d

manopapad mentioned this pull request Feb 24, 2023

Fix is_complete_for check nv-legate/legate#587

Merged

Merge branch 'branch-23.03' into bug/720

3cc7143

manopapad merged commit 7f1c95e into nv-legate:branch-23.03 Mar 1, 2023

manopapad deleted the bug/720 branch July 19, 2023 17:29

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Fix for #720 (by @lightsighter) #721

Fix for #720 (by @lightsighter) #721

manopapad commented Nov 30, 2022

manopapad commented Dec 15, 2022

lightsighter commented Dec 30, 2022

manopapad commented Jan 3, 2023

Fix for #720 (by @lightsighter) #721

Fix for #720 (by @lightsighter) #721

Conversation

manopapad commented Nov 30, 2022

manopapad commented Dec 15, 2022

lightsighter commented Dec 30, 2022

manopapad commented Jan 3, 2023