From 7a2b328511ff38ae568ec29092a02110201d48f9 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Wed, 18 Sep 2024 11:53:47 +0300 Subject: [PATCH 1/5] [avx2] Fix VER_ISP for 32x4 --- src/strategies/avx2/intra-avx2.c | 70 +++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 5 deletions(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 59b27be4..6efe55c1 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -2622,6 +2622,67 @@ static void angular_pdpc_ver_8x4_scale1_avx2(uvg_pixel* dst, const uvg_pixel* re } +static void angular_pdpc_ver_8x2_scale2_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) +{ + // NOTE: This function is just the w8 function, retrofitted to work with width 16 and up when scale is 2. + // Limit is 6 and therefore there is no meaningful work to be done when x > 6, so only the first column of 8x2 chunks is handled. NOTE(review): the limit of 6 was originally justified by scale being 1; confirm it is still the intended value for scale 2. + // This function handles cases where prediction angle is high. For PDPC, this means the needed reference samples are close together, enabling more effective loading. + const int scale = 2; + const int log2_width = uvg_g_convert_to_log2[width]; + + const int limit = 6; + + __m128i vseq = _mm_set_epi64x(1, 0); + __m128i vidx = _mm_slli_epi32(vseq, log2_width); + __m256i v32s = _mm256_set1_epi16(32); + + const int offset = scale * 16; + const int inv_angle_offset = mode_disp * 64; + const int shuf_offset = mode_disp * 16; + + const __m256i vweight = _mm256_load_si256((const __m256i*) & intra_pdpc_w8_ver_weight[offset]); + const int16_t* shifted_inv_angle_sum = &intra_pdpc_shifted_inv_angle_sum[inv_angle_offset]; + const __m128i vshuf = _mm_loadu_si128((__m128i*) & intra_pdpc_shuffle_vectors_8x2_scale2_ver[shuf_offset]); + + // For width 8, height must be at least 2. Handle 2 lines at once. 
+ for (int y = 0; y < height; y += 2) { + /*ALIGNED(32) int16_t left[16] = { 0 }; + for (int yy = 0; yy < 2; ++yy) { + for (int xx = 0; xx < limit; ++xx) { + left[yy * 8 + xx] = ref_side[(y + yy) + shifted_inv_angle_sum[xx] + 1]; + } + }*/ + __m128i vleft = _mm_loadu_si128((__m128i*) & ref_side[y + shifted_inv_angle_sum[0] + 1]); + vleft = _mm_shuffle_epi8(vleft, vshuf); + + __m128i vdst = _mm_i64gather_epi64((const long long int*)(dst + y * width), vidx, 1); + __m256i vdst16 = _mm256_cvtepu8_epi16(vdst); + __m256i vleft16 = _mm256_cvtepu8_epi16(vleft); + //__m256i vleft = _mm256_loadu_si256((__m256i*)left); + + __m256i accu = _mm256_sub_epi16(vleft16, vdst16); + accu = _mm256_mullo_epi16(vweight, accu); + accu = _mm256_add_epi16(accu, v32s); + accu = _mm256_srai_epi16(accu, 6); + accu = _mm256_add_epi16(vdst16, accu); + + __m128i lo = _mm256_castsi256_si128(accu); + __m128i hi = _mm256_extracti128_si256(accu, 1); + __m128i filtered = _mm_packus_epi16(lo, hi); + + // TODO: if this if branch is deemed to cause slow down, make another version of this, where this check is not needed. 
+ // If this does not slow down significantly, make this same check in other functions to reduce the function call switch case complexity + if (width == 8) { + _mm_store_si128((__m128i*)(dst + (y * width)), filtered); + } + else { + *(uint64_t*)(dst + (y + 0) * width) = _mm_extract_epi64(filtered, 0); + *(uint64_t*)(dst + (y + 1) * width) = _mm_extract_epi64(filtered, 1); + } + } +} + + static void angular_pdpc_ver_w16_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* ref_side, const int width, const int height, const int mode_disp) { __m256i v32s = _mm256_set1_epi16(32); @@ -3631,19 +3692,18 @@ static void uvg_angular_pred_avx2( else angular_pdpc_ver_4x4_scale0_avx2(dst, ref_side, width, height, mode_disp); } - else /*if (scale == 1)*/ { + else if (scale == 1) { if (mode_disp < 8) angular_pdpc_ver_8x4_scale1_high_angle_avx2(dst, ref_side, width, height, mode_disp); else angular_pdpc_ver_8x4_scale1_avx2(dst, ref_side, width, height, mode_disp); } - // This branch was never executed. There is no case where width == 8 and scale == 2 and PDPC is enabled. 
- /*else { + else { if (mode_disp < 10) - angular_pdpc_ver_w8_high_angle_avx2(dst, ref_side, height, mode_disp); + angular_pdpc_ver_w8_high_angle_avx2(dst, ref_side, height, 2, mode_disp); else angular_pdpc_ver_8x2_scale2_avx2(dst, ref_side, width, height, mode_disp); - }*/ + } break; case 16: // 16 width and higher done with the same functions case 32: From c4d6490b98d86f976bc71ade3643a8360e1d2de8 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Tue, 10 Sep 2024 10:49:14 +0300 Subject: [PATCH 2/5] [avx2] change aligned reads to unaligned --- src/strategies/avx2/intra-avx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 6efe55c1..0db1caf0 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -525,7 +525,7 @@ static void angular_pred_w4_hor_high_angle_avx2(uvg_pixel* dst, const uvg_pixel* for (int y = 0; y < height; y += 4) { // This solution assumes the delta int values to be 64-bit // Cast from 16-bit to 64-bit. - __m128i vidx = _mm_load_si128((__m128i*)delta_int); + __m128i vidx = _mm_loadu_si128((__m128i*)delta_int); __m256i vidx256 = _mm256_cvtepu16_epi64(vidx); __m256i vp = _mm256_i64gather_epi64((const long long int*)&ref_main[y], vidx256, 1); From f55dd7f24fde2ab4a17474c3f5338faaf06f15a2 Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Mon, 2 Sep 2024 12:01:55 +0300 Subject: [PATCH 3/5] [intra] Start to clean up the reference building --- src/intra.c | 141 +++++++++++++++++++------ src/strategies/generic/intra-generic.c | 28 +---- 2 files changed, 109 insertions(+), 60 deletions(-) diff --git a/src/intra.c b/src/intra.c index 39785747..87690070 100644 --- a/src/intra.c +++ b/src/intra.c @@ -779,7 +779,6 @@ void uvg_intra_build_reference_any( const uvg_pixel dc_val = 1 << (UVG_BIT_DEPTH - 1); //TODO: add used bitdepth as a variable const int is_chroma = color != COLOR_Y ? 
1 : 0; - const int is_dual_tree = is_chroma && state->encoder_control->cfg.dual_tree && state->frame->is_irap; // Get multi ref index from CU under prediction or reconstrcution. Do not use MRL if not luma const uint8_t multi_ref_index = !is_chroma ? multi_ref_idx : 0; @@ -835,6 +834,9 @@ void uvg_intra_build_reference_any( left_stride = 1; } + const int log2_ratio = log2_width - log2_height; + int s = MAX(0, -log2_ratio); + int mrl_extension = (multi_ref_index << s) + 3; // Generate left reference. if (luma_px->x > 0) { // Get the number of reference pixels based on the PU coordinate within the LCU. @@ -855,7 +857,7 @@ void uvg_intra_build_reference_any( // Limit the number of available pixels based on block size and dimensions // of the picture. - px_available_left = MIN(px_available_left, cu_height * 2 + multi_ref_index); + px_available_left = MIN(px_available_left, cu_height + pu_loc->height); px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma); // Copy pixels from coded CUs. @@ -868,7 +870,7 @@ void uvg_intra_build_reference_any( // If first isp split, take samples as if it were normal square block int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? cu_height + height : height * 2); - for (int i = px_available_left; i < tmp_h + multi_ref_index * 2; ++i) { + for (int i = px_available_left; i < tmp_h + mrl_extension; ++i) { out_left_ref[i + 1 + multi_ref_index] = nearest_pixel; } } else { @@ -876,7 +878,7 @@ void uvg_intra_build_reference_any( uvg_pixel nearest_pixel = luma_px->y > 0 ? top_border[0] : dc_val; // If first isp split, take samples as if it were normal square block int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? 
cu_height + height : height * 2); - for (int i = 0; i < tmp_h + multi_ref_index; i++) { + for (int i = 0; i < tmp_h + mrl_extension; i++) { // Reserve space for top left reference out_left_ref[i + 1 + multi_ref_index] = nearest_pixel; } @@ -898,9 +900,22 @@ void uvg_intra_build_reference_any( else if (px.x == 0) { // LCU left border case uvg_pixel *top_left_corner = &extra_ref_lines[multi_ref_index * 128]; - for (int i = 0; i <= multi_ref_index; ++i) { - out_left_ref[i] = left_border[(i - 1 - multi_ref_index) * left_stride]; - out_top_ref[i] = top_left_corner[(128 * -i) + MAX_REF_LINE_IDX - 1 - multi_ref_index]; + switch (multi_ref_index) { + case 0: + out_left_ref[0] = left_border[(-1) * left_stride]; + out_top_ref[0] = top_left_corner[MAX_REF_LINE_IDX - 1]; + break; + case 1: + for (int i = 0; i <= 1; ++i) { + out_left_ref[i] = left_border[(i - 1 - 1) * left_stride]; + out_top_ref[i] = top_left_corner[(128 * -i) + MAX_REF_LINE_IDX - 1 - 1]; + } break; + case 2: + for (int i = 0; i <= 2; ++i) { + out_left_ref[i] = left_border[(i - 1 - 2) * left_stride]; + out_top_ref[i] = top_left_corner[(128 * -i) + MAX_REF_LINE_IDX - 1 - 2]; + } break; + default: break; } } else if (px.y == 0) { @@ -910,9 +925,23 @@ void uvg_intra_build_reference_any( } else { // Inner case - for (int i = 0; i <= multi_ref_index; ++i) { - out_left_ref[i] = left_border[(i - 1 - multi_ref_index) * left_stride]; - out_top_ref[i] = top_border[i - 1 - multi_ref_index]; + switch (multi_ref_index) { + case 0: + for (int i = 0; i <= 0; ++i) { + out_left_ref[i] = left_border[(i - 1 - 0) * left_stride]; + out_top_ref[i] = top_border[i - 1 - 0]; + } break; + case 1: + for (int i = 0; i <= 1; ++i) { + out_left_ref[i] = left_border[(i - 1 - 1) * left_stride]; + out_top_ref[i] = top_border[i - 1 - 1]; + } break; + case 2: + for (int i = 0; i <= 2; ++i) { + out_left_ref[i] = left_border[(i - 1 - 2) * left_stride]; + out_top_ref[i] = top_border[i - 1 - 2]; + } break; + default: break; } } } @@ -926,10 +955,22 
@@ void uvg_intra_build_reference_any( else if (px.x == 0) { // Picture left border case. Reference pixel cannot be taken from outside LCU border uvg_pixel nearest = out_left_ref[1 + multi_ref_index]; - for (int i = 0; i <= multi_ref_index; ++i) { - out_left_ref[i] = nearest; - out_top_ref[i] = nearest; + switch (multi_ref_index) { + case 2: + out_left_ref[2] = nearest; + out_top_ref[2] = nearest; + // Fall through + case 1: + out_left_ref[1] = nearest; + out_top_ref[1] = nearest; + // Fall through + case 0: + out_left_ref[0] = nearest; + out_top_ref[0] = nearest; + break; + default: break; } + } else { // Picture top border case. Multi ref will be 0. @@ -960,6 +1001,8 @@ void uvg_intra_build_reference_any( // Generate top reference. int px_available_top; + s = MAX(0, log2_ratio); + mrl_extension = (multi_ref_index << s) + 3; if (luma_px->y > 0) { // Get the number of reference pixels based on the PU coordinate within the LCU. if (isp_mode && !is_first_isp_block && !is_chroma) { @@ -978,7 +1021,7 @@ void uvg_intra_build_reference_any( // Limit the number of available pixels based on block size and dimensions // of the picture. - px_available_top = MIN(px_available_top, cu_width * 2 + multi_ref_index); + px_available_top = MIN(px_available_top, cu_width + pu_loc->width); px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma); // Copy all the pixels we can. @@ -990,7 +1033,7 @@ void uvg_intra_build_reference_any( // If first isp split, take samples as if it were normal square block int tmp_w = is_first_isp_block ? cu_width * 2 : (isp_mode ? cu_width + width : width * 2); - for (int i = px_available_top; i < tmp_w + multi_ref_index * 2; ++i) { + for (int i = px_available_top; i < tmp_w + mrl_extension; ++i) { out_top_ref[i + 1 + multi_ref_index] = nearest_pixel; } } else { @@ -999,7 +1042,7 @@ void uvg_intra_build_reference_any( // If first isp split, take samples as if it were normal square block int tmp_w = is_first_isp_block ? 
cu_width * 2 : (isp_mode ? cu_width + width : width * 2); - for (int i = 0; i < tmp_w + multi_ref_index * 2; i++) { + for (int i = 0; i < tmp_w + mrl_extension; i++) { out_top_ref[i + 1] = nearest_pixel; } } @@ -1110,9 +1153,22 @@ void uvg_intra_build_reference_inner( else if (px.x == 0) { // LCU left border case uvg_pixel* top_left_corner = &extra_ref_lines[multi_ref_index * 128]; - for (int i = 0; i <= multi_ref_index; ++i) { - out_left_ref[i] = left_border[(i - 1 - multi_ref_index) * left_stride]; - out_top_ref[i] = top_left_corner[(128 * -i) + MAX_REF_LINE_IDX - 1 - multi_ref_index]; + switch (multi_ref_index) { + case 0: + out_left_ref[0] = left_border[(-1) * left_stride]; + out_top_ref[0] = top_left_corner[MAX_REF_LINE_IDX - 1]; + break; + case 1: + for (int i = 0; i <= 1; ++i) { + out_left_ref[i] = left_border[(i - 1 - 1) * left_stride]; + out_top_ref[i] = top_left_corner[(128 * -i) + MAX_REF_LINE_IDX - 1 - 1]; + } break; + case 2: + for (int i = 0; i <= 2; ++i) { + out_left_ref[i] = left_border[(i - 1 - 2) * left_stride]; + out_top_ref[i] = top_left_corner[(128 * -i) + MAX_REF_LINE_IDX - 1 - 2]; + } break; + default: break; } } else if (px.y == 0) { @@ -1122,9 +1178,23 @@ void uvg_intra_build_reference_inner( } else { // Inner case - for (int i = 0; i <= multi_ref_index; ++i) { - out_left_ref[i] = left_border[(i - 1 - multi_ref_index) * left_stride]; - out_top_ref[i] = top_border[i - 1 - multi_ref_index]; + switch (multi_ref_index) { + case 0: + for (int i = 0; i <= 0; ++i) { + out_left_ref[i] = left_border[(i - 1 - 0) * left_stride]; + out_top_ref[i] = top_border[i - 1 - 0]; + } break; + case 1: + for (int i = 0; i <= 1; ++i) { + out_left_ref[i] = left_border[(i - 1 - 1) * left_stride]; + out_top_ref[i] = top_border[i - 1 - 1]; + } break; + case 2: + for (int i = 0; i <= 2; ++i) { + out_left_ref[i] = left_border[(i - 1 - 2) * left_stride]; + out_top_ref[i] = top_border[i - 1 - 2]; + } break; + default: break; } } } @@ -1166,14 +1236,15 @@ void 
uvg_intra_build_reference_inner( // Limit the number of available pixels based on block size and dimensions // of the picture. - px_available_left = MIN(px_available_left, cu_height * 2); + px_available_left = MIN(px_available_left, cu_height + pu_loc->height); + // if (is_first_isp_block && isp_mode == ISP_MODE_HOR && px_available_left == cu_height * 2) px_available_left -= (pu_loc->height); px_available_left = MIN(px_available_left, (pic_px->y - luma_px->y) >> is_chroma); // Copy pixels from coded CUs. int i = multi_ref_index; // Offset by multi_ref_index // Do different loop for heights smaller than 4 (possible for some ISP splits) - if (px.y % 4 != 0 || px_available_left < 4) { + if (px.y % 4 != 0 || px_available_left % 4 != 0) { do { out_left_ref[i + 1] = left_border[(i + 0 - multi_ref_index) * left_stride]; i += 1; @@ -1193,20 +1264,17 @@ void uvg_intra_build_reference_inner( uvg_pixel nearest_pixel = out_left_ref[i]; // If first isp split, take samples as if it were normal square block + const int log2_ratio = log2_width - log2_height; + int s = MAX(0, -log2_ratio); + int mrl_extension = (multi_ref_index << s) + 3; int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? cu_height + height : height * 2); - for (; i < tmp_h; i += 4) { + for (; i < tmp_h + mrl_extension; i += 4) { out_left_ref[i + 1] = nearest_pixel; out_left_ref[i + 2] = nearest_pixel; out_left_ref[i + 3] = nearest_pixel; out_left_ref[i + 4] = nearest_pixel; } - // Extend for MRL - if (multi_ref_index) { - for (; i < height * 2 + multi_ref_index; ++i) { - out_left_ref[i + 1] = nearest_pixel; - } - } // Generate top reference. @@ -1228,7 +1296,8 @@ void uvg_intra_build_reference_inner( // Limit the number of available pixels based on block size and dimensions // of the picture. 
- px_available_top = MIN(px_available_top, cu_width * 2 + multi_ref_index); + px_available_top = MIN(px_available_top, cu_width + pu_loc->width); + //if (is_first_isp_block && isp_mode == ISP_MODE_VER && px_available_top == cu_width * 2) px_available_top -= MIN(pu_loc->width, 4); px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma); if (entropy_sync && px.y == 0) px_available_top = MIN(px_available_top, ((LCU_WIDTH >> is_chroma) - px.x) -1); @@ -1236,16 +1305,18 @@ void uvg_intra_build_reference_inner( // Copy all the pixels we can. i = 0; do { - memcpy(out_top_ref + i + 1 + multi_ref_index, top_border + i, 4 * sizeof(uvg_pixel)); - i += 4; + memcpy(out_top_ref + i + 1 + multi_ref_index, top_border + i, sizeof(uvg_pixel)); + i += 1; } while (i < px_available_top); // Extend the last pixel for the rest of the reference values. nearest_pixel = out_top_ref[i + multi_ref_index]; // If first isp split, take samples as if it were normal square block + s = MAX(0, -log2_ratio); + mrl_extension = (multi_ref_index << s) + 3; int tmp_w = is_first_isp_block ? cu_width * 2 : (isp_mode ? cu_width + width : width * 2); - for (; i < tmp_w + (multi_ref_index * 2); i += 4) { + for (; i < tmp_w + mrl_extension; i += 4) { out_top_ref[i + 1 + multi_ref_index] = nearest_pixel; out_top_ref[i + 2 + multi_ref_index] = nearest_pixel; out_top_ref[i + 3 + multi_ref_index] = nearest_pixel; diff --git a/src/strategies/generic/intra-generic.c b/src/strategies/generic/intra-generic.c index 54a67ef6..31baf99a 100644 --- a/src/strategies/generic/intra-generic.c +++ b/src/strategies/generic/intra-generic.c @@ -144,9 +144,6 @@ static void uvg_angular_pred_generic( const uvg_pixel *ref_side; uvg_pixel* work = width == height || vertical_mode ? dst : temp_dst; - const int top_ref_length = isp_mode == ISP_MODE_VER ? width + cu_dim : width << 1; - const int left_ref_length = isp_mode == ISP_MODE_HOR ? 
height + cu_dim : height << 1; - // Set ref_main and ref_side such that, when indexed with 0, they point to // index 0 in block coordinates. if (sample_disp < 0) { @@ -160,28 +157,9 @@ static void uvg_angular_pred_generic( for (int i = -size_side; i <= -1; i++) { ref_main[i] = ref_side[MIN((-i * modedisp2invsampledisp[abs(mode_disp)] + 256) >> 9, size_side)]; } - } - else { - memcpy(&temp_above[0], &in_ref_above[0], (top_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel)); - memcpy(&temp_left[0], &in_ref_left[0], (left_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel)); - - ref_main = vertical_mode ? temp_above : temp_left; - ref_side = vertical_mode ? temp_left : temp_above; - - const int log2_ratio = log2_width - log2_height; - const int s = MAX(0, vertical_mode ? log2_ratio : -log2_ratio); - const int max_index = (multi_ref_index << s) + 2; - int ref_length; - if (isp_mode) { - ref_length = vertical_mode ? top_ref_length : left_ref_length; - } - else { - ref_length = vertical_mode ? width << 1 : height << 1; - } - const uvg_pixel val = ref_main[ref_length + multi_ref_index]; - for (int j = 1; j <= max_index; j++) { - ref_main[ref_length + multi_ref_index + j] = val; - } + } else { + ref_main = (uvg_pixel*)(vertical_mode ? in_ref_above : in_ref_left); + ref_side = vertical_mode ? 
in_ref_left : in_ref_above; } From a1413a4ef7d0c76bce69142e5aabff5ee9e4e05e Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 5 Sep 2024 09:28:57 +0300 Subject: [PATCH 4/5] [intra] Fixes --- src/intra.c | 2 +- src/strategies/avx2/intra-avx2.c | 22 ++-------------------- 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/src/intra.c b/src/intra.c index 87690070..9f297158 100644 --- a/src/intra.c +++ b/src/intra.c @@ -1300,7 +1300,7 @@ void uvg_intra_build_reference_inner( //if (is_first_isp_block && isp_mode == ISP_MODE_VER && px_available_top == cu_width * 2) px_available_top -= MIN(pu_loc->width, 4); px_available_top = MIN(px_available_top, (pic_px->x - luma_px->x) >> is_chroma); - if (entropy_sync && px.y == 0) px_available_top = MIN(px_available_top, ((LCU_WIDTH >> is_chroma) - px.x) -1); + if (entropy_sync && px.y == 0) px_available_top = MIN(px_available_top, ((LCU_WIDTH >> is_chroma) - px.x)); // Copy all the pixels we can. i = 0; diff --git a/src/strategies/avx2/intra-avx2.c b/src/strategies/avx2/intra-avx2.c index 0db1caf0..f68122f6 100644 --- a/src/strategies/avx2/intra-avx2.c +++ b/src/strategies/avx2/intra-avx2.c @@ -3330,26 +3330,8 @@ static void uvg_angular_pred_avx2( } } else { - memcpy(&temp_main[0], &in_ref_above[0], (top_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel)); - memcpy(&temp_side[0], &in_ref_left[0], (left_ref_length + 1 + multi_ref_index) * sizeof(uvg_pixel)); - - ref_main = vertical_mode ? temp_main : temp_side; - ref_side = vertical_mode ? temp_side : temp_main; - - const int log2_ratio = log2_width - log2_height; - const int s = MAX(0, vertical_mode ? log2_ratio : -log2_ratio); - const int max_index = (multi_ref_index << s) + 2; - int ref_length; - if (isp_mode) { - ref_length = vertical_mode ? top_ref_length : left_ref_length; - } - else { - ref_length = vertical_mode ? 
width << 1 : height << 1; - } - const uvg_pixel val = ref_main[ref_length + multi_ref_index]; - for (int j = 1; j <= max_index; j++) { - ref_main[ref_length + multi_ref_index + j] = val; - } + ref_main = (uvg_pixel*)(vertical_mode ? in_ref_above : in_ref_left); + ref_side = vertical_mode ? in_ref_left : in_ref_above; } // compensate for line offset in reference line buffers From 3c5493543c02f3f245d85efc390ba9c61bd70e2a Mon Sep 17 00:00:00 2001 From: Joose Sainio Date: Thu, 5 Sep 2024 14:40:42 +0300 Subject: [PATCH 5/5] [intra] Extend the reference inside the reference building --- src/intra.c | 53 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/src/intra.c b/src/intra.c index 9f297158..bc76f838 100644 --- a/src/intra.c +++ b/src/intra.c @@ -198,8 +198,8 @@ static void intra_filter_reference( refs->filtered_initialized = true; } - const int_fast8_t ref_width = 2 * (1 << log2_width) + 1; - const int_fast8_t ref_height = 2 * (1 << log2_height) + 1; + const int_fast16_t ref_width = 2 * (1 << (log2_width)) + 1; + const int_fast16_t ref_height = 2 * (1 << (log2_height)) + 1; uvg_intra_ref *ref = &refs->ref; uvg_intra_ref *filtered_ref = &refs->filtered_ref; @@ -208,7 +208,7 @@ static void intra_filter_reference( filtered_ref->top[0] = filtered_ref->left[0]; // Top to bottom - for (int_fast8_t y = 1; y < ref_height - 1; ++y) { + for (int_fast16_t y = 1; y < ref_height - 1; ++y) { uvg_pixel *p = &ref->left[y]; filtered_ref->left[y] = (p[-1] + 2 * p[0] + p[1] + 2) >> 2; } @@ -216,7 +216,7 @@ static void intra_filter_reference( filtered_ref->left[ref_height - 1] = ref->left[ref_height - 1]; // Left to right - for (int_fast8_t x = 1; x < ref_width - 1; ++x) { + for (int_fast16_t x = 1; x < ref_width - 1; ++x) { uvg_pixel *p = &ref->top[x]; filtered_ref->top[x] = (p[-1] + 2 * p[0] + p[1] + 2) >> 2; } @@ -712,7 +712,16 @@ static void intra_predict_regular( } if (used_ref == &refs->filtered_ref && 
!refs->filtered_initialized) { - intra_filter_reference(log2_width, log2_height, refs); + int temp_log2_width = log2_width; + int temp_log2_height = log2_height; + if (color == COLOR_Y && isp_mode == ISP_MODE_NO_ISP) { + temp_log2_width = cur_cu->log2_width; + temp_log2_height = cur_cu->log2_height; + } else if (color != COLOR_Y) { + temp_log2_width = cur_cu->log2_chroma_width; + temp_log2_height = cur_cu->log2_chroma_height; + } + intra_filter_reference(temp_log2_width, temp_log2_height, refs); } if (mode == 0) { @@ -797,7 +806,7 @@ void uvg_intra_build_reference_any( // Init pointers to LCUs reconstruction buffers, such that index 0 refers to block coordinate 0. const uvg_pixel *left_ref; bool extra_ref = false; - // On the left LCU edge, if left neighboring LCU is available, + // On the left LCU edge, if left neighboring LCU is available, // left_ref needs to point to correct extra reference line if MRL is used. if (luma_px->x > 0 && lcu_px.x == 0 && multi_ref_index != 0) { left_ref = &extra_ref_lines[multi_ref_index * 128]; @@ -836,7 +845,7 @@ void uvg_intra_build_reference_any( const int log2_ratio = log2_width - log2_height; int s = MAX(0, -log2_ratio); - int mrl_extension = (multi_ref_index << s) + 3; + int mrl_extension = (multi_ref_index << s) + (height << s) + 2; // Generate left reference. if (luma_px->x > 0) { // Get the number of reference pixels based on the PU coordinate within the LCU. @@ -870,7 +879,8 @@ void uvg_intra_build_reference_any( // If first isp split, take samples as if it were normal square block int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? 
cu_height + height : height * 2); - for (int i = px_available_left; i < tmp_h + mrl_extension; ++i) { + int total_height = MIN(tmp_h + mrl_extension, INTRA_REF_LENGTH); + for (int i = px_available_left; i < total_height; ++i) { out_left_ref[i + 1 + multi_ref_index] = nearest_pixel; } } else { @@ -878,7 +888,8 @@ void uvg_intra_build_reference_any( uvg_pixel nearest_pixel = luma_px->y > 0 ? top_border[0] : dc_val; // If first isp split, take samples as if it were normal square block int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? cu_height + height : height * 2); - for (int i = 0; i < tmp_h + mrl_extension; i++) { + int total_height = MIN(tmp_h + mrl_extension, INTRA_REF_LENGTH); + for (int i = 0; i < total_height; i++) { // Reserve space for top left reference out_left_ref[i + 1 + multi_ref_index] = nearest_pixel; } @@ -901,11 +912,11 @@ void uvg_intra_build_reference_any( // LCU left border case uvg_pixel *top_left_corner = &extra_ref_lines[multi_ref_index * 128]; switch (multi_ref_index) { - case 0: + case 0: out_left_ref[0] = left_border[(-1) * left_stride]; out_top_ref[0] = top_left_corner[MAX_REF_LINE_IDX - 1]; break; - case 1: + case 1: for (int i = 0; i <= 1; ++i) { out_left_ref[i] = left_border[(i - 1 - 1) * left_stride]; out_top_ref[i] = top_left_corner[(128 * -i) + MAX_REF_LINE_IDX - 1 - 1]; @@ -1002,7 +1013,7 @@ void uvg_intra_build_reference_any( // Generate top reference. int px_available_top; s = MAX(0, log2_ratio); - mrl_extension = (multi_ref_index << s) + 3; + mrl_extension = (multi_ref_index << s) + (width << s) + 2; if (luma_px->y > 0) { // Get the number of reference pixels based on the PU coordinate within the LCU. if (isp_mode && !is_first_isp_block && !is_chroma) { @@ -1018,7 +1029,7 @@ void uvg_intra_build_reference_any( const int num_cus = uvg_count_available_edge_cus(cu_loc, lcu, false); px_available_top = !is_chroma ? 
num_cus * 4 : num_cus * 2; } - + // Limit the number of available pixels based on block size and dimensions // of the picture. px_available_top = MIN(px_available_top, cu_width + pu_loc->width); @@ -1033,7 +1044,8 @@ void uvg_intra_build_reference_any( // If first isp split, take samples as if it were normal square block int tmp_w = is_first_isp_block ? cu_width * 2 : (isp_mode ? cu_width + width : width * 2); - for (int i = px_available_top; i < tmp_w + mrl_extension; ++i) { + int total_width = MIN(tmp_w + mrl_extension, INTRA_REF_LENGTH); + for (int i = px_available_top; i < total_width; ++i) { out_top_ref[i + 1 + multi_ref_index] = nearest_pixel; } } else { @@ -1042,7 +1054,8 @@ void uvg_intra_build_reference_any( // If first isp split, take samples as if it were normal square block int tmp_w = is_first_isp_block ? cu_width * 2 : (isp_mode ? cu_width + width : width * 2); - for (int i = 0; i < tmp_w + mrl_extension; i++) { + int total_width = MIN(tmp_w + mrl_extension, INTRA_REF_LENGTH); + for (int i = 0; i < total_width; i++) { out_top_ref[i + 1] = nearest_pixel; } } @@ -1266,9 +1279,10 @@ void uvg_intra_build_reference_inner( // If first isp split, take samples as if it were normal square block const int log2_ratio = log2_width - log2_height; int s = MAX(0, -log2_ratio); - int mrl_extension = (multi_ref_index << s) + 3; + int mrl_extension = ((multi_ref_index + 0) << s) + (height << s) + 2; int tmp_h = is_first_isp_block ? cu_height * 2 : (isp_mode ? 
cu_height + height : height * 2); - for (; i < tmp_h + mrl_extension; i += 4) { + int total_height = MIN(tmp_h + mrl_extension, INTRA_REF_LENGTH - 2); + for (; i < total_height; i += 4) { out_left_ref[i + 1] = nearest_pixel; out_left_ref[i + 2] = nearest_pixel; out_left_ref[i + 3] = nearest_pixel; @@ -1314,9 +1328,10 @@ void uvg_intra_build_reference_inner( // If first isp split, take samples as if it were normal square block s = MAX(0, -log2_ratio); - mrl_extension = (multi_ref_index << s) + 3; + mrl_extension = ((multi_ref_index + 0) << s) + (width << s) + 2; int tmp_w = is_first_isp_block ? cu_width * 2 : (isp_mode ? cu_width + width : width * 2); - for (; i < tmp_w + mrl_extension; i += 4) { + int total_width = MIN(tmp_w+ mrl_extension, INTRA_REF_LENGTH - 2); + for (; i < total_width; i += 4) { out_top_ref[i + 1 + multi_ref_index] = nearest_pixel; out_top_ref[i + 2 + multi_ref_index] = nearest_pixel; out_top_ref[i + 3 + multi_ref_index] = nearest_pixel;