82 ocean_assert(imageTopLeft !=
nullptr && buffer !=
nullptr);
83 ocean_assert(imageStrideElements >= 1u * tPatchSize);
85 ocean_assert(factorRight <= 128u && factorBottom <= 128u);
87 const unsigned int factorLeft = 128u - factorRight;
88 const unsigned int factorTop = 128u - factorBottom;
90 constexpr unsigned int blocks15 = tPatchSize / 15u;
91 constexpr unsigned int remainingAfterBlocks15 = tPatchSize % 15u;
93 constexpr bool partialBlock15 = remainingAfterBlocks15 > 10u;
94 constexpr unsigned int remainingAfterPartialBlock15 = partialBlock15 ? 0u : remainingAfterBlocks15;
96 constexpr bool block7 = remainingAfterPartialBlock15 >= 7u;
97 constexpr unsigned int remainingAfterBlock7 = remainingAfterPartialBlock15 % 7u;
99 constexpr bool partialBlock7 = remainingAfterBlock7 >= 3u;
100 constexpr unsigned int remainingAfterPartialBlock7 = partialBlock7 ? 0u : remainingAfterBlock7;
102 constexpr unsigned int blocks1 = remainingAfterPartialBlock7;
105 const uint8x8_t factorsLeftRight_u_8x8 = vreinterpret_u8_u16(vdup_n_u16(uint16_t(factorLeft | (factorRight << 8u))));
107 const uint32x4_t factorsTop_u_32x4 = vdupq_n_u32(factorTop);
108 const uint32x4_t factorsBottom_u_32x4 = vdupq_n_u32(factorBottom);
110 for (
unsigned int y = 0u; y < tPatchSize; ++y)
112 for (
unsigned int x = 0u; x < blocks15; ++x)
114 const uint8x16_t top_u_8x16 = vld1q_u8(imageTopLeft);
115 const uint8x16_t bottom_u_8x16 = vld1q_u8(imageTopLeft + imageStrideElements);
118 const uint8x16_t topB_u_8x16 = vextq_u8(top_u_8x16, vreinterpretq_u8_u32(factorsTop_u_32x4), 1);
119 const uint8x16_t bottomB_u_8x16 = vextq_u8(bottom_u_8x16, vreinterpretq_u8_u32(factorsTop_u_32x4), 1);
123 const uint32x4_t topLowA_u_32x4 = vpaddlq_u16(vmull_u8(vget_low_u8(top_u_8x16), factorsLeftRight_u_8x8));
124 const uint32x4_t bottomLowA_u_32x4 = vpaddlq_u16(vmull_u8(vget_low_u8(bottom_u_8x16), factorsLeftRight_u_8x8));
127 const uint32x4_t topLowB_u_32x4 = vpaddlq_u16(vmull_u8(vget_low_u8(topB_u_8x16), factorsLeftRight_u_8x8));
128 const uint32x4_t bottomLowB_u_32x4 = vpaddlq_u16(vmull_u8(vget_low_u8(bottomB_u_8x16), factorsLeftRight_u_8x8));
131 const uint32x4_t topHighA_u_32x4 = vpaddlq_u16(vmull_u8(vget_high_u8(top_u_8x16), factorsLeftRight_u_8x8));
132 const uint32x4_t bottomHighA_u_32x4 = vpaddlq_u16(vmull_u8(vget_high_u8(bottom_u_8x16), factorsLeftRight_u_8x8));
135 const uint32x4_t topHighB_u_32x4 = vpaddlq_u16(vmull_u8(vget_high_u8(topB_u_8x16), factorsLeftRight_u_8x8));
136 const uint32x4_t bottomHighB_u_32x4 = vpaddlq_u16(vmull_u8(vget_high_u8(bottomB_u_8x16), factorsLeftRight_u_8x8));
140 const uint16x4_t resultLowA_u_16x4 = vrshrn_n_u32(vmlaq_u32(vmulq_u32(topLowA_u_32x4, factorsTop_u_32x4), bottomLowA_u_32x4, factorsBottom_u_32x4), 14);
141 const uint16x4_t resultHighA_u_16x4 = vrshrn_n_u32(vmlaq_u32(vmulq_u32(topHighA_u_32x4, factorsTop_u_32x4), bottomHighA_u_32x4, factorsBottom_u_32x4), 14);
143 const uint16x4_t resultLowB_u_16x4 = vrshrn_n_u32(vmlaq_u32(vmulq_u32(topLowB_u_32x4, factorsTop_u_32x4), bottomLowB_u_32x4, factorsBottom_u_32x4), 14);
144 const uint16x4_t resultHighB_u_16x4 = vrshrn_n_u32(vmlaq_u32(vmulq_u32(topHighB_u_32x4, factorsTop_u_32x4), bottomHighB_u_32x4, factorsBottom_u_32x4), 14);
146 const uint16x8_t resultA_u_16x8 = vcombine_u16(resultLowA_u_16x4, resultHighA_u_16x4);
147 const uint16x8_t resultB_u_16x8 = vcombine_u16(resultLowB_u_16x4, resultHighB_u_16x4);
150 const uint8x16_t result_u_8x16 = vreinterpretq_u8_u16(vsliq_n_u16(resultA_u_16x8, resultB_u_16x8, 8));
153 const bool isLastBlock = (y + 1u == tPatchSize) && (x + 1u == blocks15) && (!block7 && !partialBlock7 && blocks1 == 0u);
157 uint8_t tempBuffer[16];
158 vst1q_u8(tempBuffer, result_u_8x16);
160 memcpy(buffer, &tempBuffer, 15);
164 vst1q_u8(buffer, result_u_8x16);
171 if constexpr (partialBlock15)
173 ocean_assert(!block7 && !partialBlock7 && blocks1 == 0u);
175 uint8x16_t top_u_8x16;
176 uint8x16_t bottom_u_8x16;
178 if (y < tPatchSize - 1u)
180 top_u_8x16 = vld1q_u8(imageTopLeft);
181 bottom_u_8x16 = vld1q_u8(imageTopLeft + imageStrideElements);
185 constexpr unsigned int overlapping = 16u - (remainingAfterBlocks15 + 1u);
187 top_u_8x16 = vld1q_u8(imageTopLeft - overlapping);
188 bottom_u_8x16 = vld1q_u8(imageTopLeft + imageStrideElements - overlapping);
190 top_u_8x16 = vextq_u8(top_u_8x16, vreinterpretq_u8_u32(factorsTop_u_32x4), overlapping);
191 bottom_u_8x16 = vextq_u8(bottom_u_8x16, vreinterpretq_u8_u32(factorsTop_u_32x4), overlapping);
195 const uint8x16_t topB_u_8x16 = vextq_u8(top_u_8x16, top_u_8x16, 1);
196 const uint8x16_t bottomB_u_8x16 = vextq_u8(bottom_u_8x16, bottom_u_8x16, 1);
200 const uint32x4_t topLowA_u_32x4 = vpaddlq_u16(vmull_u8(vget_low_u8(top_u_8x16), factorsLeftRight_u_8x8));
201 const uint32x4_t bottomLowA_u_32x4 = vpaddlq_u16(vmull_u8(vget_low_u8(bottom_u_8x16), factorsLeftRight_u_8x8));
204 const uint32x4_t topLowB_u_32x4 = vpaddlq_u16(vmull_u8(vget_low_u8(topB_u_8x16), factorsLeftRight_u_8x8));
205 const uint32x4_t bottomLowB_u_32x4 = vpaddlq_u16(vmull_u8(vget_low_u8(bottomB_u_8x16), factorsLeftRight_u_8x8));
208 const uint32x4_t topHighA_u_32x4 = vpaddlq_u16(vmull_u8(vget_high_u8(top_u_8x16), factorsLeftRight_u_8x8));
209 const uint32x4_t bottomHighA_u_32x4 = vpaddlq_u16(vmull_u8(vget_high_u8(bottom_u_8x16), factorsLeftRight_u_8x8));
212 const uint32x4_t topHighB_u_32x4 = vpaddlq_u16(vmull_u8(vget_high_u8(topB_u_8x16), factorsLeftRight_u_8x8));
213 const uint32x4_t bottomHighB_u_32x4 = vpaddlq_u16(vmull_u8(vget_high_u8(bottomB_u_8x16), factorsLeftRight_u_8x8));
217 const uint16x4_t resultLowA_u_16x4 = vrshrn_n_u32(vmlaq_u32(vmulq_u32(topLowA_u_32x4, factorsTop_u_32x4), bottomLowA_u_32x4, factorsBottom_u_32x4), 14);
218 const uint16x4_t resultHighA_u_16x4 = vrshrn_n_u32(vmlaq_u32(vmulq_u32(topHighA_u_32x4, factorsTop_u_32x4), bottomHighA_u_32x4, factorsBottom_u_32x4), 14);
220 const uint16x4_t resultLowB_u_16x4 = vrshrn_n_u32(vmlaq_u32(vmulq_u32(topLowB_u_32x4, factorsTop_u_32x4), bottomLowB_u_32x4, factorsBottom_u_32x4), 14);
221 const uint16x4_t resultHighB_u_16x4 = vrshrn_n_u32(vmlaq_u32(vmulq_u32(topHighB_u_32x4, factorsTop_u_32x4), bottomHighB_u_32x4, factorsBottom_u_32x4), 14);
223 const uint16x8_t resultA_u_16x8 = vcombine_u16(resultLowA_u_16x4, resultHighA_u_16x4);
224 const uint16x8_t resultB_u_16x8 = vcombine_u16(resultLowB_u_16x4, resultHighB_u_16x4);
227 const uint8x16_t result_u_8x16 = vreinterpretq_u8_u16(vsliq_n_u16(resultA_u_16x8, resultB_u_16x8, 8));
229 ocean_assert(!block7 && !partialBlock7 && blocks1 == 0u);
230 const bool isLastBlock = y + 1u == tPatchSize;
234 uint8_t tempBuffer[16];
235 vst1q_u8(tempBuffer, result_u_8x16);
237 memcpy(buffer, &tempBuffer, remainingAfterBlocks15);
241 vst1q_u8(buffer, result_u_8x16);
244 imageTopLeft += remainingAfterBlocks15;
245 buffer += remainingAfterBlocks15;
248 if constexpr (block7)
250 const uint8x8_t top_u_8x8 = vld1_u8(imageTopLeft);
251 const uint8x8_t bottom_u_8x8 = vld1_u8(imageTopLeft + imageStrideElements);
254 const uint8x8_t topB_u_8x8 = vext_u8(top_u_8x8, factorsLeftRight_u_8x8, 1);
255 const uint8x8_t bottomB_u_8x8 = vext_u8(bottom_u_8x8, factorsLeftRight_u_8x8, 1);
259 const uint32x4_t topA_u_32x4 = vpaddlq_u16(vmull_u8(top_u_8x8, factorsLeftRight_u_8x8));
260 const uint32x4_t bottomA_u_32x4 = vpaddlq_u16(vmull_u8(bottom_u_8x8, factorsLeftRight_u_8x8));
263 const uint32x4_t topB_u_32x4 = vpaddlq_u16(vmull_u8(topB_u_8x8, factorsLeftRight_u_8x8));
264 const uint32x4_t bottomB_u_32x4 = vpaddlq_u16(vmull_u8(bottomB_u_8x8, factorsLeftRight_u_8x8));
268 const uint16x4_t resultA_u_16x4 = vrshrn_n_u32(vmlaq_u32(vmulq_u32(topA_u_32x4, factorsTop_u_32x4), bottomA_u_32x4, factorsBottom_u_32x4), 14);
269 const uint16x4_t resultB_u_16x4 = vrshrn_n_u32(vmlaq_u32(vmulq_u32(topB_u_32x4, factorsTop_u_32x4), bottomB_u_32x4, factorsBottom_u_32x4), 14);
273 const uint8x8_t result_u_8x8 = vreinterpret_u8_u16(vsli_n_u16(resultA_u_16x4, resultB_u_16x4, 8));
275 const bool isLastBlock = (y + 1u == tPatchSize) && (!partialBlock7 && blocks1 == 0u);
279 uint8_t tempBuffer[8];
280 vst1_u8(tempBuffer, result_u_8x8);
282 memcpy(buffer, &tempBuffer, 7);
286 vst1_u8(buffer, result_u_8x8);
293 if constexpr (partialBlock7)
295 ocean_assert(blocks1 == 0u);
298 uint8x8_t bottom_u_8x8;
300 if (y < tPatchSize - 1u)
302 top_u_8x8 = vld1_u8(imageTopLeft);
303 bottom_u_8x8 = vld1_u8(imageTopLeft + imageStrideElements);
307 constexpr unsigned int overlapping = 8u - (remainingAfterBlock7 + 1u);
309 top_u_8x8 = vld1_u8(imageTopLeft - overlapping);
310 bottom_u_8x8 = vld1_u8(imageTopLeft + imageStrideElements - overlapping);
312 top_u_8x8 = vext_u8(top_u_8x8, factorsLeftRight_u_8x8, overlapping);
313 bottom_u_8x8 = vext_u8(bottom_u_8x8, factorsLeftRight_u_8x8, overlapping);
318 const uint8x8_t topB_u_8x8 = vext_u8(top_u_8x8, factorsLeftRight_u_8x8, 1);
319 const uint8x8_t bottomB_u_8x8 = vext_u8(bottom_u_8x8, factorsLeftRight_u_8x8, 1);
323 const uint32x4_t topA_u_32x4 = vpaddlq_u16(vmull_u8(top_u_8x8, factorsLeftRight_u_8x8));
324 const uint32x4_t bottomA_u_32x4 = vpaddlq_u16(vmull_u8(bottom_u_8x8, factorsLeftRight_u_8x8));
327 const uint32x4_t topB_u_32x4 = vpaddlq_u16(vmull_u8(topB_u_8x8, factorsLeftRight_u_8x8));
328 const uint32x4_t bottomB_u_32x4 = vpaddlq_u16(vmull_u8(bottomB_u_8x8, factorsLeftRight_u_8x8));
332 const uint16x4_t resultA_u_16x4 = vrshrn_n_u32(vmlaq_u32(vmulq_u32(topA_u_32x4, factorsTop_u_32x4), bottomA_u_32x4, factorsBottom_u_32x4), 14);
333 const uint16x4_t resultB_u_16x4 = vrshrn_n_u32(vmlaq_u32(vmulq_u32(topB_u_32x4, factorsTop_u_32x4), bottomB_u_32x4, factorsBottom_u_32x4), 14);
337 const uint8x8_t result_u_8x8 = vreinterpret_u8_u16(vsli_n_u16(resultA_u_16x4, resultB_u_16x4, 8));
339 ocean_assert(blocks1 == 0u);
340 const bool isLastBlock = y + 1u == tPatchSize;
344 uint8_t tempBuffer[8];
345 vst1_u8(tempBuffer, result_u_8x8);
347 memcpy(buffer, &tempBuffer, remainingAfterBlock7);
351 vst1_u8(buffer, result_u_8x8);
354 imageTopLeft += remainingAfterBlock7;
355 buffer += remainingAfterBlock7;
358 if constexpr (blocks1 != 0u)
360 const unsigned int factorTopLeft = factorTop * factorLeft;
361 const unsigned int factorTopRight = factorTop * factorRight;
363 const unsigned int factorBottomLeft = factorBottom * factorLeft;
364 const unsigned int factorBottomRight = factorBottom * factorRight;
366 const uint8_t*
const imageBottomLeft = imageTopLeft + imageStrideElements;
368 for (
unsigned int n = 0u; n < blocks1; ++n)
370 buffer[n] = uint8_t((imageTopLeft[n] * factorTopLeft + imageTopLeft[1u + n] * factorTopRight + imageBottomLeft[n] * factorBottomLeft + imageBottomLeft[1u + n] * factorBottomRight + 8192u) / 16384u);
373 imageTopLeft += blocks1;
377 imageTopLeft += imageStrideElements - tPatchSize;