// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
$import math
$assert IN_PTRS in ["MULTI", "REUSE"]
$assert OUT_PTRS in ["MULTI", "SWITCH", "MOV", "DEC"]
$assert SIZE in [8, 16, 32]
$assert VECTOR_SIZE in [64, 128]
$TILE_SIZE = int(VECTOR_SIZE/SIZE)
$NUM_ITERS = int(math.log2(TILE_SIZE))
$SUFFIX = ''
$NUM_D_REGISTERS=int(VECTOR_SIZE/64)
$if VECTOR_SIZE == 128:
$ SUFFIX = 'q'

#include <arm_neon.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>

void xnn_x${SIZE}_transposec_ukernel__${TILE_SIZE}x${TILE_SIZE}_${IN_PTRS.lower()}_${OUT_PTRS.lower()}_zip_neon(
    const uint${SIZE}_t* input,
    uint${SIZE}_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height) XNN_OOB_READS
{
  assert(output_stride >= block_height * sizeof(uint${SIZE}_t));
  assert(input_stride >= block_width * sizeof(uint${SIZE}_t));

  const size_t tile_height = ${TILE_SIZE};
  const size_t tile_width = ${TILE_SIZE};
  const size_t tile_hbytes = tile_height * sizeof(uint${SIZE}_t);
  const size_t tile_wbytes = tile_width * sizeof(uint${SIZE}_t);
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  $if IN_PTRS == "MULTI":
    const size_t input_offset = tile_height * input_stride;
  $if OUT_PTRS in ["MOV", "DEC"]:
    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t) - tile_hbytes;
  $else:
    const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint${SIZE}_t);

  $if IN_PTRS == "MULTI":
    const uint${SIZE}_t* i0 = input;
    $for N in range(1, TILE_SIZE):
      const uint${SIZE}_t* i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
  $else:
    const uint${SIZE}_t* i0 = input;
  $if OUT_PTRS == "MULTI":
    uint${SIZE}_t* o0 = (uint${SIZE}_t*) output;
    $for N in range(1, TILE_SIZE):
      uint${SIZE}_t* o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N-1} + output_stride);
  $elif OUT_PTRS == "SWITCH":
    uint${SIZE}_t* o = (uint${SIZE}_t*) output;
  $else:
    uint${SIZE}_t* o = (uint${SIZE}_t*) ((uintptr_t) output - tile_hbytes);
  $if OUT_PTRS == "SWITCH":
    $if int(VECTOR_SIZE/SIZE) > 2:
      const size_t minus_output_stride = -output_stride;
  $elif OUT_PTRS != "MULTI":
    const size_t minus_output_stride = -output_stride;

  do {
    $if OUT_PTRS == "MULTI":
      if XNN_UNPREDICTABLE(block_width < 2) {
        o1 = o0;
      }
      $for N in range(2, TILE_SIZE, 2):
        if XNN_UNPREDICTABLE(block_width <= ${N}) {
          o${N} = o0;
        }
        if XNN_UNPREDICTABLE(block_width < ${N+2}) {
          o${N+1} = o0;
        }
    $elif OUT_PTRS in ["MOV", "DEC"]:
      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
      const size_t oN_stride = rem * output_stride;
      const size_t oN_offset = oN_stride + tile_hbytes;
    $else:
      const size_t rem = min(block_width - 1, ${TILE_SIZE-1});
      const size_t oN_stride = rem * output_stride;
    size_t bh = block_height;
    for (; bh >= ${TILE_SIZE}; bh -= ${TILE_SIZE}) {
      $for N in range(TILE_SIZE):
        $if IN_PTRS == "REUSE":
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i0); i0 = (uint${SIZE}_t*) ((uintptr_t) i0 + input_stride);
        $else:
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N}); i${N} = (uint${SIZE}_t*) ((uintptr_t) i${N} + input_offset);

      $for N in range(TILE_SIZE >> 1):
        const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS}_${N}, v${NUM_ITERS}_${N+(TILE_SIZE>>1)});

      $for M in range(1, NUM_ITERS):
        $for N in range(TILE_SIZE >> 1):
          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-M-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS-M}_${N>>1}.val[${N%2}], v${NUM_ITERS-M}_${(N>>1)+int(TILE_SIZE/4)}.val[${N%2}]);

      $if OUT_PTRS == "SWITCH":
        uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          $for N in reversed(range(2, TILE_SIZE)):
            case ${N}:
              vst1${SUFFIX}_u${SIZE}(oN, v0_${N>>1}.val[${N%2}]); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            vst1${SUFFIX}_u${SIZE}(oN, v0_0.val[1]);
          case 0:
            vst1${SUFFIX}_u${SIZE}(o, v0_0.val[0]); o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);
            break;
          default:
            XNN_UNREACHABLE;
        }
      $elif OUT_PTRS in ["MOV", "DEC"]:
        o = (uint${SIZE}_t*) ((uintptr_t) o + oN_offset);
        vst1${SUFFIX}_u${SIZE}(o, v0_${(TILE_SIZE-1)>>1}.val[1]);
        $if OUT_PTRS == "MOV":
          uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
        $for N in reversed(range(2, TILE_SIZE, 2)):
          if XNN_UNPREDICTABLE(block_width > ${N+1}) {
            $if OUT_PTRS == "MOV":
              o = oN;
            $else:
              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
          }
          vst1${SUFFIX}_u${SIZE}(o, v0_${N>>1}.val[0]);
          $if OUT_PTRS == "MOV":
            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
          if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
            $if OUT_PTRS == "MOV":
              o = oN;
            $else:
              o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
          }
          vst1${SUFFIX}_u${SIZE}(o, v0_${(N-1)>>1}.val[1]);
          $if OUT_PTRS == "MOV":
            oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          $if OUT_PTRS == "MOV":
            o = oN;
          $else:
            o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
        }
        vst1${SUFFIX}_u${SIZE}(o, v0_0.val[0]);
      $else:
        $for N in reversed(range(TILE_SIZE)):
          vst1${SUFFIX}_u${SIZE}(o${N}, v0_${N>>1}.val[${N%2}]); o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + tile_hbytes);
    }
    $if OUT_PTRS in ["MOV", "DEC"]:
      o = (uint${SIZE}_t*) ((uintptr_t) o + tile_hbytes);

    if (bh != 0) {
      $if IN_PTRS == "REUSE":
        const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_0 = vld1${SUFFIX}_u${SIZE}(i0);
        $for N in range(1, TILE_SIZE - 1, 2):
          const uint${SIZE}_t *i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
          if XNN_UNPREDICTABLE(bh < ${N+1}) {
            i${N} = i${N-1};
          }
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N});
          const uint${SIZE}_t *i${N+1} = (const uint${SIZE}_t*) ((uintptr_t) i${N} + input_stride);
          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
            i${N+1} = i${N};
          }
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N+1} = vld1${SUFFIX}_u${SIZE}(i${N+1});
      $else:
        const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_0 = vld1${SUFFIX}_u${SIZE}(i0);
        $for N in range(1, TILE_SIZE - 1, 2):
          if XNN_UNPREDICTABLE(bh < ${N+1}) {
            i${N} = i0;
          }
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N} = vld1${SUFFIX}_u${SIZE}(i${N});
          if XNN_UNPREDICTABLE(bh <= ${N+1}) {
            i${N+1} = i0;
          }
          const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${N+1} = vld1${SUFFIX}_u${SIZE}(i${N+1});
      const uint${SIZE}x${TILE_SIZE}_t v${NUM_ITERS}_${TILE_SIZE-1} = vmov${SUFFIX}_n_u${SIZE}(0);

      $for N in range(TILE_SIZE >> 1):
          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS}_${N}, v${NUM_ITERS}_${N+(TILE_SIZE>>1)});

      $for M in range(1, NUM_ITERS):
        $for N in range(TILE_SIZE >> 1):
          const uint${SIZE}x${TILE_SIZE}x2_t v${NUM_ITERS-M-1}_${N} = vzip${SUFFIX}_u${SIZE}(v${NUM_ITERS-M}_${N>>1}.val[${N%2}], v${NUM_ITERS-M}_${(N>>1)+int(TILE_SIZE/4)}.val[${N%2}]);

      $if VECTOR_SIZE == 128:
        $for N in range(TILE_SIZE):
          uint${SIZE}x${TILE_SIZE>>1}_t v${N}_low = vget_low_u${SIZE}(v0_${N>>1}.val[${N%2}]);

        if (bh & ${TILE_SIZE>>1}) {
          $if OUT_PTRS == "SWITCH":
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            switch (rem) {
              $for N in reversed(range(2, TILE_SIZE)):
                case ${N}:
                  vst1_u${SIZE}(oN, v${N}_low); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
              case 1:
                vst1_u${SIZE}(oN, v1_low);
              case 0:
                $if NUM_ITERS > 1:
                  vst1_u${SIZE}(o, v0_low); o += ${TILE_SIZE>>1};
                $else:
                  vst1_u${SIZE}(o, v0_low);
                break;
              default:
                XNN_UNREACHABLE;
            }
          $elif OUT_PTRS in ["MOV", "DEC"]:
            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            vst1_u${SIZE}(o, v${TILE_SIZE-1}_low);
            $if OUT_PTRS == "MOV":
              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            $for N in reversed(range(2, TILE_SIZE, 2)):
              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              vst1_u${SIZE}(o, v${N}_low);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              vst1_u${SIZE}(o, v${N-1}_low);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width > 1) {
              $if OUT_PTRS == "MOV":
                o = oN;
              $else:
                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            }
            $if NUM_ITERS > 1:
              vst1_u${SIZE}(o, v0_low); o += ${TILE_SIZE>>1};
            $else:
              vst1_u${SIZE}(o, v0_low);
          $else:
            $for N in reversed(range(TILE_SIZE)):
              $if NUM_ITERS>1:
                vst1_u${SIZE}(o${N}, v${N}_low); o${N} += ${TILE_SIZE>>1};
              $else:
                vst1_u${SIZE}(o${N}, v${N}_low);
          $if NUM_ITERS > 1:
            $for N in range(TILE_SIZE):
              v${N}_low = vget_high_u${SIZE}(v0_${N>>1}.val[${N%2}]);
        }
      $else:
        $for N in range(TILE_SIZE):
          uint${SIZE}x${TILE_SIZE}_t v${N}_low = v0_${(N>>1)}.val[${N%2}];

      $if NUM_ITERS>=NUM_D_REGISTERS:
        if (bh & ${TILE_SIZE>>NUM_D_REGISTERS}) {
          $if OUT_PTRS == "SWITCH":
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            switch (rem) {
              $for N in reversed(range(2, TILE_SIZE)):
                case ${N}:
                  $if SIZE == 32:
                    vst1_lane_u32(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
                  $else:
                    vst1_lane_u32((void*) oN, vreinterpret_u32_u${SIZE}(v${N}_low), 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
              case 1:
                $if SIZE == 32:
                  vst1_lane_u32(oN, v1_low, 0);
                $else:
                  vst1_lane_u32((void*) oN, vreinterpret_u32_u${SIZE}(v1_low), 0);
              case 0:
                $if SIZE == 32:
                  vst1_lane_u32(o, v0_low, 0);
                $else:
                  vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>NUM_D_REGISTERS};
                break;
              default:
                XNN_UNREACHABLE;
            }
          $elif OUT_PTRS in ["MOV", "DEC"]:
            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            $if SIZE == 32:
              vst1_lane_u32(o, v${TILE_SIZE-1}_low, 0);
            $else:
              vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${TILE_SIZE-1}_low), 0);
            $if OUT_PTRS == "MOV":
              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            $for N in reversed(range(2, TILE_SIZE, 2)):
              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              $if SIZE == 32:
                vst1_lane_u32(o, v${N}_low, 0);
              $else:
                vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${N}_low), 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              $if SIZE == 32:
                vst1_lane_u32(o, v${N-1}_low, 0);
              $else:
                vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v${N-1}_low), 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width > 1) {
              $if OUT_PTRS == "MOV":
                o = oN;
              $else:
                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            }
            $if SIZE == 32:
              vst1_lane_u32(o, v0_low, 0);
            $else:
              vst1_lane_u32((void*) o, vreinterpret_u32_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>NUM_D_REGISTERS};
          $else:
            $for N in reversed(range(TILE_SIZE)):
              $if SIZE == 32:
                vst1_lane_u32(o${N}, v${N}_low, 0);
              $else:
                vst1_lane_u32((void*) o${N}, vreinterpret_u32_u${SIZE}(v${N}_low), 0); o${N} += ${TILE_SIZE>>NUM_D_REGISTERS};
          $if NUM_ITERS > NUM_D_REGISTERS:
            $for N in range(TILE_SIZE):
              $if SIZE == 16:
                v${N}_low = vext_u16(v${N}_low, v${N}_low, 2);
              $else:
                v${N}_low = vext_u8(v${N}_low, v${N}_low, 4);
        }
      $if NUM_ITERS>NUM_D_REGISTERS:
        if (bh & ${TILE_SIZE>>(NUM_D_REGISTERS+1)}) {
          $if OUT_PTRS == "SWITCH":
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            switch (rem) {
              $for N in reversed(range(2, TILE_SIZE)):
                case ${N}:
                  $if SIZE == 16:
                    vst1_lane_u16(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
                  $else:
                    vst1_lane_u16((void*) oN, vreinterpret_u16_u${SIZE}(v${N}_low), 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
              case 1:
                $if SIZE == 16:
                  vst1_lane_u16(oN, v1_low, 0);
                $else:
                  vst1_lane_u16((void*) oN, vreinterpret_u16_u${SIZE}(v1_low), 0);
              case 0:
                $if SIZE == 16:
                  vst1_lane_u16(o, v0_low, 0);
                $else:
                  $if NUM_ITERS>(NUM_D_REGISTERS+1):
                    vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>(NUM_D_REGISTERS+1)};
                  $else:
                    vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0);
                break;
              default:
                XNN_UNREACHABLE;
            }
          $elif OUT_PTRS in ["MOV", "DEC"]:
            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            $if SIZE == 16:
              vst1_lane_u16(o, v${TILE_SIZE-1}_low, 0);
            $else:
              vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${TILE_SIZE-1}_low), 0);
            $if OUT_PTRS == "MOV":
              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            $for N in reversed(range(2, TILE_SIZE, 2)):
              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              $if SIZE == 16:
                vst1_lane_u16(o, v${N}_low, 0);
              $else:
                vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${N}_low), 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              $if SIZE == 16:
                vst1_lane_u16(o, v${N-1}_low, 0);
              $else:
                vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v${N-1}_low), 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width > 1) {
              $if OUT_PTRS == "MOV":
                o = oN;
              $else:
                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            }
            $if SIZE == 16:
              vst1_lane_u16(o, v0_low, 0);
            $else:
              vst1_lane_u16((void*) o, vreinterpret_u16_u${SIZE}(v0_low), 0); o += ${TILE_SIZE>>(NUM_D_REGISTERS+1)};
          $else:
            $for N in reversed(range(TILE_SIZE)):
              $if SIZE == 16:
                vst1_lane_u16(o${N}, v${N}_low, 0);
              $else:
                vst1_lane_u16((void*) o${N}, vreinterpret_u16_u${SIZE}(v${N}_low), 0); o${N} += ${TILE_SIZE>>(NUM_D_REGISTERS+1)};
          $if NUM_ITERS>(NUM_D_REGISTERS+1):
            $for N in range(TILE_SIZE):
              v${N}_low = vext_u8(v${N}_low, v${N}_low, 2);
        }
      $if SIZE == 8:
        if (bh & 1) {
          $if OUT_PTRS == "SWITCH":
            uint${SIZE}_t* oN = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            switch (rem) {
              $for N in reversed(range(2, TILE_SIZE)):
                case ${N}:
                  vst1_lane_u8(oN, v${N}_low, 0); oN = (uint${SIZE}_t*) ((uintptr_t) oN + minus_output_stride);
              case 1:
                vst1_lane_u8(oN, v1_low, 0);
              case 0:
                vst1_lane_u8(o, v0_low, 0);
                break;
              default:
                XNN_UNREACHABLE;
            }
          $elif OUT_PTRS in ["MOV", "DEC"]:
            o = (uint${SIZE}_t*) ((uintptr_t) o + oN_stride);
            vst1_lane_u8(o, v${TILE_SIZE-1}_low, 0);
            $if OUT_PTRS == "MOV":
              uint${SIZE}_t *oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            $for N in reversed(range(2, TILE_SIZE, 2)):
              if XNN_UNPREDICTABLE(block_width > ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              vst1_lane_u8(o, v${N}_low, 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              if XNN_UNPREDICTABLE(block_width >= ${N+1}) {
                $if OUT_PTRS == "MOV":
                  o = oN;
                $else:
                  o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
              }
              vst1_lane_u8(o, v${N-1}_low, 0);
              $if OUT_PTRS == "MOV":
                oN = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            if XNN_UNPREDICTABLE(block_width > 1) {
              $if OUT_PTRS == "MOV":
                o = oN;
              $else:
                o = (uint${SIZE}_t*) ((uintptr_t) o + minus_output_stride);
            }
            vst1_lane_u8(o, v0_low, 0);
          $else:
            $for N in reversed(range(TILE_SIZE)):
              vst1_lane_u8(o${N}, v${N}_low, 0);
        }
    }

    $if IN_PTRS == "MULTI":
      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
      $for N in range(1, TILE_SIZE):
        i${N} = (const uint${SIZE}_t*) ((uintptr_t) i${N-1} + input_stride);
    $else:
      i0 = (const uint${SIZE}_t*) ((uintptr_t) i0 + input_reset);
    $if OUT_PTRS == "MULTI":
      o0 = (uint${SIZE}_t*) ((uintptr_t) o0 + output_reset);
      $for N in range(1, TILE_SIZE):
        o${N} = (uint${SIZE}_t*) ((uintptr_t) o${N} + output_reset);
    $else:
      o = (uint${SIZE}_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}
