/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#define PRECISION ${PRECISION}

#define BUF_T ${buffer_scalar_type(DTYPE)}
#define VEC4_T ${texel_type(DTYPE)}
#define SCALAR_T ${texel_component_type(DTYPE)}

#include "indexing_utils.h"

$if DTYPE == "half":
  #extension GL_EXT_shader_16bit_storage : require

layout(std430) buffer;

// Destination image; this shader only ever writes to it.
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out;
// Source data; this shader only ever reads from it.
layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
  BUF_T buffer_in[];
};
// Extents used for the bounds check and index mapping of the packed output.
layout(set = 0, binding = 2) uniform PRECISION restrict Sizes {
  ivec4 sizes;
};
// Extents of the source tensor, read in main() as x = W, y = H, z = C, w = N.
layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes {
  ivec4 original_sizes;
};

// Workgroup dimensions come from specialization constants 0, 1 and 2.
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// Specialization constant 3; falls back to C_DIM when not overridden.
layout(constant_id = 3) const int packed_dim = C_DIM;

/*
 * Weight prepacking for a 2D transpose convolution.
 *
 * Every invocation produces one output texel. For each of the texel's four
 * lanes it works out which element of the source buffer belongs there, loads
 * it, and leaves the lane at zero when the derived batch or channel index lies
 * outside the original tensor.
 *
 * The index mapping inverts a packing routine that was first written for CPU:
 * https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L120-L211
 */
void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);
  const ivec4 tidx = to_tensor_idx(pos, sizes, packed_dim);

  // Invocations that land outside the packed extents have nothing to write.
  if (!all(lessThan(tidx, sizes))) {
    return;
  }

  // One linear index per texel lane, as produced by the indexing_utils.h
  // helper (presumably the plain NCHW buffer offsets - confirm in the helper).
  const ivec4 lin = tidx_to_nchwi(tidx, sizes, packed_dim);

  // Extents of the original tensor.
  const int W = original_sizes.x;
  const int H = original_sizes.y;
  const int C = original_sizes.z;
  const int N = original_sizes.w;

  const int J = sizes.y / H;
  const int K = sizes.x / (4 * W);
  const int n_span = 4 * K;

  // Invert the CPU packing by peeling the linear index apart, one factor at a
  // time: first the n component, then W, then H.
  const ivec4 n = lin % n_span;
  const ivec4 past_n = lin / n_span;
  const ivec4 past_w = past_n / W;
  const ivec4 past_h = past_w / H;

  // Both spatial coordinates are mirrored (extent - 1 - coordinate).
  const ivec4 w = W - 1 - past_n % W;
  const ivec4 h = H - 1 - past_w % H;
  const ivec4 c = (past_h % J) * 4 + (past_h / J);

  // Offsets into the source buffer for the remapped (n, c, h, w) indices.
  const int hw_stride = H * W;
  const int chw_stride = C * hw_stride;
  const ivec4 src_i = n * chw_stride + c * hw_stride + h * W + w;

  // Lanes whose n or c index exceeds the original extents stay zero.
  VEC4_T texel = VEC4_T(0);
  for (int lane = 0; lane < 4; ++lane) {
    if (n[lane] < N && c[lane] < C) {
      texel[lane] = SCALAR_T(buffer_in[src_i[lane]]);
    }
  }

  imageStore(image_out, pos.xy, texel);
}