/* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include #include #include #include #include #include using exec_aten::Scalar; using exec_aten::ScalarType; using exec_aten::Tensor; using executorch::aten::RuntimeContext; using executorch::runtime::can_cast; using executorch::runtime::CppTypeToScalarType; using torch::executor::Error; namespace cadence { namespace impl { namespace HiFi { namespace native { namespace { template < bool can_cast, typename CTYPE_A, typename CTYPE_B, typename CTYPE_IN, typename CTYPE_OUT> struct MulInner; template < typename CTYPE_A, typename CTYPE_B, typename CTYPE_IN, typename CTYPE_OUT> struct MulInner { static void run(const Tensor& a, const Tensor& b, Tensor& out) { torch::executor::apply_binary_elementwise_fn( // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) [](const CTYPE_A val_a, const CTYPE_B val_b) { CTYPE_IN a_casted = static_cast(val_a); CTYPE_IN b_casted = static_cast(val_b); CTYPE_IN value = a_casted * b_casted; return static_cast(value); }, a, b, out); } }; struct ReportCanCastBug { static void run(const Tensor&, const Tensor&, Tensor&) { ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); } }; template < typename CTYPE_A, typename CTYPE_B, typename CTYPE_IN, typename CTYPE_OUT> struct MulInner : public ReportCanCastBug {}; } // namespace Tensor& mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { ET_KERNEL_CHECK( ctx, torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, InvalidArgument, out); ET_KERNEL_CHECK( ctx, executorch::runtime::tensor_is_realhb_type(out), InvalidArgument, out); ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = executorch::runtime::promoteTypes(a_type, b_type, /*half_to_float*/ true); ScalarType out_type = out.scalar_type(); constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); bool optimized = 1; /*find broadcast*/ const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); const bool broadcast = (a_is_broadcasted || b_is_broadcasted); int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); max_dim = out.dim() > max_dim ? out.dim() : max_dim; if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) optimized = 0; if ((a_dim == 0) || (b_dim == 0)) optimized = 0; if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) optimized = 0; if (optimized) { float* a_data = a.mutable_data_ptr(); float* b_data = b.mutable_data_ptr(); float* out_data = out.mutable_data_ptr(); if (broadcast == 1) { int out_shape[kNnlibMaxDim]; int inp1_shape[kNnlibMaxDim]; int inp2_shape[kNnlibMaxDim]; for (int i = 0; i < kNnlibMaxDim; i++) { out_shape[i] = 1; inp1_shape[i] = 1; inp2_shape[i] = 1; } int off_o = kNnlibMaxDim - out.dim(); int off_a = kNnlibMaxDim - a.dim(); int off_b = kNnlibMaxDim - b.dim(); for (int i = 0; i < out.dim(); i++) out_shape[i + off_o] = out.size(i); for (int i = 0; i < a.dim(); i++) inp1_shape[i + off_a] = a.size(i); for (int i = 0; i < b.dim(); i++) inp2_shape[i + off_b] = b.size(i); xa_nn_elm_mul_broadcast_4D_f32xf32_f32( out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); } else { xa_nn_elm_mul_f32xf32_f32(out_data, a_data, b_data, out.numel()); } return out; } ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() { ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() { using CTYPE_IN = typename torch::executor:: promote_types::type; ET_DCHECK(CppTypeToScalarType::value == common_type); ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() { MulInner< can_cast::value, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>::run(a, b, out); }); }); }); return out; } } // namespace native } // namespace HiFi } // namespace impl } // namespace cadence