/* * Copyright (C) 2017 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "CompilationBuilder.h" #include "HalUtils.h" #include "Manager.h" #include "ModelBuilder.h" #include "NeuralNetworks.h" #include "TestNeuralNetworksWrapper.h" // Uncomment the following line to generate some debugging output that // may be useful when analyzing failures: // // #define VERBOSE VERBOSE // Uncomment the following line to generate some debugging output that // may be useful to determine test coverage for support of dynamic // temporaries (http://b/132458982): // // #define TRACE_DYNTEMP TRACE_DYNTEMP // We randomly generate tests (model + input data) at runtime, and verify // that we get the same results whether we do partitioned compilation/execution // or non partitioned compilation/execution. We perform a test as follows: // // (1) Randomly generate a model (graph and weights), randomly generate input // data, randomly assign inputs and outputs to CPU memory or to shared // memory. // // Randomly leaves dimensions unset for intermediate operands. // // (2) Randomly generate drivers based on the sample driver, each of which // executes models on the CPU. They differ according to which operations // they support. // // (3) Compile and execute without partitioning, saving off the results. 
//
// (4) Compile and execute with partitioning.
//
// (5) Verify that the saved results from (3) match the results from (4).
//
// For simplicity, all data (model inputs, model outputs, weights,
// temps) are of the same type: a 2-D TENSOR_FLOAT32 where the two
// dimensions are fixed throughout a particular test case (and
// randomly determined). This prevents us from having to find a
// mechanism to "resize" data (e.g., if ADD#a operates on data of size
// 2x2, ADD#b operates on data of size 3x3, and the outputs of ADD#a
// and ADD#b become inputs of ADD#c, do we need to insert one or more
// operations between (say) ADD#a and ADD#c to convert ADD#a's data
// from size 2x2 to size 3x3 in order to match ADD#b). In the few
// cases where an operand cannot be of this type, it is a constant
// (e.g., activation functions and RNN bias).
//
// Each operation we generate has a signature (described in more
// detail later). The randomly generated drivers decide which
// operations they can execute by checking operation signatures. Once
// we have built the model and know the set of signatures, we randomly
// assign each signature to a driver. No signature is supported by
// multiple drivers -- we're not testing the logic that the
// partitioning algorithm uses to select the best driver for an
// operation.
namespace android {

// Short aliases for the versioned neuralnetworks HAL namespaces.
namespace V1_0 = ::android::hardware::neuralnetworks::V1_0;
namespace V1_1 = ::android::hardware::neuralnetworks::V1_1;
namespace V1_2 = ::android::hardware::neuralnetworks::V1_2;
namespace V1_3 = ::android::hardware::neuralnetworks::V1_3;

// Unqualified names for the nn:: runtime, sample driver, and
// nn::test_wrapper:: types used by this test.
using CompilationBuilder = nn::CompilationBuilder;
using DeviceManager = nn::DeviceManager;
using Device = nn::Device;
using SharedDevice = nn::SharedDevice;
using ExecutionPlan = nn::ExecutionPlan;
using ExecutionStep = nn::ExecutionStep;
using HalCacheToken = nn::HalCacheToken;
using HalVersion = nn::HalVersion;
// The HAL-level model type handed to drivers is the 1.3 Model.
using HidlModel = V1_3::Model;
using LogicalStep = nn::LogicalStep;
using ModelBuilder = nn::ModelBuilder;
using Result = nn::test_wrapper::Result;
using SampleDriver = nn::sample_driver::SampleDriver;
using WrapperCompilation = nn::test_wrapper::Compilation;
using WrapperExecution = nn::test_wrapper::Execution;
using WrapperMemory = nn::test_wrapper::Memory;
using WrapperModel = nn::test_wrapper::Model;
using WrapperOperandType = nn::test_wrapper::OperandType;
using WrapperType = nn::test_wrapper::Type;

namespace {

/// Configure test size //////////////////////////////////////////////////////////

// We may exceed this in order to connect otherwise disjoint subgraphs.
static const unsigned kMaxNumOperations = 100;

// We build models to process 2-D square tensors up to this size in each dimension;
// note that the API promotes by-value weights larger than 128 to by-reference,
// so we want to ensure that we can pick both types that exceed and types that do
// not exceed this size.
static const unsigned kMaxProblemSize = 8;

// First seed for pseudorandom test generation.
static const unsigned kFirstSeed = 0;

// Number of test cases.
static const unsigned kNumTestCases = 225;

// Force all graph weights into a single pool (as we recommend to users)
// or allow them to be distributed across multiple pools (more stress
// on the partitioning algorithm and the rest of the runtime)?
// Forcing all graph weights into a single pool may be necessary to // prevent large graphs from running up against http://b/70302693 // "NNAPI overuses (?) fds". static const bool kAllWeightsInOnePool = false; ////////////////////////////////////////////////////////////////////////////////// // The signature of an operation consists of the operation type (e.g., // ADD) and the activation function (use -1 in the case of an // operation type for which the activation function is inapplicable). typedef std::pair Signature; // This class adds some simple utilities on top of WrapperModel. For example, // it provides access to certain features from ModelBuilder that are not exposed // by the base class (such as inputCount() and operation index). class TestModel : public WrapperModel { public: uint32_t addOperation(ANeuralNetworksOperationType type, const std::vector& inputs, const std::vector& outputs) { const uint32_t operationIndex = operationCount(); mOperations.push_back(outputs); WrapperModel::addOperation(type, inputs, outputs); return operationIndex; } uint32_t operationCount() const { return mOperations.size(); } uint32_t inputCount() const { return builder()->inputCount(); } uint32_t outputCount() const { return builder()->outputCount(); } const std::vector& getOperationOutputs(uint32_t index) const { CHECK(index < mOperations.size()); return mOperations[index]; } // All values are immediately copied into the model (we need to do // this ourselves in cases where the underlying NNAPI does not). 
void setOperandValue(uint32_t index, const std::vector& value) { const size_t length = value.size() * sizeof(float); if (length <= ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES) { WrapperModel::setOperandValue(index, value.data(), length); } else { mOperandValues.push_back(value); WrapperModel::setOperandValue(index, mOperandValues.back().data(), length); } } void setOperandValue(uint32_t index, const std::vector& value) { const size_t length = value.size() * sizeof(int32_t); CHECK(length <= ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES); WrapperModel::setOperandValue(index, value.data(), length); } void setOperandValue(uint32_t index, int32_t value) { CHECK(sizeof(value) <= ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES); WrapperModel::setOperandValue(index, &value, sizeof(value)); } private: const ModelBuilder* builder() const { return reinterpret_cast(getHandle()); } // Representation of operations: vector index is operation number, // vector value is operation's output operands. std::vector> mOperations; // Large operand values -- not immediately copied into the // WrapperModel, so remembered here instead. std::vector> mOperandValues; }; // This class adds some simple utilities on top of WrapperCompilation in order // to provide access to certain features from CompilationBuilder that are not // exposed by the base class. 
class TestCompilation : public WrapperCompilation { public: TestCompilation(const WrapperModel* model) : WrapperCompilation(model) {} TestCompilation(const WrapperModel* model, std::vector> devices) { ModelBuilder* m = reinterpret_cast(model->getHandle()); CompilationBuilder* c = nullptr; int result = m->createCompilation(&c, devices); EXPECT_EQ(result, 0); mCompilation = reinterpret_cast(c); } using WrapperCompilation::finish; Result setPartitioning(uint32_t partitioning) { return static_cast(builder()->forTest_setPartitioning(partitioning)); } const ExecutionPlan& getExecutionPlan() const { return builder()->forTest_getExecutionPlan(); } private: const CompilationBuilder* builder() const { return reinterpret_cast(getHandle()); } CompilationBuilder* builder() { return reinterpret_cast(getHandle()); } }; // This class is used to manage a collection of memory regions, // disjoint windows onto a set of Memory instances, each of which is // associated with a single shared memory region. Each region and // Memory instance is assigned a number. The usage pattern is as // follows: // - Call addMemory() and addRegion() as many times as needed to // declare (but not define) Memory instances and declare region // instances. // - Call layout() to define the Memory instances. // - Call getRegion() as many times as needed to get the details // of memory regions (such as address, or Memory/offset/length). // The Memory instances created by layout() are owned by the // TestMemories instance, and are destroyed when the TestMemories // instance is destroyed. 
class TestMemories { public: TestMemories() = default; TestMemories(const TestMemories&) = delete; TestMemories& operator=(const TestMemories&) = delete; unsigned addMemory() { CHECK(!mLayoutDone); mMemorySizes.push_back(0); return memoryCount() - 1; } unsigned memoryCount() const { return mMemorySizes.size(); } unsigned addRegion(unsigned memoryIndex, uint32_t length) { CHECK(!mLayoutDone); CHECK(memoryIndex < memoryCount()); uint32_t& memorySize = mMemorySizes[memoryIndex]; auto desc = std::make_tuple(memoryIndex, (uint32_t)memorySize, length); mRegions.push_back(desc); memorySize += length; return regionCount() - 1; } unsigned regionCount() const { return mRegions.size(); } void layout(); void* getRegion(unsigned regionIndex, const WrapperMemory** pMemory, uint32_t* pOffset, uint32_t* pLength) { CHECK(mLayoutDone); CHECK(regionIndex < regionCount()); const auto& regionDescriptor = mRegions[regionIndex]; const WrapperMemory* memory = &mMemories[std::get<0>(regionDescriptor)]; uint32_t offset = std::get<1>(regionDescriptor); uint32_t length = std::get<2>(regionDescriptor); uint8_t* buffer = reinterpret_cast(memory->get())->getPointer(); CHECK(buffer != nullptr); if (pMemory) *pMemory = memory; if (pOffset) *pOffset = offset; if (pLength) *pLength = length; return buffer + offset; } void* getRegion(unsigned regionIndex) { return getRegion(regionIndex, nullptr, nullptr, nullptr); } private: // Index is the memory index; value is the size of the memory // (aggregate size of all regions in the memory). std::vector mMemorySizes; // Index is the memory index. std::vector mMemories; // Index is the region index; tuple represents memory index, // region offset within memory, region length. std::vector> mRegions; // For validity checking. 
bool mLayoutDone = false; }; void TestMemories::layout() { CHECK(!mLayoutDone); for (uint32_t memorySize : mMemorySizes) { auto [n, ashmem] = nn::MemoryAshmem::create(memorySize); CHECK_EQ(n, ANEURALNETWORKS_NO_ERROR); CHECK(ashmem != nullptr); ANeuralNetworksMemory* memory = reinterpret_cast(ashmem.release()); mMemories.emplace_back(memory); } mLayoutDone = true; } class RandomPartitioningTest : public ::testing::TestWithParam { public: RandomPartitioningTest() : mRandNumEng(GetParam() /* seed */), mRandNumUnitDist(0.0, 1.0) {} static Signature getSignature(const HidlModel& model, const V1_3::Operation& operation); protected: static SharedDevice makeTestDriver(HalVersion version, const char* name, std::set signatures); static HalVersion getMinHalVersion(ANeuralNetworksOperationType type); static std::string to_string(HalVersion version); bool randBool() { return randUInt(2) == 1; } double randFrac() { // [0.0, 1.0) return mRandNumUnitDist(mRandNumEng); } unsigned randUInt(unsigned limit) { // [0, limit) return unsigned(randFrac() * limit); } // Represents an operation in which every input and output operand // is a TENSOR_FLOAT32 of dimensions [problemSize, problemSize] except: // - One input operand may be an activation function. // - Any number of input operands may be "special" in some other way // (and in this implementation, not produced by any other operation). // We require that: // - There be at least one input operand that is neither an // activation function nor "special". struct OperationPattern { HalVersion mMinHalVersion; int mOperationType; unsigned mNumInputs; unsigned mNumOutputs; int mActivationFunctionInputIndex; // <0 if none // Returns operand index, or <0 if input is normal (must not // be called for an activation function operand). 
Function // should have the following prototype: // // int makeSpecialInput(unsigned problemSize, TestModel* model, unsigned inputIndex); // int (RandomPartitioningTest::*mMakeSpecialInput)(unsigned, TestModel*, unsigned); }; static const OperationPattern kOperationPatterns[]; // See OperationPattern::mMakeSpecialInput. This function is used to // manufacture an ELU input operand that doesn't fit the general operand // pattern known to the graph generator infrastructure. int makeEluSpecialInput([[maybe_unused]] unsigned problemSize, TestModel* model, unsigned inputIndex) { if (inputIndex != 1) { return -1; } // input operand 1 is alpha, a scalar const WrapperOperandType alphaType(WrapperType::FLOAT32, {}); return int(model->addConstantOperand(&alphaType, 1.0f)); } // See OperationPattern::mMakeSpecialInput. This function is used to // manufacture an RNN input operand that doesn't fit the general operand // pattern known to the graph generator infrastructure. int makeRnnSpecialInput(unsigned problemSize, TestModel* model, unsigned inputIndex) { if (inputIndex != 3) { return -1; } // input operand 3 is bias, a 1-D tensor const WrapperOperandType biasType(WrapperType::TENSOR_FLOAT32, {problemSize}); const uint32_t operandIndex = model->addOperand(&biasType); std::vector biasValue(problemSize); std::generate(biasValue.begin(), biasValue.end(), [this] { return randFrac(); }); model->setOperandValue(operandIndex, biasValue); return int(operandIndex); } // See OperationPattern::mMakeSpecialInput. This function is used to // manufacture a TRANSPOSE input operand that doesn't fit the general operand // pattern known to the graph generator infrastructure. 
int makeTransposeSpecialInput(unsigned /* problemSize */, TestModel* model, unsigned inputIndex) { if (inputIndex != 1) { return -1; } // input operand 1 is perm, a 1-D tensor const WrapperOperandType permType(WrapperType::TENSOR_INT32, {2}); const uint32_t operandIndex = model->addOperand(&permType); std::vector permValue = {1, 0}; model->setOperandValue(operandIndex, permValue); return int(operandIndex); } #ifdef VERBOSE class ModelStats { public: ModelStats(const ModelBuilder* model) : mBuilder(model) {} ModelStats(const WrapperModel* model) : mBuilder(reinterpret_cast(model->getHandle())) {} friend std::ostream& operator<<(std::ostream& out, const ModelStats& stats) { const uint32_t operandCount = stats.mBuilder->operandCount(); const uint32_t inputCount = stats.mBuilder->inputCount(); const uint32_t outputCount = stats.mBuilder->outputCount(); out << "operationCount = " << stats.mBuilder->operationCount() << ", operandCount = " << operandCount << ", inputCount = " << inputCount << " (" << (double(inputCount) / operandCount) << ")" << ", outputCount = " << outputCount << " (" << (double(outputCount) / operandCount) << ")"; return out; } private: const ModelBuilder* mBuilder; }; template static void dump(T_iterator I, T_iterator E) { std::cout << "{"; for (; I != E; I++) { std::cout << " " << *I; } std::cout << " }" << std::endl; } #endif std::mt19937 mRandNumEng; private: std::uniform_real_distribution mRandNumUnitDist; }; const RandomPartitioningTest::OperationPattern RandomPartitioningTest::kOperationPatterns[] = { {HalVersion::V1_0, ANEURALNETWORKS_ADD, 3, 1, 2, nullptr}, {HalVersion::V1_0, ANEURALNETWORKS_LOGISTIC, 1, 1, -1, nullptr}, {HalVersion::V1_0, ANEURALNETWORKS_MUL, 3, 1, 2, nullptr}, {HalVersion::V1_0, ANEURALNETWORKS_RNN, 6, 2, 5, &RandomPartitioningTest::makeRnnSpecialInput}, {HalVersion::V1_0, ANEURALNETWORKS_TANH, 1, 1, -1, nullptr}, {HalVersion::V1_1, ANEURALNETWORKS_SUB, 3, 1, 2, nullptr}, {HalVersion::V1_1, ANEURALNETWORKS_TRANSPOSE, 2, 1, 
-1, &RandomPartitioningTest::makeTransposeSpecialInput}, {HalVersion::V1_2, ANEURALNETWORKS_MAXIMUM, 2, 1, -1, nullptr}, {HalVersion::V1_2, ANEURALNETWORKS_NEG, 1, 1, -1, nullptr}, {HalVersion::V1_2, ANEURALNETWORKS_SIN, 1, 1, -1, nullptr}, {HalVersion::V1_3, ANEURALNETWORKS_ELU, 2, 1, -1, &RandomPartitioningTest::makeEluSpecialInput}, {HalVersion::V1_3, ANEURALNETWORKS_HARD_SWISH, 1, 1, -1, nullptr}, }; HalVersion RandomPartitioningTest::getMinHalVersion(ANeuralNetworksOperationType type) { static const auto kOperationToVersion = [] { std::map result; for (const auto& pattern : kOperationPatterns) { result[pattern.mOperationType] = pattern.mMinHalVersion; } return result; }(); return kOperationToVersion.at(type); } Signature RandomPartitioningTest::getSignature(const HidlModel& model, const V1_3::Operation& operation) { static const auto kOperationToActivation = [] { std::map result; for (const auto& pattern : kOperationPatterns) { result[pattern.mOperationType] = pattern.mActivationFunctionInputIndex; } return result; }(); const ANeuralNetworksOperationType operationType = static_cast(operation.type); const int activationFunctionInputIndex = kOperationToActivation.at(operationType); if (activationFunctionInputIndex < 0) { return Signature(operationType, -1); } const V1_3::Operand& operand = model.main.operands[operation.inputs[activationFunctionInputIndex]]; CHECK(operand.lifetime == V1_3::OperandLifeTime::CONSTANT_COPY); CHECK(operand.type == V1_3::OperandType::INT32); int32_t value; memcpy(&value, &model.operandValues[operand.location.offset], operand.location.length); return Signature(operationType, value); } std::string RandomPartitioningTest::to_string(HalVersion version) { switch (version) { case HalVersion::V1_0: return "V1_0"; case HalVersion::V1_1: return "V1_1"; case HalVersion::V1_2: return "V1_2"; case HalVersion::V1_3: return "V1_3"; default: return "V_UNKNOWN"; } }; class TestDriver : public SampleDriver { public: // Behaves like SampleDriver, 
except that it only supports // operations with the specified signatures. TestDriver(const char* name, std::set signatures) : SampleDriver(name), mSignatures(std::move(signatures)) {} hardware::Return getCapabilities_1_3(getCapabilities_1_3_cb _hidl_cb) override { android::nn::initVLogMask(); const V1_0::PerformanceInfo kPerf = {.execTime = 0.75f, .powerUsage = 0.75f}; V1_3::Capabilities capabilities = { .relaxedFloat32toFloat16PerformanceScalar = kPerf, .relaxedFloat32toFloat16PerformanceTensor = kPerf, .operandPerformance = nn::nonExtensionOperandPerformance(kPerf), .ifPerformance = kPerf, .whilePerformance = kPerf}; _hidl_cb(V1_3::ErrorStatus::NONE, capabilities); return hardware::Void(); } hardware::Return getSupportedOperations_1_3(const HidlModel& model, getSupportedOperations_1_3_cb cb) override { if (nn::validateModel(model)) { const size_t count = model.main.operations.size(); std::vector supported(count); for (size_t i = 0; i < count; i++) { supported[i] = (mSignatures.count(RandomPartitioningTest::getSignature( model, model.main.operations[i])) != 0); } cb(V1_3::ErrorStatus::NONE, supported); } else { cb(V1_3::ErrorStatus::INVALID_ARGUMENT, {}); } return hardware::Void(); } hardware::Return prepareModel_1_3( const HidlModel& model, V1_1::ExecutionPreference preference, V1_3::Priority priority, const V1_3::OptionalTimePoint& deadline, const hardware::hidl_vec& modelCache, const hardware::hidl_vec& dataCache, const HalCacheToken& token, const sp& callback) override { // NOTE: We verify that all operations in the model are supported. 
V1_3::ErrorStatus outStatus = V1_3::ErrorStatus::INVALID_ARGUMENT; auto ret = getSupportedOperations_1_3( model, [&outStatus](V1_3::ErrorStatus inStatus, const hardware::hidl_vec& supportedOperations) { if (inStatus == V1_3::ErrorStatus::NONE) { if (std::all_of(supportedOperations.begin(), supportedOperations.end(), [](bool v) { return v; })) { outStatus = V1_3::ErrorStatus::NONE; } } }); if (ret.isOk() && (outStatus == V1_3::ErrorStatus::NONE)) { return SampleDriver::prepareModel_1_3(model, preference, priority, deadline, modelCache, dataCache, token, callback); } else { callback->notify_1_3(V1_3::ErrorStatus::INVALID_ARGUMENT, nullptr); return V1_3::ErrorStatus::INVALID_ARGUMENT; } } private: const std::set mSignatures; }; SharedDevice RandomPartitioningTest::makeTestDriver(HalVersion version, const char* name, std::set signatures) { switch (version) { case HalVersion::V1_0: return V1_0::utils::Device::create(name, new TestDriver(name, std::move(signatures))) .value(); case HalVersion::V1_1: return V1_1::utils::Device::create(name, new TestDriver(name, std::move(signatures))) .value(); case HalVersion::V1_2: return V1_2::utils::Device::create(name, new TestDriver(name, std::move(signatures))) .value(); case HalVersion::V1_3: return V1_3::utils::Device::create(name, new TestDriver(name, std::move(signatures))) .value(); default: ADD_FAILURE() << "Unexpected HalVersion " << static_cast(version); return nullptr; } } INSTANTIATE_TEST_SUITE_P(Seed, RandomPartitioningTest, ::testing::Range(kFirstSeed, kFirstSeed + kNumTestCases)); TEST_P(RandomPartitioningTest, Test) { LOG(INFO) << "RandomPartitioningTest: GetParam() = " << GetParam(); #ifdef VERBOSE std::cout << std::setprecision(2) << std::fixed << std::setw(4); #endif const unsigned problemSize = 1 + randUInt(kMaxProblemSize); const WrapperOperandType problemType(WrapperType::TENSOR_FLOAT32, {problemSize, problemSize}); const WrapperOperandType unknownDimensionsTypes[] = { {WrapperType::TENSOR_FLOAT32, {}}, 
{WrapperType::TENSOR_FLOAT32, {0, 0}}, {WrapperType::TENSOR_FLOAT32, {0, problemSize}}, {WrapperType::TENSOR_FLOAT32, {problemSize, 0}}, }; const unsigned kUnknownDimensionsTypesCount = sizeof(unknownDimensionsTypes) / sizeof(unknownDimensionsTypes[0]); static const WrapperOperandType activationFunctionType(WrapperType::INT32, {}); const unsigned numOperations = 2 + randUInt(kMaxNumOperations - 1); const bool allowDeadOperations = (randFrac() < 0.2); const bool allowUnknownDimensions = (randFrac() < 0.25); // TODO: The current algorithm builds the graph in a forward // direction (i.e., later-generated operations consume outputs // from earlier-generated operations). In order to get more // variation in graph topology, perhaps we should also create an // algorithm to build the graph in a backward direction (i.e., // later-generated operations produce outputs to be consumed by // earlier-generated operations). [[maybe_unused]] const bool buildForward = randBool(); // TODO: Add a form of forced connectivity that operates by // joining disjoint subgraphs rather than by forcing a root. const bool forceCommonRoot = (randFrac() < 0.75); auto computeMode = WrapperExecution::getComputeMode(); // We check randFrac() independent of compute mode, because we don't want // the random number sequence to change depending on compute mode: Compute // mode should only affect how we perform the inference, not how we build the // Model, the Compilation, or the Execution. if (randFrac() < 0.5 && computeMode == WrapperExecution::ComputeMode::ASYNC) { computeMode = WrapperExecution::ComputeMode::FENCED; } TestModel model; std::vector modelInputs; std::vector modelOutputs; std::set operandsWithUnknownDimensions; // Each region in weights is a problem-sized 2-D TENSOR_FLOAT32. TestMemories weights; // Keep track of all normal (i.e., not activation function and not // "special") operands that are values (from setOperandValue*()). 
// .first: operand index // .second: if the operand is already defined (via setOperandValue*()) then ~0U; // otherwise, the operand has yet to be defined, and this is the corresponding // region index in "weights" std::vector> valueOperands; // An operand is "dead" if it is not consumed by another operation // and is not a model output. Key is operand index; value is // operation index. std::map deadOperands; // An operation is "dead" if all of its outputs are dead. std::set deadOperations; // Collect the signatures of operations in this model. std::set signatures; // For reporting purposes, keep track of the number of root // operations (those that do not consume results produced by other // operations). unsigned rootOperationCount = 0; // Track whether we added operands with unknown dimensions. In this case, // partitioned compilation will fail if such an operand is read in a // different partition than it is written, and the partition that does the // writing is scheduled on a pre-HAL 1.2 (pre-Android Q) device. bool hasUnknownDimensions = false; // Generate operations. for (unsigned i = 0; i < numOperations; i++) { const unsigned operationPatternIndex = randUInt(std::size(kOperationPatterns)); const auto& operationPattern = kOperationPatterns[operationPatternIndex]; // INPUTS ////////////////////////////////////////////////////////////////////////////////// std::vector operationInputs(operationPattern.mNumInputs, ~0U); // First, process activation function and special inputs, and // keep track of which inputs remain. 
std::vector normalOperationInputIndexes; int32_t activationFunction = -1; for (unsigned operationInputIndex = 0; operationInputIndex < operationPattern.mNumInputs; operationInputIndex++) { if (int(operationInputIndex) == operationPattern.mActivationFunctionInputIndex) { const uint32_t operandIndex = model.addOperand(&activationFunctionType); activationFunction = randUInt(4); if (activationFunction == ANEURALNETWORKS_FUSED_RELU1) { // workaround for http://b/69011131 activationFunction = ANEURALNETWORKS_FUSED_NONE; } model.setOperandValue(operandIndex, activationFunction); operationInputs[operationInputIndex] = operandIndex; continue; } if (operationPattern.mMakeSpecialInput != nullptr) { const int operandIndex = (this->*(operationPattern.mMakeSpecialInput))( problemSize, &model, operationInputIndex); if (operandIndex >= 0) { operationInputs[operationInputIndex] = operandIndex; continue; } } normalOperationInputIndexes.push_back(operationInputIndex); } CHECK(!normalOperationInputIndexes.empty()); signatures.insert(Signature(operationPattern.mOperationType, activationFunction)); // A (normal) operation input can be one of: // - a new or existing model input // - an output of an existing operation // - an OperandValue // - an OperandValueFromMemory // Some guidelines: // - We generally don't want all of an operation's inputs to be values (constants) const unsigned normalOperationInputCount = normalOperationInputIndexes.size(); // How many of this operation's inputs are constants? unsigned normalOperationInputConstantCount = 0; // How many of this operation's inputs are model inputs? unsigned normalOperationInputModelInputCount = 0; // We begin by deciding what kind of input each (normal) operation will be; we don't // actually pick input operand indexes at this time, because we might override this // decision later. 
enum InputKind { IK_SUBGRAPH_INPUT, IK_OPERATION_OUTPUT, IK_VALUE }; std::vector normalOperationInputKinds(normalOperationInputCount); std::generate( normalOperationInputKinds.begin(), normalOperationInputKinds.end(), [this, &model, numOperations, normalOperationInputCount, &normalOperationInputConstantCount, &normalOperationInputModelInputCount]() -> InputKind { // Constant? Becomes less likely the more // constants we already have as inputs to // this operation. if (randFrac() < 0.3 * (1 - double(normalOperationInputConstantCount) / normalOperationInputCount)) { normalOperationInputConstantCount++; return IK_VALUE; } // Model input? Becomes less likely the // more model inputs we already have as // inputs to this operation, and the further // along we are in generating this model // (i.e., the more operations we have // generated). if ((model.operationCount() == 0) || (randFrac() < 0.5 * (1 - double(normalOperationInputModelInputCount) / normalOperationInputCount) * std::min(0.3, (1 - double(model.operationCount()) / numOperations)))) { normalOperationInputModelInputCount++; return IK_SUBGRAPH_INPUT; } // Else output of an existing operation. return IK_OPERATION_OUTPUT; }); // Now force common root or model input, if necessary. (A // model must have at least one input.) auto force = [this, &normalOperationInputKinds, normalOperationInputCount](InputKind forceKind) { if (std::none_of(normalOperationInputKinds.begin(), normalOperationInputKinds.end(), [forceKind](InputKind kind) { return kind == forceKind; })) { normalOperationInputKinds[randUInt(normalOperationInputCount)] = forceKind; } }; if (forceCommonRoot && (model.operationCount() != 0)) { force(IK_OPERATION_OUTPUT); } if (modelInputs.empty()) { CHECK(model.operationCount() == 0); force(IK_SUBGRAPH_INPUT); } // Finally create the normal inputs. 
bool isRootOperation = true; for (unsigned i = 0; i < normalOperationInputCount; i++) { uint32_t operandIndex = ~0U; switch (normalOperationInputKinds[i]) { case IK_SUBGRAPH_INPUT: { if (!modelInputs.empty() && (randFrac() < 0.5)) { operandIndex = modelInputs[randUInt(modelInputs.size())]; } else { operandIndex = model.addOperand(&problemType); modelInputs.push_back(operandIndex); } break; } case IK_OPERATION_OUTPUT: { decltype(deadOperands.begin()) deadOperandI; if (!deadOperands.empty() && (randFrac() < 0.5)) { deadOperandI = deadOperands.begin(); std::advance(deadOperandI, randUInt(deadOperands.size())); operandIndex = deadOperandI->first; } else { const uint32_t existingOperationIndex = randUInt(model.operationCount()); const auto& existingOperationOutputs = model.getOperationOutputs(existingOperationIndex); operandIndex = existingOperationOutputs[randUInt(existingOperationOutputs.size())]; deadOperandI = deadOperands.find(operandIndex); CHECK(deadOperandI == deadOperands.end() || deadOperandI->second == existingOperationIndex); } if (deadOperandI != deadOperands.end()) { const uint32_t correspondingOperation = deadOperandI->second; deadOperands.erase(deadOperandI); auto deadOperationI = deadOperations.find(correspondingOperation); if (deadOperationI != deadOperations.end()) { deadOperations.erase(deadOperationI); } } isRootOperation = false; break; } case IK_VALUE: { if (!valueOperands.empty() && (randFrac() < 0.25)) { operandIndex = valueOperands[randUInt(valueOperands.size())].first; } else { operandIndex = model.addOperand(&problemType); if (randFrac() < 0.5) { std::vector value(problemSize * problemSize); std::generate(value.begin(), value.end(), [this] { return randFrac(); }); model.setOperandValue(operandIndex, value); valueOperands.push_back(std::make_pair(operandIndex, ~0U)); } else { unsigned memoryIndex = ~0U; if ((weights.memoryCount() != 0) && (kAllWeightsInOnePool || (randFrac() < 0.5))) { memoryIndex = randUInt(weights.memoryCount()); } else { 
memoryIndex = weights.addMemory(); } const size_t length = problemSize * problemSize * sizeof(float); const unsigned regionIndex = weights.addRegion(memoryIndex, length); valueOperands.push_back(std::make_pair(operandIndex, regionIndex)); } } break; } default: FAIL(); } operationInputs[normalOperationInputIndexes[i]] = operandIndex; } if (isRootOperation) { rootOperationCount++; } // OUTPUTS ///////////////////////////////////////////////////////////////////////////////// std::vector operationOutputs(operationPattern.mNumOutputs); std::generate( operationOutputs.begin(), operationOutputs.end(), [&operandsWithUnknownDimensions, &model, &problemType, &unknownDimensionsTypes, &hasUnknownDimensions, allowUnknownDimensions, this] { // Before the fix for http://b/132458982, 3% unknowns causes // ~35% of partitionings to fail. After the fix, 3% // unknowns causes ~3% of partitionings to fail. (This is // determined by removing the fallback code and noting the // number of failures.) if (allowUnknownDimensions && randFrac() < 0.03) { hasUnknownDimensions = true; uint32_t opndIdx = model.addOperand( &unknownDimensionsTypes[randUInt(kUnknownDimensionsTypesCount)]); operandsWithUnknownDimensions.insert(opndIdx); return opndIdx; } else { return model.addOperand(&problemType); } }); // OPERATION /////////////////////////////////////////////////////////////////////////////// const uint32_t operationIndex = model.addOperation(operationPattern.mOperationType, operationInputs, operationOutputs); deadOperations.insert(operationIndex); std::for_each(operationOutputs.begin(), operationOutputs.end(), [&deadOperands, operationIndex](uint32_t operandIndex) { deadOperands.insert(std::make_pair(operandIndex, operationIndex)); }); } // Now finalize the weights. 
weights.layout(); for (const auto& valueOperand : valueOperands) { const uint32_t operandIndex = valueOperand.first; const unsigned regionIndex = valueOperand.second; if (regionIndex == ~0U) { continue; } const WrapperMemory* memory; uint32_t offset, length; float* region = static_cast(weights.getRegion(regionIndex, &memory, &offset, &length)); CHECK(length == problemSize * problemSize * sizeof(float)); std::generate(region, region + problemSize * problemSize, [this] { return randFrac(); }); model.setOperandValueFromMemory(operandIndex, memory, offset, length); } // Now select model outputs. for (uint32_t operationIdx = 0, operationCount = model.operationCount(); operationIdx < operationCount; operationIdx++) { const auto& outputs = model.getOperationOutputs(operationIdx); for (uint32_t outputIdx = 0, outputCount = outputs.size(); outputIdx < outputCount; outputIdx++) { bool modelOutput = false; const uint32_t operandIndex = outputs[outputIdx]; const auto deadOperandI = deadOperands.find(operandIndex); if (deadOperandI != deadOperands.end()) { // This is not consumed within the model, so unless we // make it an output of the model, it's dead. The // further along we are in generating this model // (i.e., the more operations we have generated), the // more likely we are to classify this operation // output as a model output. const double probabilityOfModelOutput = 0.50 * [](double x) { return x * x; }((operationIdx + 1) / operationCount); modelOutput = (randFrac() < probabilityOfModelOutput); } else { // This is consumed within the model, so we'll rarely // make it an output of the model. 
modelOutput = (randFrac() < 0.05); } if (!modelOutput) { continue; } modelOutputs.push_back(operandIndex); if (deadOperandI != deadOperands.end()) { deadOperands.erase(deadOperandI); const auto deadOperationI = deadOperations.find(operationIdx); if (deadOperationI != deadOperations.end()) { deadOperations.erase(deadOperationI); } } } } if (!allowDeadOperations) { // For each dead operation, pick a random output to become a model output. for (uint32_t deadOperationIndex : deadOperations) { const auto& deadOperationOutputs = model.getOperationOutputs(deadOperationIndex); const uint32_t deadOperandIndex = deadOperationOutputs[randUInt(deadOperationOutputs.size())]; modelOutputs.push_back(deadOperandIndex); } } // A model must have at least one output. if (modelOutputs.empty()) { const auto& outputs = model.getOperationOutputs(randUInt(model.operationCount())); modelOutputs.push_back(outputs[randUInt(outputs.size())]); } if (computeMode == WrapperExecution::ComputeMode::FENCED) { if (std::any_of(modelOutputs.begin(), modelOutputs.end(), [&operandsWithUnknownDimensions](uint32_t opndIdx) { return operandsWithUnknownDimensions.count(opndIdx) != 0; })) { // Workaround for http://b/162980246: Fenced execution is documented // as requiring model outputs to have fully specified dimensions, // either from Model or from Execution, but its implementation // requires this to come from Model. This test only guarantees that // they have fully specified dimensions from Execution. So in the // case of a Model where some output does not have fully specified // dimensions, perform asynchronous execution instead. 
computeMode = WrapperExecution::ComputeMode::ASYNC; } } model.identifyInputsAndOutputs(modelInputs, modelOutputs); #ifdef VERBOSE { std::cout << "Original model: " << ModelStats(&model) << std::endl; std::cout << "rootOperationCount = " << rootOperationCount << ", deadOperations = "; if (allowDeadOperations) { std::cout << deadOperations.size(); } else { std::cout << "forbidden (converted " << deadOperations.size() << ")"; } std::cout << std::endl; } #endif ASSERT_EQ(model.finish(), Result::NO_ERROR); // Non-partitioned compilation. TestCompilation c(&model); ASSERT_EQ(c.setPartitioning(DeviceManager::kPartitioningNo), Result::NO_ERROR); ASSERT_EQ(c.finish(), Result::NO_ERROR); // Create some drivers for partitioned compilation. CHECK(!signatures.empty()); std::vector> signaturesForDriver(signatures.size()); // First assign each signature to a random driver (a driver is // just represented as an entry in the signaturesForDriver // vector). for (Signature signature : signatures) { signaturesForDriver[randUInt(signatures.size())].insert(signature); } // Now remove each entry that has no signatures. auto firstExtra = std::remove_if(signaturesForDriver.begin(), signaturesForDriver.end(), [](const std::set& sigSet) { return sigSet.empty(); }); if (firstExtra != signaturesForDriver.end()) { signaturesForDriver.erase(firstExtra, signaturesForDriver.end()); } // Now actually create the drivers. std::vector> devices; for (unsigned i = 0; i < signaturesForDriver.size(); i++) { const auto& signaturesForThisDriver = signaturesForDriver[i]; // Minimum HAL version for this driver is highest minimum HAL version of // any operation supported by this driver. 
const HalVersion minHalVersion = getMinHalVersion( std::max_element(signaturesForThisDriver.begin(), signaturesForThisDriver.end(), [](const Signature& a, const Signature& b) { return getMinHalVersion(a.first) < getMinHalVersion(b.first); }) ->first); const HalVersion actualHalVersion = static_cast(static_cast(minHalVersion) + randUInt(static_cast(HalVersion::LATEST) - static_cast(minHalVersion) + 1)); const std::string name = "TestDriver(" + std::to_string(i) + "){" + to_string(actualHalVersion) + "}"; #ifdef VERBOSE std::cout << "Creating " + name + " for collection of signatures that requires HAL " + to_string(minHalVersion) << std::endl; #endif auto device = DeviceManager::forTest_makeDriverDevice( makeTestDriver(actualHalVersion, name.c_str(), signaturesForThisDriver)); devices.push_back(device); } // CPU fallback device devices.push_back(DeviceManager::getCpuDevice()); // Partitioned compilation. // // If a test case has both (a) unknown intermediate operand sizes and // (b) partitions scheduled on pre-HAL 1.2 (pre-Android Q) devices, fallback // is needed if the non-fallback partitioning fails. // // The issue is that prior to HAL 1.2, an output operand must have a known // size provided either in the Model or in the Request; and in the case of // partitioning, an intermediate operand of the original model that becomes // an output operand of a partition won't have a known size provided in the // Request. // // If a test case has a step model with no inputs or no outputs, fallback is needed. // This is because our HAL specification requires a model to have at least one // input and one output. // // If a fallback is needed, we retry the compilation with a fallback and require // the fallback to succeed. Otherwise, we require the partitioning to succeed // without CPU fallback. 
TestCompilation cNoFallback(&model, devices); TestCompilation cWithFallback(&model, devices); ASSERT_EQ(cNoFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback), Result::NO_ERROR); auto compilationResult = cNoFallback.finish(); const bool fallbackNeededForDynamicTemporaries = compilationResult == Result::OP_FAILED && hasUnknownDimensions && cNoFallback.getExecutionPlan().hasDynamicTemporaries() && std::any_of(devices.begin(), devices.end(), [](const std::shared_ptr& device) { return !isCompliantVersion(nn::kHalVersionV1_2ToApi.canonical, device->getFeatureLevel()); }); const bool fallbackNeededForStepModelWithNoInputsOrNoOutputs = cNoFallback.getExecutionPlan().forTest_hasStepModelWithNoInputsOrNoOutputs(); const bool fallbackNeeded = fallbackNeededForDynamicTemporaries || fallbackNeededForStepModelWithNoInputsOrNoOutputs; if (fallbackNeeded) { ASSERT_EQ(compilationResult, Result::OP_FAILED); ASSERT_EQ(cWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback), Result::NO_ERROR); compilationResult = cWithFallback.finish(); ASSERT_EQ(compilationResult, Result::NO_ERROR); ASSERT_EQ(cWithFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE); ASSERT_EQ(cWithFallback.getExecutionPlan().forTest_simpleGetDevice(), DeviceManager::getCpuDevice()); } else { ASSERT_EQ(compilationResult, Result::NO_ERROR); const ExecutionPlan& plan = cNoFallback.getExecutionPlan(); if (signaturesForDriver.size() == 1) { ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE); ASSERT_TRUE(plan.forTest_simpleGetDevice() != DeviceManager::getCpuDevice()); } else { ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND); auto stepToDeviceId = [](const std::shared_ptr& step) { return step->executionStep()->getDevice(); }; std::set deviceSet; for (const auto& step : plan.forTest_compoundGetSteps()) { deviceSet.insert(stepToDeviceId(step)); } // TODO(b/178517567): Figure out why we sometimes have 1 more // signature than we have 
devices -- this means that we've scheduled // one or more operations onto the CPU fallback device, which is not // something we ever expect to do. ASSERT_TRUE(deviceSet.size() == signaturesForDriver.size() || deviceSet.size() == signaturesForDriver.size() + 1); } } TestCompilation& c2 = (fallbackNeeded ? cWithFallback : cNoFallback); #ifdef TRACE_DYNTEMP { const ExecutionPlan& plan = c2.getExecutionPlan(); const size_t dynamicTemporaryCount = plan.forTest_flatGetDynamicTemporaries().size(); std::cout << "TRACE_DYNTEMP: dynamic temporary count = " << dynamicTemporaryCount << std::endl; if (plan.forTest_getKind() == ExecutionPlan::Kind::COMPOUND) { size_t stepsWithModelOutputsThatAreDownstreamInputs = 0; size_t countOfModelOutputsThatAreDownstreamInputs = 0; for (const auto& step : plan.forTest_compoundGetSteps()) { if (const size_t count = step->executionStep() ->getModelOutputsThatAreDownstreamInputs() .size()) { ++stepsWithModelOutputsThatAreDownstreamInputs; countOfModelOutputsThatAreDownstreamInputs += count; } } if (countOfModelOutputsThatAreDownstreamInputs != 0) { std::cout << "TRACE_DYNTEMP: model outputs that are downstream inputs: " << countOfModelOutputsThatAreDownstreamInputs << " / " << modelOutputs.size() << ", over " << stepsWithModelOutputsThatAreDownstreamInputs << " / " << plan.forTest_compoundGetSteps().size() << " steps" << std::endl; EXPECT_LE(countOfModelOutputsThatAreDownstreamInputs, modelOutputs.size()); } } else { EXPECT_EQ(dynamicTemporaryCount, size_t(0)) << "Only COMPOUND plan should have dynamic temporaries"; } } #endif #ifdef VERBOSE { std::cout << "signatures = " << signatures.size() << ", devices = " << devices.size() << std::endl; // TODO: When dumping steps, include non-ExecutionSteps. 
const ExecutionPlan& plan = c2.getExecutionPlan(); switch (plan.forTest_getKind()) { case ExecutionPlan::Kind::SIMPLE: std::cout << "plan: simple" << std::endl; break; case ExecutionPlan::Kind::COMPOUND: { const auto& steps = plan.forTest_compoundGetSteps(); std::set devicesInPlan; for (const auto& step : steps) { if (const auto* executionStep = step->tryExecutionStep()) { devicesInPlan.insert(executionStep->getDevice().get()); } } std::cout << "plan: compound, " << steps.size() << " steps over " << devicesInPlan.size() << " devices" << std::endl; for (unsigned i = 0; i < steps.size(); i++) { if (const auto executionStep = steps[i]->tryExecutionStep()) { std::cout << "Step " << i << ": " << ModelStats(executionStep->getStepModel()) << ", device = " << executionStep->getDevice()->getName() << std::endl; } } break; } default: std::cout << "Unexpected plan kind: " << static_cast(plan.forTest_getKind()); break; } } #endif // For execution: // - create golden inputs (one long vector) and golden output value // - golden inputs will be copied to actual inputs before each // of the two executions // - golden output will be used to fill actual outputs before each // of the two executions // - create actual inputs and outputs // - first execution (non-partitioned) // - initialize inputs and (to avoid unrelated oddities) outputs // - execute // - copy outputs to a save area (one long vector) // - second execution (partitioned) // - (to avoid unrelated oddities) initialize inputs and outputs // - execute // - compare outputs to save area // If the runtime and drivers are working properly, execution // should not change the inputs. Nonetheless, we reinitialize the // inputs for each execution, so as to avoid unrelated problems // appearing to be problems related to unpartitioned execution // versus partitioned execution. Similarly, execution behavior // should not be dependent on the outputs; but we'll initialize the // outputs anyway. 
std::vector goldenInputs(problemSize * problemSize * model.inputCount()); std::generate(goldenInputs.begin(), goldenInputs.end(), [this] { return randFrac(); }); #ifdef VERBOSE { std::cout << "flat inputs = "; dump(goldenInputs.begin(), goldenInputs.end()); } #endif const float goldenOutput = randFrac(); // Create the memory for the actual inputs and outputs. struct InputOutputDescriptor { enum Kind { INPUT, OUTPUT }; Kind mKind; // The input or output either resides in a local buffer // (mVector, in which case mMemoryRegion is ignored); or in a // shared memory region within a TestMemories instance // (mMemoryRegion, in which case mVector is ignored). enum Location { VECTOR, REGION }; Location getLocation() const { return !mVector.empty() ? VECTOR : REGION; } std::vector mVector; unsigned mMemoryRegion; }; std::vector ioDescriptors(model.inputCount() + model.outputCount()); for (unsigned i = 0; i < ioDescriptors.size(); i++) { ioDescriptors[i].mKind = (i < model.inputCount() ? InputOutputDescriptor::INPUT : InputOutputDescriptor::OUTPUT); } // We randomly interleave inputs and outputs in creation // order, because when we we create memory regions in a // TestMemories instance, the order in which regions are // created within a single Memory is the order they'll be laid // out in that memory; and when we have inputs and outputs // within the same Memory, we want the possibility that // they'll be interleaved. std::shuffle(ioDescriptors.begin(), ioDescriptors.end(), mRandNumEng); TestMemories ioMemories; for (auto& desc : ioDescriptors) { if (randFrac() < 0.5) { desc.mVector.resize(problemSize * problemSize); } else { // TODO: common this with the way we create IK_VALUE inputs? 
unsigned memoryIndex = ~0U; if ((ioMemories.memoryCount() != 0) && (randFrac() < 0.5)) { memoryIndex = randUInt(ioMemories.memoryCount()); } else { memoryIndex = ioMemories.addMemory(); } const size_t length = problemSize * problemSize * sizeof(float); desc.mMemoryRegion = ioMemories.addRegion(memoryIndex, length); } } ioMemories.layout(); // Function to set up actual inputs and outputs (initializing them // and telling the WrapperExecution about them). auto prepareForExecution = [&model, &ioDescriptors, &ioMemories, &goldenInputs, &goldenOutput, problemSize, &problemType](WrapperExecution* e) { uint32_t inputIndex = 0, outputIndex = 0; for (auto& desc : ioDescriptors) { if (desc.getLocation() == InputOutputDescriptor::VECTOR) { if (desc.mKind == InputOutputDescriptor::INPUT) { const size_t inputOffset = inputIndex * problemSize * problemSize; std::copy(goldenInputs.begin() + inputOffset, goldenInputs.begin() + inputOffset + problemSize * problemSize, desc.mVector.begin()); e->setInput(inputIndex++, desc.mVector.data(), desc.mVector.size() * sizeof(float)); } else { std::fill(desc.mVector.begin(), desc.mVector.begin() + problemSize * problemSize, goldenOutput); e->setOutput(outputIndex++, desc.mVector.data(), desc.mVector.size() * sizeof(float), &problemType.operandType); } } else { const WrapperMemory* memory; uint32_t offset, length; float* region = static_cast( ioMemories.getRegion(desc.mMemoryRegion, &memory, &offset, &length)); CHECK(length == problemSize * problemSize * sizeof(float)); if (desc.mKind == InputOutputDescriptor::INPUT) { const size_t inputOffset = inputIndex * problemSize * problemSize; std::copy(goldenInputs.begin() + inputOffset, goldenInputs.begin() + inputOffset + problemSize * problemSize, region); e->setInputFromMemory(inputIndex++, memory, offset, length); } else { std::fill(region, region + problemSize * problemSize, goldenOutput); e->setOutputFromMemory(outputIndex++, memory, offset, length, &problemType.operandType); } } }; 
CHECK(inputIndex == model.inputCount()); CHECK(outputIndex == model.outputCount()); }; // Non-partitioned execution. WrapperExecution e(&c); ASSERT_NO_FATAL_FAILURE(prepareForExecution(&e)); ASSERT_EQ(e.compute(computeMode), Result::NO_ERROR); // Copy the outputs of the non-partitioned execution to a save area. std::vector nonPartitionedOutputs(problemSize * problemSize * model.outputCount()); { uint32_t outputIndex = 0; for (const auto& desc : ioDescriptors) { if (desc.mKind != InputOutputDescriptor::OUTPUT) { continue; } const size_t outputOffset = outputIndex * problemSize * problemSize; if (desc.getLocation() == InputOutputDescriptor::VECTOR) { std::copy(desc.mVector.begin(), desc.mVector.end(), nonPartitionedOutputs.begin() + outputOffset); } else { float* region = static_cast(ioMemories.getRegion(desc.mMemoryRegion)); std::copy(region, region + problemSize * problemSize, nonPartitionedOutputs.begin() + outputOffset); } #ifdef VERBOSE { std::cout << "nonpartitioned output[" << outputIndex << "] = "; dump(nonPartitionedOutputs.begin() + outputOffset, nonPartitionedOutputs.begin() + outputOffset + problemSize * problemSize); } #endif outputIndex++; } } // Partitioned execution. WrapperExecution e2(&c2); ASSERT_NO_FATAL_FAILURE(prepareForExecution(&e2)); ASSERT_EQ(e2.compute(computeMode), Result::NO_ERROR); // Compare the outputs of the partitioned execution to the save // area containing the outpus of the non-partitioned execution. 
{ uint32_t outputIndex = 0; for (const auto& desc : ioDescriptors) { if (desc.mKind != InputOutputDescriptor::OUTPUT) { continue; } SCOPED_TRACE(outputIndex); const size_t outputOffset = outputIndex * problemSize * problemSize; if (desc.getLocation() == InputOutputDescriptor::VECTOR) { #ifdef VERBOSE std::cout << " partitioned output[" << outputIndex << "] = "; dump(desc.mVector.begin(), desc.mVector.end()); #endif ASSERT_TRUE(std::equal(desc.mVector.begin(), desc.mVector.end(), nonPartitionedOutputs.begin() + outputOffset)); } else { float* region = static_cast(ioMemories.getRegion(desc.mMemoryRegion)); #ifdef VERBOSE std::cout << "part output[" << outputIndex << "] = "; dump(region, region + problemSize * problemSize); #endif ASSERT_TRUE(std::equal(region, region + problemSize * problemSize, nonPartitionedOutputs.begin() + outputOffset)); } outputIndex++; } } } } // namespace } // namespace android