/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionBuilder"

#include "ExecutionBuilder.h"

#include <ControlFlow.h>
#include <CpuExecutor.h>
#include <LegacyUtils.h>
#include <Tracing.h>
#include <android-base/logging.h>
#include <nnapi/IBurst.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/Types.h>

#include <algorithm>
#include <limits>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "BurstBuilder.h"
#include "CompilationBuilder.h"
#include "Manager.h"
#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "Telemetry.h"
#include "TypeManager.h"

namespace android {
namespace nn {

// Partial validation of output shapes returned from driver, to ensure they
// conform to a very specific set of rules.
static bool validateOutputShapesFromDriver(ErrorStatus executionStatus, const ModelBuilder* model,
                                           const std::vector<OutputShape>& shapes) {
    // Enforces the following rules (some of which are from b/154054474):
    // - shapes vector is empty except in the case of NONE or OUTPUT_INSUFFICIENT_SIZE.
    //   If the vector is not empty, it must have as many entries as the step model has outputs.
    // - If NONE, then either shapes vector is empty, or every shape is
    //   marked isSufficient and, if a tensor, has known rank.
    // - If OUTPUT_INSUFFICIENT_SIZE, then the vector is not empty. At least one entry
    //   is marked !isSufficient.
    switch (executionStatus) {
        case ErrorStatus::NONE: {
            NN_RET_CHECK(shapes.size() == 0 || shapes.size() == model->outputCount())
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be empty or of length " << model->outputCount()
                    << " but has length " << shapes.size();
            NN_RET_CHECK(std::all_of(shapes.begin(), shapes.end(),
                                     [](const OutputShape& shape) { return shape.isSufficient; }))
                    << "With execution ErrorStatus " << executionStatus
                    << " at least one output shape is unexpectedly marked !isSufficient";

            const TypeManager* tm = TypeManager::get();
            for (uint32_t outputIndex = 0, outputCount = shapes.size(); outputIndex < outputCount;
                 ++outputIndex) {
                const Operand& outputOperand = model->getOutputOperand(outputIndex);
                NN_RET_CHECK(!tm->isTensorType(outputOperand.type) ||
                             (shapes[outputIndex].dimensions.size() != 0))
                        << "With execution ErrorStatus " << executionStatus << " output#"
                        << outputIndex << " shape unexpectedly has zero rank";
            }

            break;
        }
        case ErrorStatus::OUTPUT_INSUFFICIENT_SIZE: {
            NN_RET_CHECK(shapes.size() == model->outputCount())
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be of length " << model->outputCount()
                    << " but has length " << shapes.size();
            NN_RET_CHECK(std::any_of(shapes.begin(), shapes.end(),
                                     [](const OutputShape& shape) { return !shape.isSufficient; }))
                    << "With execution ErrorStatus " << executionStatus
                    << " at least one output shape must have been marked !isSufficient";
            break;
        }
        default: {
            NN_RET_CHECK(shapes.size() == 0)
                    << "With execution ErrorStatus " << executionStatus
                    << " output shapes vector must be empty but has length " << shapes.size();
            break;
        }
    }

    return true;
}

static bool validateOutputShapesFromDriver(int executionResultCode, const ModelBuilder* model,
                                           const std::vector<OutputShape>& shapes) {
    return validateOutputShapesFromDriver(convertResultCodeToErrorStatus(executionResultCode),
                                          model, shapes);
}

static MeasureTiming measureTiming(const ExecutionBuilder* execution) {
    return execution->measureTiming() ? MeasureTiming::YES : MeasureTiming::NO;
}

static bool checkDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType,
                               const char* tag, bool allowUnspecified) {
    if (newType != nullptr) {
        const Extension::OperandTypeInformation* info = nullptr;
        if (isExtension(operand.type)) {
            NN_RET_CHECK(TypeManager::get()->getExtensionOperandTypeInfo(operand.type, &info));
        }
        if (validateOperandType(*newType, info, tag, allowUnspecified) !=
            ANEURALNETWORKS_NO_ERROR) {
            LOG(ERROR) << tag << ": Invalid newType";
            return false;
        }
        if (operand.dimensions.size() == 0) {
            return true;
        }
        if (operand.dimensions.size() != newType->dimensionCount) {
            LOG(ERROR) << tag << ": Setting with incompatible dimension count (existing = "
                       << operand.dimensions.size() << ", new = " << newType->dimensionCount
                       << ")";
            return false;
        }
        for (uint32_t i = 0; i < newType->dimensionCount; i++) {
            if (operand.dimensions[i] != newType->dimensions[i] && operand.dimensions[i] != 0) {
                LOG(ERROR) << tag << ": Overriding a fully specified dimension is disallowed";
                return false;
            }
        }
    } else {
        if (!allowUnspecified && TypeManager::get()->isTensorType(operand.type) &&
            tensorHasUnspecifiedDimensions(operand)) {
            LOG(ERROR) << tag << ": Setting with operand type that is not fully specified";
            return false;
        }
    }
    return true;
}

ExecutionBuilder::ExecutionBuilder(const CompilationBuilder* compilation)
    : mCompilation(compilation),
      mModel(compilation->mModel),
      mPlan(&compilation->mPlan),
      mAllowCpuFallback(DeviceManager::partitioningAllowsFallback(compilation->mPartitioning)),
      mInputs(mModel->inputCount()),
      mOutputs(mModel->outputCount()) {
    VLOG(EXECUTION) << "ExecutionBuilder::ExecutionBuilder with " << mInputs.size()
                    << " inputs and " << mOutputs.size() << " outputs";
}

SimpleExecutionBuilder::SimpleExecutionBuilder(const CompilationBuilder* compilation)
    : ExecutionBuilder(compilation) {
    CHECK(mPlan->isSimple());
}

CompoundExecutionBuilder::CompoundExecutionBuilder(const CompilationBuilder* compilation)
    : ExecutionBuilder(compilation) {
    CHECK(mPlan->isCompound());
}

const ModelBuilder* ExecutionBuilder::getSourceModel(uint32_t index) const {
    return mPlan->getSourceModels().getModel(index);
}

int ExecutionBuilder::setInput(uint32_t index, const ANeuralNetworksOperandType* type,
                               const void* buffer, size_t length) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getInputOperand(index), type,
                            "ANeuralNetworksExecution_setInput", buffer == nullptr)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput input exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    if (!mInputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInput called when an input has already been "
                      "provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromPointer(
            mModel->getInputOperand(index), type, const_cast<void*>(buffer), l,
            mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                         const RuntimeMemory* memory, size_t offset,
                                         size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromMemory()
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mInputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getInputOperand(index), type,
                            "ANeuralNetworksExecution_setInputFromMemory", false)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->getValidator().validate(mCompilation, IOType::INPUT, index, type, offset,
                                         length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we
    // allow the client to specify offset == 0 && length == 0 indicating that the entire memory
    // region is used. We update the length here because the drivers are still expecting a real
    // length. For other memories that do not allow this semantic, it is checked in
    // MemoryValidatorBase::validate before reaching here.
    if (validate(memory->getMemory()).ok() && offset == 0 && length == 0) {
        length = memory->getSize();
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    if (!mInputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory called when an input has "
                      "already been provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mInputs[index]) = ModelArgumentInfo::createFromMemory(
            mModel->getInputOperand(index), type, poolIndex, offset, length,
            mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setOutput(uint32_t index, const ANeuralNetworksOperandType* type,
                                void* buffer, size_t length) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput bad index " << index << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
                            "ANeuralNetworksExecution_setOutput", true)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (length > 0xFFFFFFFF) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput input exceeds max length " << length;
        return ANEURALNETWORKS_BAD_DATA;
    }
    uint32_t l = static_cast<uint32_t>(length);
    if (!mOutputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutput called when an output has already been "
                      "provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mOutputs[index]) = ModelArgumentInfo::createFromPointer(
            mModel->getOutputOperand(index), type, buffer, l, mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                                          const RuntimeMemory* memory, size_t offset,
                                          size_t length) {
    // Should be similar to StepExecutor::setInputOrOutputFromMemory()
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!checkDimensionInfo(mModel->getOutputOperand(index), type,
                            "ANeuralNetworksExecution_setOutputFromMemory", true)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (!memory->getValidator().validate(mCompilation, IOType::OUTPUT, index, type, offset,
                                         length)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    // For some types of memory, e.g. MemoryRuntimeAHWB allocated from ANNMemory_createFromDesc, we
    // allow the client to specify offset == 0 && length == 0 indicating that the entire memory
    // region is used. We update the length here because the drivers are still expecting a real
    // length. For other memories that do not allow this semantic, it is checked in
    // MemoryValidatorBase::validate before reaching here.
    if (validate(memory->getMemory()).ok() && offset == 0 && length == 0) {
        length = memory->getSize();
    }
    // TODO validate the rest
    uint32_t poolIndex = mMemories.add(memory);
    if (!mOutputs[index].unspecified()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called when an output has "
                      "already been provided";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int n;
    std::tie(n, mOutputs[index]) = ModelArgumentInfo::createFromMemory(
            mModel->getOutputOperand(index), type, poolIndex, offset, length,
            mInputAndOutputPaddingEnabled);
    mHasCalledSetInputOutput = true;
    return n;
}

int ExecutionBuilder::setMeasureTiming(bool measure) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called on "
                   << "an ANeuralNetworksExecution created from an ANeuralNetworksCompilation "
                   << "that was not created by ANeuralNetworksCompilation_createForDevices "
                   << "with numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mMeasureTiming = measure;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getDuration(int32_t durationCode, uint64_t* duration) const {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getDuration called before the "
                      "execution has finished.";
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() != Completion::NO_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getDuration called on an execution "
                      "that has encountered an error.";
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (!mMeasureTiming) {
        *duration = UINT64_MAX;
        return ANEURALNETWORKS_BAD_STATE;
    }

    Timing timingLaunched = mTimingWithoutFencedExecutionCallback;
    Timing timingFenced = timingLaunched;
    if (mFencedExecutionCallback != nullptr) {
        auto result = mFencedExecutionCallback();
        if (!result.has_value()) {
            LOG(ERROR) << "Fenced execution callback failed: " << result.error().message;
            *duration = UINT64_MAX;
            return ANEURALNETWORKS_BAD_STATE;
        }
        std::tie(timingLaunched, timingFenced) = std::move(result).value();
    }
    const OptionalDuration selectedDuration = [durationCode, &timingLaunched,
                                               &timingFenced]() -> OptionalDuration {
        switch (durationCode) {
            case ANEURALNETWORKS_DURATION_ON_HARDWARE:
                return timingLaunched.timeOnDevice;
            case ANEURALNETWORKS_DURATION_IN_DRIVER:
                return timingLaunched.timeInDriver;
            case ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE:
                return timingFenced.timeOnDevice;
            case ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER:
                return timingFenced.timeInDriver;
            default:
                LOG(FATAL) << "unexpected";
                return std::nullopt;
        }
    }();
    if (selectedDuration.has_value()) {
        constexpr uint64_t kMaxTiming = std::numeric_limits<uint64_t>::max() - 1;
        using CommonType =
                std::common_type_t<decltype(selectedDuration.value().count()), uint64_t>;
        const auto count = std::min<CommonType>(selectedDuration.value().count(), kMaxTiming);
        *duration = static_cast<uint64_t>(count);
    } else {
        constexpr uint64_t kNoTiming = std::numeric_limits<uint64_t>::max();
        *duration = kNoTiming;
    }

    VLOG(EXECUTION) << "getDuration(" << durationCode << "): " << *duration;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::setTimeoutDuration(uint64_t duration) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called on an ANeuralNetworksExecution "
                      "created from an ANeuralNetworksCompilation that was not created by "
                      "ANeuralNetworksCompilation_createForDevices with numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setTimeout called after the execution has "
                      "started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (duration > 0) {
        mTimeoutDuration = duration;
    } else {
        mTimeoutDuration.reset();
    }
    return ANEURALNETWORKS_NO_ERROR;
}

std::optional<uint64_t> ExecutionBuilder::getTimeoutDuration() const {
    return mTimeoutDuration;
}

TimePoint ExecutionBuilder::getComputeStartTimePoint() const {
    CHECK(computationStarted()) << "getComputeStartTimePoint called before "
                                << "execution has started.";
    return mComputeStartTimePoint;
}

int ExecutionBuilder::setLoopTimeout(uint64_t duration) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setLoopTimeout called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (duration > operation_while::kTimeoutNsMaximum) {
        LOG(WARNING) << "ANeuralNetworksExecution_setLoopTimeout input exceeds the maximum allowed "
                     << "duration: " << duration << " > " << operation_while::kTimeoutNsMaximum;
        duration = operation_while::kTimeoutNsMaximum;
    }
    mLoopTimeoutDuration = duration;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::enableInputAndOutputPadding(bool enable) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_enableInputAndOutputPadding called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (mHasCalledSetInputOutput) {
        LOG(ERROR) << "ANeuralNetworksExecution_enableInputAndOutputPadding called after an input "
                      "or output is set.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mInputAndOutputPaddingEnabled = enable;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::setReusable(bool reusable) {
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_setReusable called after the "
                      "execution has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    mReusable = reusable;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::addExtensionAttribute(const char* extensionName,
                                            uint16_t attributeCodeWithinExtension,
                                            const void* data, size_t length) {
    if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) {
        LOG(ERROR) << "ANeuralNetworksExecution_addExtensionAttribute called on an "
                      "ANeuralNetworksExecution created from an ANeuralNetworksCompilation that "
                      "was not created by ANeuralNetworksCompilation_createForDevices with "
                      "numDevices = 1";
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (computationStarted()) {
        LOG(ERROR) << "ANeuralNetworksExecution_addExtensionAttribute called after the execution "
                      "has started.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    int32_t attributeToken = 0;
    if (!TypeManager::get()->getExtensionType(extensionName, attributeCodeWithinExtension,
                                              &attributeToken)) {
        return ANEURALNETWORKS_BAD_DATA;
    }
    if (std::find_if(mMetadata.begin(), mMetadata.end(), [attributeToken](const auto& entry) {
            return attributeToken == entry.token;
        }) != mMetadata.end()) {
        LOG(ERROR) << "ANeuralNetworksCompilation_addExtensionAttribute called more than once for "
                      "the same attribute";
        return ANEURALNETWORKS_BAD_DATA;
    }
    const uint8_t* dataPtr = reinterpret_cast<const uint8_t*>(data);
    mMetadata.push_back({attributeToken, std::vector<uint8_t>(dataPtr, dataPtr + length)});
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getOutputOperandDimensions(uint32_t index, uint32_t* dimensions) {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called before the "
                      "execution has finished.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() == Completion::OTHER_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called on an execution "
                      "that has encountered an error.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions bad index " << index
                   << " " << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    const auto& dims = mOutputs[index].dimensions();
    if (dims.empty()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions can not query "
                      "dimensions of a scalar";
        return ANEURALNETWORKS_BAD_DATA;
    }
    std::copy(dims.begin(), dims.end(), dimensions);
    return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
                                          : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
}

int ExecutionBuilder::getOutputOperandRank(uint32_t index, uint32_t* rank) {
    if (!completed()) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called before the "
                      "execution has finished.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (completedWith() == Completion::OTHER_ERROR) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called on an execution "
                      "that has encountered an error.";
        return ANEURALNETWORKS_BAD_STATE;
    }
    uint32_t count = static_cast<uint32_t>(mOutputs.size());
    if (index >= count) {
        LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank bad index " << index << " "
                   << count;
        return ANEURALNETWORKS_BAD_DATA;
    }
    *rank = static_cast<uint32_t>(mOutputs[index].dimensions().size());
    return mOutputs[index].isSufficient() ? ANEURALNETWORKS_NO_ERROR
                                          : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
}

bool ExecutionBuilder::checkAndSetComputationState(const char* name) {
    std::lock_guard<std::mutex> lock(mStateMutex);
    if (!mReusable && mState == State::COMPLETED) {
        LOG(ERROR) << "ANeuralNetworksExecution_" << name
                   << " called on a non-reusable execution that has already completed";
        return false;
    }
    if (mState == State::COMPUTATION) {
        LOG(ERROR) << "ANeuralNetworksExecution_" << name
                   << " called on an execution that has already started";
        return false;
    }
    mState = State::COMPUTATION;
    return true;
}

// TODO(b/132321855): validate that we have full types for all inputs and outputs,
// that the graph is not cyclic,
static int validateRequest(const std::vector<ModelArgumentInfo>& inputs,
                           const std::vector<ModelArgumentInfo>& outputs) {
    for (auto& p : inputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution starts compute when not all inputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    for (auto& p : outputs) {
        if (p.state() == ModelArgumentInfo::UNSPECIFIED) {
            LOG(ERROR) << "ANeuralNetworksExecution starts compute when not all outputs specified";
            return ANEURALNETWORKS_BAD_DATA;
        }
    }
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionBuilder::getValidationResultCode() {
    if (!mValidationResultCode.has_value()) {
        mValidationResultCode = validateRequest(mInputs, mOutputs);
    }
    return mValidationResultCode.value();
}

bool ExecutionBuilder::areOutputsFullySpecified() {
    if (!mOutputsFullySpecified.has_value()) {
        mOutputsFullySpecified = true;
        for (uint32_t i = 0; i < mOutputs.size(); i++) {
            if (mOutputs[i].state() != ModelArgumentInfo::HAS_NO_VALUE &&
                TypeManager::get()->isTensorType(mModel->getOutputOperand(i).type) &&
                tensorHasUnspecifiedDimensions(mModel->getOutputOperand(i).type,
                                               mOutputs[i].initialDimensions())) {
                mOutputsFullySpecified = false;
                break;
            }
        }
    }
    return mOutputsFullySpecified.value();
}

int ExecutionBuilder::prepareForCompute(const char* name, ExecutionMode mode) {
    if (!checkAndSetComputationState(name)) {
        return ANEURALNETWORKS_BAD_STATE;
    }
    if (int n = getValidationResultCode(); n != ANEURALNETWORKS_NO_ERROR) {
        return finishComputation(n, {}, mode);
    }
    return ANEURALNETWORKS_NO_ERROR;
}

// Attempt synchronous execution of full model on CPU.
// TODO: How should we handle timing in this case?
// For Q this is irrelevant: We only support timing in conjunction
// with an explicit device list; and we do not support CPU fallback
// with an explicit device list. See CompilationBuilder::mExplicitDeviceList.
static std::tuple<int, std::vector<OutputShape>, Timing> cpuFallbackFull(
        ExecutionBuilder* executionBuilder) {
    CHECK(executionBuilder != nullptr);
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull");
    VLOG(EXECUTION) << "cpuFallbackFull";

    // Get fallback executor.
    StepExecutor executor(executionBuilder, executionBuilder->getModel(),
                          DeviceManager::getCpuDevice(), /*preparedModel=*/nullptr,
                          /*reusable=*/false);
    executor.mapInputsAndOutputsTrivially();

    // Attempt fallback execution.
    return executor.computeOnCpuFallback();
}

// Attempt synchronous execution on CPU.
// TODO: How should we handle timing in this case?
// For Q this is irrelevant: We only support timing in conjunction
// with an explicit device list; and we do not support CPU fallback
// with an explicit device list. See CompilationBuilder::mExplicitDeviceList.
static std::tuple, Timing, std::shared_ptr> cpuFallbackPartial(const ExecutionPlan& plan, std::shared_ptr controller) { NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial"); VLOG(EXECUTION) << "cpuFallbackPartial"; // Get fallback executor. std::shared_ptr executor; int n1 = plan.fallback(controller, &executor, nullptr, nullptr); if (n1 != ANEURALNETWORKS_NO_ERROR) { return {n1, {}, {}, nullptr}; } CHECK(executor != nullptr); // Attempt fallback execution. auto [n2, outputShapes, timing] = executor->computeOnCpuFallback(); return {n2, std::move(outputShapes), timing, executor}; } std::tuple, Timing> SimpleExecutionBuilder::computeInternal( const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) { NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeInternal"); VLOG(EXECUTION) << "SimpleExecutionBuilder::computeInternal"; if (mExecutor == nullptr) { mExecutor = mPlan->makeStepExecutor(mReusable, this); } auto burstController = burstBuilder ? burstBuilder->getControllerAt(0) : nullptr; auto [n, outputShapes, timing] = mExecutor->compute(deadline, burstController); if (n == ANEURALNETWORKS_NO_ERROR) { return {n, std::move(outputShapes), timing}; } // ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE is not recoverable. if (n == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) { return {n, std::move(outputShapes), {}}; } // If CPU fallback is not allowed and there was an error, end execution. if (!mAllowCpuFallback) { return {n, {}, {}}; } // If CPU execution was already attempted, do not perform CPU fallback. if (mExecutor->isCpu()) { return {n, {}, {}}; } // If the code has reached this point, a potentially recoverable error // occurred during the execution. Do an execution fallback on the CPU. return cpuFallbackFull(this); } std::tuple, Timing> CompoundExecutionBuilder::computeInternal( const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) { NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeInternal"); VLOG(EXECUTION) << "CompoundExecutionBuilder::computeInternal (from plan, iteratively)"; auto controller = mPlan->makeController(this, burstBuilder); std::vector outputShapes = getInitialOutputShapes(); // On this iteration, do I need to repeat the previous step because it // reported insufficient size? bool doInsufficientSizeFallback = false; while (true) { VLOG(EXECUTION) << "looking for next StepExecutor"; // Get the current step of the execution. std::shared_ptr executor; SharedBurst burstController; int n = doInsufficientSizeFallback ? mPlan->fallback(controller, &executor, &burstController, &outputShapes) : mPlan->next(controller, &executor, &burstController, &outputShapes); doInsufficientSizeFallback = false; if (n != ANEURALNETWORKS_NO_ERROR) { // During the interpreted execution of control flow, a loop timeout // might occur in ExecutionPlan::next(). bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT || n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT; if (mAllowCpuFallback && !missedDeadline) break; return {n, {}, {}}; } // If the code reached the end of the plan without error, then return // with no error. if (executor == nullptr) { return {ANEURALNETWORKS_NO_ERROR, outputShapes, {}}; } const bool executorIsCpu = executor->isCpu(); // Attempt to execute a single step of the execution. auto [stepN, stepOutputShapes, _] = executor->compute(deadline, burstController); // Update global outputs and dynamic temporaries. 
StepExecutor::UpdateOutputShapes updateOutputShapes = {}; if (!executor->updateOutputShapes(stepN, stepOutputShapes, &outputShapes, &updateOutputShapes)) { stepN = ANEURALNETWORKS_OP_FAILED; } // If execution was successful, continue to next step. if (stepN == ANEURALNETWORKS_NO_ERROR) { if (updateOutputShapes.zeroSizedInput) { // We'll need to do full model CPU fallback VLOG(EXECUTION) << "updateOutputShapes.zeroSizedInput"; stepN = ANEURALNETWORKS_OP_FAILED; } else { CHECK(executor->areDynamicTemporariesAllocated()); continue; } } if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) { VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: " << toString(updateOutputShapes); if (updateOutputShapes.mainOutputInsufficient || !updateOutputShapes.updatedDynamicTemporary) { // Either: // - At least one main model output is not of sufficient size; or // - we didn't learn anything new about dynamic temporaries. // Neither of these is recoverable, so end execution. return {stepN, outputShapes, {}}; } // Every main model output is of sufficient size. This implies that // at least one dynamic temporary is not of sufficient size. This // is recoverable. doInsufficientSizeFallback = true; continue; } // If CPU fallback is not allowed and there was an error, end execution. if (!mAllowCpuFallback) { return {stepN, {}, {}}; } // If CPU execution was already attempted, perform a full CPU fallback. if (executorIsCpu) { break; } // If the code reaches this point, attempt a partial fallback to CPU. CHECK(mAllowCpuFallback); if (updateOutputShapes.zeroSizedInput) { // Do not attempt a partial fallback. break; } while (true) { auto [fallbackN, fallbackOutputShapes, _, fallbackExecutor] = cpuFallbackPartial(*mPlan, controller); // Update global outputs and dynamic temporaries. StepExecutor::UpdateOutputShapes fallbackUpdateOutputShapes = {}; if (fallbackExecutor != nullptr && !fallbackExecutor->updateOutputShapes(fallbackN, fallbackOutputShapes, &outputShapes, &fallbackUpdateOutputShapes)) { fallbackN = ANEURALNETWORKS_OP_FAILED; } // If execution was successful, continue to next step. if (fallbackN == ANEURALNETWORKS_NO_ERROR) { if (fallbackUpdateOutputShapes.zeroSizedInput) { // We'll need to do full model CPU fallback VLOG(EXECUTION) << "fallbackUpdateOutputShapes.zeroSizedInput"; fallbackN = ANEURALNETWORKS_OP_FAILED; break; } CHECK(fallbackExecutor->areDynamicTemporariesAllocated()); goto nextStep; } if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) { VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: " << toString(fallbackUpdateOutputShapes); if (fallbackUpdateOutputShapes.mainOutputInsufficient || !fallbackUpdateOutputShapes.updatedDynamicTemporary) { // Either: // - At least one main model output is not of sufficient size; or // - we didn't learn anything new about dynamic temporaries. // Neither of these is recoverable, so end execution. return {fallbackN, outputShapes, {}}; } // Every main model output is of sufficient size. This implies // that at least one dynamic temporary is not of sufficient // size. This is recoverable. continue; } // If the code reaches this point, then there was an error with the // fallback. In this case, attempt full fallback. break; } // If the code reaches this point, then there was an error with the // fallback. In this case, attempt full fallback. break; nextStep: // Bottom of the outer loop continue; } // If the code has reached this point, a potentially recoverable error // occurred during the step executions. Instead, do a full execution // fallback on the CPU. 
return cpuFallbackFull(this); } static bool waitForSyncFences(const std::vector& waitFor) { for (int syncFd : waitFor) { if (syncFd > 0) { auto r = syncWait(syncFd, -1); if (r != FenceState::SIGNALED) { VLOG(EXECUTION) << "syncWait failed, fd: " << syncFd; return false; } } } return true; } std::tuple SimpleExecutionBuilder::computeFencedInternal( const std::vector& waitFor, uint64_t timeoutDurationAfterFence, const OptionalTimePoint& deadline) { NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "SimpleExecutionBuilder::computeFencedInternal"); VLOG(EXECUTION) << "SimpleExecutionBuilder::computeFencedInternal"; if (mExecutor == nullptr) { mExecutor = mPlan->makeStepExecutor(mReusable, this); } auto [n, syncFd, callback] = mExecutor->computeFenced(waitFor, timeoutDurationAfterFence, deadline); if (n == ANEURALNETWORKS_NO_ERROR) { return {ANEURALNETWORKS_NO_ERROR, syncFd, callback}; } // If CPU fallback is not allowed and there was an error, end execution. if (!mAllowCpuFallback) { return {n, -1, nullptr}; } // If CPU execution was already attempted, return from the function with an error. if (mExecutor->isCpu()) { return {n, -1, nullptr}; } // If the code has reached this point, a potentially recoverable error // occurred during the step executions. Instead, do a full execution // fallback on the CPU. VLOG(EXECUTION) << "Performing full fallback on the CPU."; if (!waitForSyncFences(waitFor)) { return {ANEURALNETWORKS_OP_FAILED, -1, nullptr}; } auto [fallbackN, fallbackOutputShapes, fallbackTiming] = cpuFallbackFull(this); reportTimingWithoutFencedExecutionCallback(fallbackTiming); return {fallbackN, -1, nullptr}; } // In case of partitioned execution, computeFencedInternal call will return the sync // fence and the fenced compute callback returned from the last partition. // Any failed partition will result in whole execution fallback to CPU if // mAllowCpuFallback is set to true. std::tuple CompoundExecutionBuilder::computeFencedInternal( const std::vector& waitFor, uint64_t timeoutDurationAfterFence, const OptionalTimePoint& deadline) { NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "CompoundExecutionBuilder::computeFencedInternal"); VLOG(EXECUTION) << "CompoundExecutionBuilder::computeFencedInternal (from plan, iteratively)"; // We should have detected this earlier in the call chain and fallen back to // non-fenced execution. This is an implementation limitation: In order to // support dynamic temporarires in this code, we'd need to implement // something like the following: // - If a partition has outputs of unknown size, compute that partition in a // non fenced fashion, just as if it were scheduled on a driver that does // not support fenced execution. // - Implement something similar to the code in CompoundExecutionBuilder::computeInternal() // that handles a step execution that fails with // ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE. CHECK(!mCompilation->hasDynamicTemporaries()); // Initiate waitForFds, syncFence for the first step. std::vector waitForFds = waitFor; base::unique_fd syncFence; ExecuteFencedInfoCallback executeFencedInfoCallback; std::shared_ptr controller = mPlan->makeController(this, nullptr); while (true) { VLOG(EXECUTION) << "looking for next StepExecutor"; // Get the current step of the execution. std::shared_ptr executor; int n = mPlan->next(controller, &executor, nullptr, nullptr, syncFence.get()); if (n != ANEURALNETWORKS_NO_ERROR) { // During the interpreted execution of control flow, a loop timeout // might occur in ExecutionPlan::next(). 
bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT || n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT; if (mAllowCpuFallback && !missedDeadline) break; // Return -1 for the sync fence fd, and nullptr for the callback. return {n, -1, nullptr}; } // If the code reached the end of the plan without error, then return // with no error. if (executor == nullptr) { return {ANEURALNETWORKS_NO_ERROR, syncFence.release(), executeFencedInfoCallback}; } // Attempt to compute a single step of the execution. auto [stepN, syncFd, callback] = executor->computeFenced(waitForFds, timeoutDurationAfterFence, deadline); // Update waitForFds, syncFence for the next step. syncFence.reset(syncFd); executeFencedInfoCallback = callback; waitForFds.clear(); if (syncFd >= 0) { waitForFds = {syncFd}; } // If execution was successful, continue to next step. if (stepN == ANEURALNETWORKS_NO_ERROR) { continue; } // If CPU fallback is not allowed and there was an error, end execution. if (!mAllowCpuFallback) { return {stepN, -1, nullptr}; } // If the code reaches this point, then there was an error with the // fallback. In this case, attempt full fallback. break; } // If the code has reached this point, a potentially recoverable error // occurred during the step executions. Instead, do a full execution // fallback on the CPU. VLOG(EXECUTION) << "Performing full fallback on the CPU."; if (!waitForSyncFences(waitFor)) { return {ANEURALNETWORKS_OP_FAILED, -1, nullptr}; } auto [fullN, fullOutputShapes, _] = cpuFallbackFull(this); return {fullN, -1, nullptr}; } int ExecutionBuilder::computeFenced(const std::vector& waitFor, uint64_t timeoutDurationAfterFence, int* syncFence) { CHECK(syncFence != nullptr); NN_RETURN_IF_ERROR( prepareForCompute("startComputeWithDependencies", ExecutionMode::ASYNC_WITH_DEPS)); if (timeoutDurationAfterFence > 0) { if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) { LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies called with non-zero " "duration on an ANeuralNetworksExecution " "created from an ANeuralNetworksCompilation that was not created by " "ANeuralNetworksCompilation_createForDevices with numDevices = 1"; return finishComputation(ANEURALNETWORKS_BAD_DATA, {}, ExecutionMode::ASYNC_WITH_DEPS); } } if (!areOutputsFullySpecified()) { LOG(ERROR) << "ANeuralNetworksExecution_startComputeWithDependencies" " not all outputs have fully specified dimensions"; return finishComputation(ANEURALNETWORKS_BAD_DATA, {}, ExecutionMode::ASYNC_WITH_DEPS); } // Unlike ExecutionBuilder::compute, we do not need to reset output dimensions here because // fenced executions do not support dynamic output shape. mComputeStartTimePoint = Clock::now(); VLOG(EXECUTION) << "ExecutionBuilder::computeFenced"; int result; const auto deadline = makeDeadline(mTimeoutDuration); std::tie(result, *syncFence, mFencedExecutionCallback) = computeFencedInternal(waitFor, timeoutDurationAfterFence, deadline); // If there is an error, call finishComputation to mark the computation as completed. // Otherwise, we will call finishComputation in SyncFenceEvent::wait(). if (result != ANEURALNETWORKS_NO_ERROR) { // TODO(miaowang): support dynamic output shape only with memory domain. // For now just return empty output shapes. 
result = finishComputation(result, {}, ExecutionMode::ASYNC_WITH_DEPS); } return result; } int ExecutionBuilder::compute(std::shared_ptr* synchronizationCallback, BurstBuilder* burstBuilder) { CHECK(synchronizationCallback == nullptr || burstBuilder == nullptr) << "synchronizationCallback and burstBuilder cannot simultaneously be used"; const bool synchronous = (synchronizationCallback == nullptr); if (!synchronous) { *synchronizationCallback = nullptr; } const char* name = burstBuilder ? "burstCompute" : synchronous ? "compute" : "startCompute"; const ExecutionMode mode = burstBuilder ? ExecutionMode::BURST : synchronous ? ExecutionMode::SYNC : ExecutionMode::ASYNC; NN_RETURN_IF_ERROR(prepareForCompute(name, mode)); // Validate input memory dimensions. We need to do the validation in every computation because // the memory dimensions may change between computations. for (auto& p : mInputs) { if (p.state() == ModelArgumentInfo::MEMORY) { const RuntimeMemory* memory = mMemories[p.locationAndLength().poolIndex]; if (!memory->getValidator().validateInputDimensions(p.dimensions())) { return finishComputation(ANEURALNETWORKS_OP_FAILED, {}, mode); } } } // Reset output dimensions. if (!areOutputsFullySpecified()) { for (auto& output : mOutputs) { output.reset(); } } const auto deadline = makeDeadline(mTimeoutDuration); mComputeStartTimePoint = Clock::now(); if (synchronous) { if (burstBuilder) { VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API, burst)"; } else { VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)"; } const auto [n, outputShapes, timing] = computeInternal(deadline, burstBuilder); if (mMeasureTiming) { mTimingWithoutFencedExecutionCallback = timing; } return finishComputation(n, outputShapes, mode); } else /* asynchronous */ { // TODO: For asynchronous execution, entire plan-based-path should run in an // asynchronous thread -- take the asynchronous thread logic out of // CpuExecution::compute() and use it to wrap the plan-based-path. // TODO: use a thread pool // TODO(mikie): this could have NNTRACE so we could measure the overhead // of spinning up a new thread. // Prepare the callback for asynchronous execution. // std::shared_ptr object is returned when the // execution has been successfully launched, otherwise a // nullptr is returned. The executionCallback is // abstracted in the NN API as an "event". 
auto executionCallback = std::make_shared(); executionCallback->setOnFinish( [this, mode](ErrorStatus error, const std::vector& outputShapes) { return finishComputation(error, outputShapes, mode); }); const auto asyncStartCompute = [this, deadline, executionCallback] { const auto [n, outputShapes, timing] = computeInternal(deadline, nullptr); const auto status = convertResultCodeToErrorStatus(n); executionCallback->notify(status, outputShapes, timing); }; if (DeviceManager::get()->syncExecRuntime()) { VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)"; asyncStartCompute(); } else { VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)"; std::thread asyncExecution(asyncStartCompute); executionCallback->bindThread(std::move(asyncExecution)); } *synchronizationCallback = executionCallback; return ANEURALNETWORKS_NO_ERROR; } } std::vector ExecutionBuilder::getInitialOutputShapes() const { std::vector outputShapes(mOutputs.size()); std::transform(mOutputs.begin(), mOutputs.end(), outputShapes.begin(), [](const auto& x) -> OutputShape { std::vector dimensions; if (x.state() != ModelArgumentInfo::HAS_NO_VALUE) { dimensions = x.dimensions(); } return {.dimensions = std::move(dimensions), .isSufficient = true}; }); return outputShapes; } // Check if the dimensions "to" is updatable by dimensions "from", where "from" must // have no lower a specification level. static bool isUpdatable(const std::vector& to, const std::vector& from) { if (to.size() == 0) return true; NN_RET_CHECK_EQ(to.size(), from.size()); for (uint32_t i = 0; i < to.size(); i++) { NN_RET_CHECK(to[i] == from[i] || to[i] == 0); } return true; } static bool isZeroSizedTensor(int executionResultCode, const OutputShape& outputShape) { return (executionResultCode == ANEURALNETWORKS_NO_ERROR) && outputShape.isSufficient && outputShape.dimensions.size() && (std::find(outputShape.dimensions.begin(), outputShape.dimensions.end(), uint32_t(0)) != outputShape.dimensions.end()); } bool ExecutionBuilder::updateOutputShapes(ErrorStatus status, const std::vector& outputShapes) { NN_RET_CHECK(validateOutputShapesFromDriver(status, mModel, outputShapes)); if (outputShapes.size() == 0) { return true; } NN_RET_CHECK_EQ(outputShapes.size(), mOutputs.size()); for (uint32_t i = 0; i < outputShapes.size(); i++) { // Check if only unspecified dimensions or rank are overwritten. 
NN_RET_CHECK(isUpdatable(mOutputs[i].dimensions(), outputShapes[i].dimensions)); const OperandType operandType = mModel->getOutputOperand(i).type; NN_RET_CHECK(!TypeManager::get()->sizeOfDataOverflowsUInt32(operandType, outputShapes[i].dimensions)); } for (uint32_t i = 0; i < outputShapes.size(); i++) { mOutputs[i].dimensions() = outputShapes[i].dimensions; mOutputs[i].isSufficient() = outputShapes[i].isSufficient; } return true; } bool ExecutionBuilder::updateMemories() { for (const auto& output : mOutputs) { if (output.state() != ModelArgumentInfo::MEMORY) continue; const RuntimeMemory* memory = mMemories[output.locationAndLength().poolIndex]; NN_RET_CHECK(memory->getValidator().updateMetadata({.dimensions = output.dimensions()})); } return true; } int ExecutionBuilder::finishComputation(int result, const std::vector& outputShapes, ExecutionMode mode) { const auto status = convertResultCodeToErrorStatus(result); if (!updateOutputShapes(status, outputShapes) || !updateMemories()) { result = ANEURALNETWORKS_OP_FAILED; } bool success = result == ANEURALNETWORKS_NO_ERROR; for (const auto& output : mOutputs) { if (output.state() != ModelArgumentInfo::MEMORY) continue; const RuntimeMemory* memory = mMemories[output.locationAndLength().poolIndex]; memory->getValidator().setInitialized(success); } switch (result) { case ANEURALNETWORKS_NO_ERROR: mCompletion = Completion::NO_ERROR; break; case ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE: mCompletion = Completion::OUTPUT_INSUFFICIENT_SIZE; break; default: mCompletion = Completion::OTHER_ERROR; break; } { std::lock_guard lock(mStateMutex); CHECK(mState != State::PREPARATION) << "ExecutionBuilder::finishComputation is called in the preparation state"; CHECK(mState != State::COMPLETED) << "ExecutionBuilder::finishComputation is called twice"; mState = State::COMPLETED; } telemetry::onExecutionFinish(this, mode, result); return result; } std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes) { return "{ .updatedDynamicTemporary = " + std::to_string(updateOutputShapes.updatedDynamicTemporary) + ", .mainOutputInsufficient = " + std::to_string(updateOutputShapes.mainOutputInsufficient) + "}"; } bool StepExecutor::updateOutputShapes(int executionResultCode, const std::vector& from, std::vector* to, UpdateOutputShapes* update) { CHECK(update != nullptr); *update = {.updatedDynamicTemporary = false, .mainOutputInsufficient = false, .zeroSizedInput = false}; NN_RET_CHECK(validateOutputShapesFromDriver(executionResultCode, mModel, from)); if (from.size() == 0) { return true; } if (VLOG_IS_ON(EXECUTION)) { for (const auto& shape : from) { VLOG(EXECUTION) << "updateOutputShapes: " << shape; } } if (mExecutionStep != nullptr) { const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel(); NN_RET_CHECK_LE(indexMapping.size(), from.size()); for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) { const uint32_t toIndex = indexMapping[i]; NN_RET_CHECK_GT(to->size(), toIndex); NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions)); (*to)[toIndex] = from[i]; update->mainOutputInsufficient |= !(*to)[toIndex].isSufficient; if (mExecutionStep->getModelOutputsThatAreDownstreamInputs().count(toIndex) && isZeroSizedTensor(executionResultCode, from[i])) { update->zeroSizedInput = true; } } if (!mDynamicTemporaries->empty()) { // TODO(b/157236079): Instead of computing this here, precompute it in ExecutionStep? 
std::map operandIndexStepModelOutputToSourceModelTemp; for (const auto& entry : mExecutionStep->getTempsAsStepModelOutputs()) { operandIndexStepModelOutputToSourceModelTemp.emplace(entry.second, entry.first); } const uint32_t sourceModelIndex = mExecutionStep->getSourceModelIndex(); for (uint32_t i = 0, e = mModel->outputCount(); i < e; i++) { const uint32_t stepModelOperandIndex = mModel->getOutputOperandIndex(i); const auto it = operandIndexStepModelOutputToSourceModelTemp.find(stepModelOperandIndex); if (it == operandIndexStepModelOutputToSourceModelTemp.end()) { continue; } const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, it->second); VLOG(EXECUTION) << "updateOutputShapes checking to see if output#" << i << " sourceOperandIndex = (" << sourceOperandIndex.first << ", " << sourceOperandIndex.second << ") is a dynamic temporary"; // This is a temporary, but it might not be a dynamic temporary. const auto loc = mDynamicTemporaries->lookup(sourceOperandIndex, false); if (loc == std::nullopt) { continue; } NN_RET_CHECK(isUpdatable(*loc->dimensions, from[i].dimensions)); bool changedShape = false; const uint32_t actualSize = TypeManager::get()->getSizeOfData( mModel->getOperand(stepModelOperandIndex).type, from[i].dimensions); if (actualSize > 0) { changedShape = mDynamicTemporaries->redeclare(sourceOperandIndex, from[i].dimensions, actualSize); } else if (!from[i].isSufficient) { NN_RET_CHECK(loc->paddedLength < UINT32_MAX / 2) << "output#" << i << " paddedLength overflow"; changedShape = mDynamicTemporaries->redeclare( sourceOperandIndex, from[i].dimensions, 2 * loc->paddedLength); } else { // The combination of not-fully-specified dimensions // and isSufficient means that we have no // information about whether the size of the dynamic // temporary is adequate. VLOG(EXECUTION) << "updateOutputShapes skipping redeclaration for output#" << i; if (executionResultCode == ANEURALNETWORKS_NO_ERROR) { NN_RET_CHECK(isZeroSizedTensor(executionResultCode, from[i])); // This is a zero-sized tensor, and by // definition, any dynamic temporary is an input // to an execution step. update->zeroSizedInput = true; } } if (changedShape) { // TODO: find a better place for this comment. // // isUpdatable(a, b) imposes a partial ordering a <= // b. Every fully specified dimensions vector is an // upper bound of that ordering. Therefore, any // change in dimensions moves towards an upper // bound, and hence there are a finite number of // such changes possible. // // actualSize can only be computed from dimensions // that are an upper bound. Therefore, once // actualSize is computed, it will not change. // // If dimensions are not fully specified, and // estimated size changes, it increases. There is // an upper bound on estimated size to avoid // overflow. // // Therefore, if we retry only when dimensions or // size chage, and we stop retrying if we would // otherwise overflow, we should only retry a finite // number of times. 
update->updatedDynamicTemporary = true; } } mDynamicTemporaries->vlogDump("finished updateOutputShapes"); } } else { NN_RET_CHECK_EQ(from.size(), to->size()); for (uint32_t i = 0, e = from.size(); i < e; i++) { NN_RET_CHECK(isUpdatable(to->at(i).dimensions, from[i].dimensions)); (*to)[i] = from[i]; } } return true; } StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model, std::shared_ptr device, std::shared_ptr preparedModel, bool reusable, const ExecutionStep* step, DynamicTemporaries* dynamicTemporaries) : mExecutionBuilder(executionBuilder), mExecutionStep(step), mDynamicTemporaries(dynamicTemporaries), mModel(model), mDevice(device), mPreparedModel(preparedModel), mInputs(model->inputCount()), mOutputs(model->outputCount()), mReusable(reusable) { CHECK(mDevice != nullptr); CHECK_EQ(step == nullptr, dynamicTemporaries == nullptr); CHECK(!(reusable && dynamicTemporaries != nullptr)); VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and " << mOutputs.size() << " outputs"; } bool StepExecutor::areDynamicTemporariesAllocated() const { return !mDynamicTemporaries || mDynamicTemporaries->allocated(mExecutionStep->getIndex()); } void StepExecutor::mapInputsAndOutputsTrivially() { mInputs = mExecutionBuilder->mInputs; mOutputs = mExecutionBuilder->mOutputs; mMemories = mExecutionBuilder->mMemories; } void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput, ModelArgumentInfo* executorInputOrOutput, const Dimensions* builderDimensions) { auto updateDimensions = [executorInputOrOutput, builderDimensions] { if (!builderDimensions) { return; } executorInputOrOutput->dimensions() = *builderDimensions; }; *executorInputOrOutput = builderInputOrOutput; switch (executorInputOrOutput->state()) { default: CHECK(false) << "unexpected ModelArgumentInfo::state"; break; case ModelArgumentInfo::HAS_NO_VALUE: case ModelArgumentInfo::UNSPECIFIED: break; case ModelArgumentInfo::POINTER: updateDimensions(); break; case ModelArgumentInfo::MEMORY: { updateDimensions(); const uint32_t builderPoolIndex = builderInputOrOutput.locationAndLength().poolIndex; const RuntimeMemory* memory = mExecutionBuilder->mMemories[builderPoolIndex]; const uint32_t executorPoolIndex = mMemories.add(memory); executorInputOrOutput->locationAndLength().poolIndex = executorPoolIndex; break; } } } int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand, const RuntimeMemory* memory, uint32_t offset, uint32_t length, const Dimensions& dimensions, ModelArgumentInfo* inputOrOutputInfo) { // Should be similar to // ExecutionBuilder::setInputFromMemory() // ExecutionBuilder::setOutputFromMemory() uint32_t poolIndex = mMemories.add(memory); CHECK(inputOrOutputInfo->unspecified()); int n; std::tie(n, *inputOrOutputInfo) = ModelArgumentInfo::createFromMemory(inputOrOutputOperand, /*type=*/nullptr, poolIndex, offset, length); if (n == ANEURALNETWORKS_NO_ERROR && dimensions.size()) { CHECK(isUpdatable(inputOrOutputInfo->dimensions(), dimensions)); inputOrOutputInfo->dimensions() = dimensions; } return n; } static std::string toString(std::vector dimensions) { std::string ret = "("; bool wroteOne = false; for (uint32_t dimension : dimensions) { if (wroteOne) { ret += ", "; } else { wroteOne = true; } ret += std::to_string(dimension); } ret += ")"; return ret; }; static void logArguments(const char* kind, const std::vector& args) { for (unsigned i = 0; i < args.size(); i++) { const auto& arg = args[i]; std::string prefix = kind + 
std::string("[") + std::to_string(i) + "] = "; switch (arg.state()) { case ModelArgumentInfo::POINTER: VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ") dim" << toString(arg.dimensions()); break; case ModelArgumentInfo::MEMORY: VLOG(EXECUTION) << prefix << "MEMORY(" << "pool=" << arg.locationAndLength().poolIndex << ", " << "off=" << arg.locationAndLength().offset << ") dim" << toString(arg.dimensions()); break; case ModelArgumentInfo::HAS_NO_VALUE: VLOG(EXECUTION) << prefix << "HAS_NO_VALUE"; break; case ModelArgumentInfo::UNSPECIFIED: VLOG(EXECUTION) << prefix << "UNSPECIFIED"; break; default: VLOG(EXECUTION) << prefix << "state(" << arg.state() << ")"; break; } } } bool StepExecutor::isCpu() const { return mDevice == DeviceManager::getCpuDevice(); } std::pair> StepExecutor::getReusableExecution() { CHECK(mReusable); if (mExecution == nullptr) { CHECK(mPreparedModel != nullptr); const MeasureTiming measure = measureTiming(mExecutionBuilder); const OptionalDuration loopTimeoutDuration = makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration()); auto [n, execution] = mPreparedModel->createReusableExecution( mInputs, mOutputs, mMemories.getObjects(), measure, loopTimeoutDuration, mExecutionBuilder->getMetadata()); if (n != ANEURALNETWORKS_NO_ERROR) { return {n, nullptr}; } mExecution = std::move(execution); } return {ANEURALNETWORKS_NO_ERROR, mExecution}; } std::tuple, Timing> StepExecutor::compute( const OptionalTimePoint& deadline, const SharedBurst& burstController) { if (VLOG_IS_ON(EXECUTION)) { logArguments("input", mInputs); logArguments("output", mOutputs); } int n; std::vector outputShapes; Timing timing; if (mReusable) { auto [nCreate, execution] = getReusableExecution(); if (nCreate != ANEURALNETWORKS_NO_ERROR) { return {nCreate, {}, {}}; } std::tie(n, outputShapes, timing) = execution->compute(burstController, deadline); } else { CHECK(mPreparedModel != nullptr); const MeasureTiming measure = measureTiming(mExecutionBuilder); const OptionalDuration loopTimeoutDuration = makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration()); std::tie(n, outputShapes, timing) = mPreparedModel->execute( mInputs, mOutputs, mMemories.getObjects(), burstController, measure, deadline, loopTimeoutDuration, mExecutionBuilder->getMetadata()); } mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing); return {n, std::move(outputShapes), std::move(timing)}; } std::tuple StepExecutor::computeFenced( const std::vector& waitFor, uint64_t timeoutDurationAfterFence, const OptionalTimePoint& deadline) { if (VLOG_IS_ON(EXECUTION)) { logArguments("input", mInputs); logArguments("output", mOutputs); } OptionalDuration optionalTimeoutDurationAfterFence; if (timeoutDurationAfterFence > 0) { optionalTimeoutDurationAfterFence = makeTimeoutDuration(timeoutDurationAfterFence); } int n; int syncFenceFd; ExecuteFencedInfoCallback executeFencedInfoCallback; Timing timing; if (mReusable) { auto [nCreate, execution] = getReusableExecution(); if (nCreate != ANEURALNETWORKS_NO_ERROR) { return {nCreate, -1, nullptr}; } std::tie(n, syncFenceFd, executeFencedInfoCallback, timing) = execution->computeFenced(waitFor, deadline, optionalTimeoutDurationAfterFence); } else { CHECK(mPreparedModel != nullptr); const MeasureTiming measure = measureTiming(mExecutionBuilder); const OptionalDuration loopTimeoutDuration = makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration()); std::tie(n, syncFenceFd, executeFencedInfoCallback, timing) = mPreparedModel->executeFenced( mInputs, 
mOutputs, mMemories.getObjects(), waitFor, measure, deadline, loopTimeoutDuration, optionalTimeoutDurationAfterFence, mExecutionBuilder->getMetadata()); } if (syncFenceFd < 0 && executeFencedInfoCallback == nullptr) { mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing); } return {n, syncFenceFd, executeFencedInfoCallback}; } // For cpuFallback{Partial,Full}, recompile the model on CPU and then start compute. std::tuple, Timing> StepExecutor::computeOnCpuFallback() { NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "StepExecutor::computeOnCpuFallback"); VLOG(EXECUTION) << "Re-compile the model on CPU"; const ModelFactory makeModel = [this] { return mModel->makeModel(); }; // TODO: Propagate user preference and compilation priority to this point instead of using // default values of ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER and // ANEURALNETWORKS_PRIORITY_MEDIUM const ExecutionPreference preference = static_cast(ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER); const Priority priority = convertToCanonicalPriority(ANEURALNETWORKS_PRIORITY_DEFAULT); auto [n, preparedModel] = DeviceManager::getCpuDevice()->prepareModel( makeModel, preference, priority, {}, {}, {}, {}, {}); if (n != ANEURALNETWORKS_NO_ERROR) { return {n, {}, {}}; } // Prepare device memories for CPU fallback. std::vector memories = mMemories.getObjects(); std::vector isUsedAsInput(memories.size(), false); std::vector isUsedAsOutput(memories.size(), false); std::vector> blobAhwbs; // Mark the input and output usages. for (auto& input : mInputs) { if (input.state() == ModelArgumentInfo::MEMORY) { const uint32_t poolIndex = input.locationAndLength().poolIndex; isUsedAsInput[poolIndex] = true; } } for (auto& output : mOutputs) { if (output.state() == ModelArgumentInfo::MEMORY) { const uint32_t poolIndex = output.locationAndLength().poolIndex; // Cannot allocate output buffers with unknown shapes. if (mMemories[poolIndex]->getValidator().createdWithUnknownShape()) { LOG(ERROR) << "Cannot fallback to CPU because at least one of the output operands " "has unknown shape."; return {ANEURALNETWORKS_OP_FAILED, {}, {}}; } isUsedAsOutput[poolIndex] = true; } } // Allocate BLOB mode AHardwareBuffers and read the data from input device memories. for (uint32_t i = 0; i < memories.size(); i++) { const RuntimeMemory* memory = mMemories[i]; if (memory->getIBuffer() != nullptr) { const uint32_t size = memory->getValidator().getMetadata().logicalSize; auto [nAhwb, blobAhwb] = MemoryRuntimeAHWB::create(size); if (nAhwb != ANEURALNETWORKS_NO_ERROR) { return {nAhwb, {}, {}}; } if (isUsedAsInput[i]) { n = copyIBufferToMemory(memory->getIBuffer(), blobAhwb->getMemory()); if (n != ANEURALNETWORKS_NO_ERROR) { return {n, {}, {}}; } } memories[i] = blobAhwb.get(); blobAhwbs.push_back(std::move(blobAhwb)); } } const MeasureTiming measure = measureTiming(mExecutionBuilder); const OptionalDuration loopTimeoutDuration = makeTimeoutDuration(mExecutionBuilder->getLoopTimeoutDuration()); auto [nExecute, outputShapes, timing] = preparedModel->execute( mInputs, mOutputs, memories, nullptr, measure, {}, loopTimeoutDuration, {}); mExecutionBuilder->reportTimingWithoutFencedExecutionCallback(timing); if (nExecute != ANEURALNETWORKS_NO_ERROR) { return {nExecute, std::move(outputShapes), timing}; } // Write back to output device memories. 
    for (uint32_t i = 0; i < memories.size(); i++) {
        const RuntimeMemory* memory = mMemories[i];
        if (memory->getIBuffer() != nullptr && isUsedAsOutput[i]) {
            n = copyMemoryToIBuffer(memories[i]->getMemory(), memory->getIBuffer(), {});
            if (n != ANEURALNETWORKS_NO_ERROR) {
                return {n, {}, {}};
            }
        }
    }

    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

}  // namespace nn
}  // namespace android