diff --git a/cmake/onnxruntime_providers_vitisai.cmake b/cmake/onnxruntime_providers_vitisai.cmake index d40ae17e40545..d59c944c8926f 100644 --- a/cmake/onnxruntime_providers_vitisai.cmake +++ b/cmake/onnxruntime_providers_vitisai.cmake @@ -19,7 +19,16 @@ "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" ) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_vitisai_cc_srcs}) - onnxruntime_add_shared_library(onnxruntime_providers_vitisai ${onnxruntime_providers_vitisai_cc_srcs}) + set(onnxruntime_providers_vitisai_all_srcs ${onnxruntime_providers_vitisai_cc_srcs}) + if(WIN32) + # Sets the DLL version info on Windows: https://learn.microsoft.com/en-us/windows/win32/menurc/versioninfo-resource + list(APPEND onnxruntime_providers_vitisai_all_srcs "${ONNXRUNTIME_ROOT}/core/providers/vitisai/onnxruntime_providers_vitisai.rc") + endif() + onnxruntime_add_shared_library(onnxruntime_providers_vitisai ${onnxruntime_providers_vitisai_all_srcs}) + if(WIN32) + # FILE_NAME preprocessor definition is used in onnxruntime_providers_vitisai.rc + target_compile_definitions(onnxruntime_providers_vitisai PRIVATE FILE_NAME=\"onnxruntime_providers_vitisai.dll\") + endif() onnxruntime_add_include_to_target(onnxruntime_providers_vitisai ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} safeint_interface flatbuffers::flatbuffers Boost::mp11) target_link_libraries(onnxruntime_providers_vitisai PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED} ${ABSEIL_LIBS}) if(MSVC) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index d4e023b0f86a0..9ae3e79d86443 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1786,7 +1786,7 @@ endif() endif() endif() -if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") +if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT onnxruntime_CUDA_MINIMAL) set(custom_op_src_patterns "${TEST_SRC_DIR}/testdata/custom_op_library/*.h" diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs index 81d4f2589151b..fa1914f2a927b 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs @@ -452,6 +452,36 @@ public struct OrtApi public IntPtr Graph_GetModelMetadata; public IntPtr GetModelCompatibilityForEpDevices; public IntPtr CreateExternalInitializerInfo; + + // v1.24 APIs + public IntPtr TensorTypeAndShape_HasShape; + public IntPtr KernelInfo_GetConfigEntries; + public IntPtr KernelInfo_GetOperatorDomain; + public IntPtr KernelInfo_GetOperatorType; + public IntPtr KernelInfo_GetOperatorSinceVersion; + public IntPtr GetInteropApi; + public IntPtr SessionGetEpDeviceForOutputs; + public IntPtr GetNumHardwareDevices; + public IntPtr GetHardwareDevices; + public IntPtr GetHardwareDeviceEpIncompatibilityDetails; + public IntPtr DeviceEpIncompatibilityDetails_GetReasonsBitmask; + public IntPtr DeviceEpIncompatibilityDetails_GetNotes; + public IntPtr DeviceEpIncompatibilityDetails_GetErrorCode; + public IntPtr ReleaseDeviceEpIncompatibilityDetails; + public IntPtr GetCompatibilityInfoFromModel; + public IntPtr GetCompatibilityInfoFromModelBytes; + public IntPtr CreateEnvWithOptions; + public IntPtr Session_GetEpGraphAssignmentInfo; + public IntPtr EpAssignedSubgraph_GetEpName; + public IntPtr EpAssignedSubgraph_GetNodes; + public IntPtr EpAssignedNode_GetName; + public IntPtr EpAssignedNode_GetDomain; + public IntPtr EpAssignedNode_GetOperatorType; + public IntPtr RunOptionsSetSyncStream; + public IntPtr GetTensorElementTypeAndShapeDataReference; + // v1.25 APIs + public IntPtr RunOptionsEnableProfiling; + public IntPtr RunOptionsDisableProfiling; } internal static class NativeMethods @@ -884,6 +914,16 @@ static NativeMethods() (DOrtCopyTensors)Marshal.GetDelegateForFunctionPointer( api_.CopyTensors, typeof(DOrtCopyTensors)); + + OrtGetCompatibilityInfoFromModel = + (DOrtGetCompatibilityInfoFromModel)Marshal.GetDelegateForFunctionPointer( + api_.GetCompatibilityInfoFromModel, + typeof(DOrtGetCompatibilityInfoFromModel)); + + OrtGetCompatibilityInfoFromModelBytes = + (DOrtGetCompatibilityInfoFromModelBytes)Marshal.GetDelegateForFunctionPointer( + api_.GetCompatibilityInfoFromModelBytes, + typeof(DOrtGetCompatibilityInfoFromModelBytes)); } internal class NativeLib @@ -3092,6 +3132,31 @@ public delegate IntPtr DOrtEpSelectionDelegate( public static DOrtReleasePrepackedWeightsContainer OrtReleasePrepackedWeightsContainer; + /// + /// Extract EP compatibility info from a precompiled model file. + /// + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtGetCompatibilityInfoFromModel( + byte[] /* const ORTCHAR_T* */ model_path, + byte[] /* const char* */ ep_type, + IntPtr /* OrtAllocator* */ allocator, + out IntPtr /* char** */ compatibility_info); + + public static DOrtGetCompatibilityInfoFromModel OrtGetCompatibilityInfoFromModel; + + /// + /// Extract EP compatibility info from precompiled model bytes in memory. + /// + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtGetCompatibilityInfoFromModelBytes( + byte[] /* const void* */ model_data, + UIntPtr /* size_t */ model_data_length, + byte[] /* const char* */ ep_type, + IntPtr /* OrtAllocator* */ allocator, + out IntPtr /* char** */ compatibility_info); + + public static DOrtGetCompatibilityInfoFromModelBytes OrtGetCompatibilityInfoFromModelBytes; + #endregion } // class NativeMethods diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs index 22f541e2207fa..0876db3f21209 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs @@ -524,6 +524,75 @@ public OrtCompiledModelCompatibility GetModelCompatibilityForEpDevices( return (OrtCompiledModelCompatibility)status; } + /// + /// Extract EP compatibility info from a precompiled model file. + /// + /// + /// Parses the model file to extract the compatibility info string for a specific execution provider + /// from the model's metadata properties. This is only applicable to models that have been precompiled + /// for an EP. Standard ONNX models do not contain this information. + /// The compatibility info can then be passed to to + /// check if a precompiled model is compatible with the current system. + /// + /// Path to the ONNX model file. + /// The execution provider type string. Use to get this value. + /// The compatibility info string, or null if no compatibility info exists for the specified EP. + /// If modelPath or epType is null or empty. + /// If the model file cannot be read or parsed. + public string GetCompatibilityInfoFromModel(string modelPath, string epType) + { + if (string.IsNullOrEmpty(modelPath)) + throw new ArgumentException("modelPath must be non-empty", nameof(modelPath)); + if (string.IsNullOrEmpty(epType)) + throw new ArgumentException("epType must be non-empty", nameof(epType)); + + var allocator = OrtAllocator.DefaultInstance; + var pathBytes = NativeOnnxValueHelper.GetPlatformSerializedString(modelPath); + var epTypeUtf8 = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(epType); + + NativeApiStatus.VerifySuccess( + NativeMethods.OrtGetCompatibilityInfoFromModel( + pathBytes, epTypeUtf8, allocator.Pointer, out IntPtr compatInfoPtr)); + + if (compatInfoPtr == IntPtr.Zero) + return null; + + return NativeOnnxValueHelper.StringFromNativeUtf8(compatInfoPtr, allocator); + } + + /// + /// Extract EP compatibility info from precompiled model bytes in memory. + /// + /// + /// Same as but reads from a memory buffer instead of a file. + /// Useful when precompiled models are loaded from encrypted storage, network, or other non-file sources. + /// + /// The model data bytes. + /// The execution provider type string. Use to get this value. + /// The compatibility info string, or null if no compatibility info exists for the specified EP. + /// If modelData is null/empty or epType is null or empty. + /// If the model data cannot be parsed. + public string GetCompatibilityInfoFromModelBytes(byte[] modelData, string epType) + { + if (modelData == null || modelData.Length == 0) + throw new ArgumentException("modelData must be non-empty", nameof(modelData)); + if (string.IsNullOrEmpty(epType)) + throw new ArgumentException("epType must be non-empty", nameof(epType)); + + var allocator = OrtAllocator.DefaultInstance; + var epTypeUtf8 = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(epType); + + NativeApiStatus.VerifySuccess( + NativeMethods.OrtGetCompatibilityInfoFromModelBytes( + modelData, (UIntPtr)modelData.Length, epTypeUtf8, + allocator.Pointer, out IntPtr compatInfoPtr)); + + if (compatInfoPtr == IntPtr.Zero) + return null; + + return NativeOnnxValueHelper.StringFromNativeUtf8(compatInfoPtr, allocator); + } + /// /// Get/Set log level property of OrtEnv instance diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/EpCompatibilityTests.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/EpCompatibilityTests.cs index 103fe5bc10106..f1b792454f205 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/EpCompatibilityTests.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/EpCompatibilityTests.cs @@ -10,6 +10,8 @@ namespace Microsoft.ML.OnnxRuntime.Tests; using System.Linq; using Xunit; using System.Collections.Generic; +using Google.Protobuf; +using Onnx; public class EpCompatibilityTests { @@ -23,6 +25,35 @@ private IReadOnlyList GetDevices() return epDevices; } + /// + /// Creates a minimal valid ONNX ModelProto with optional compatibility metadata. + /// + private static byte[] CreateModelWithCompatibilityMetadata( + Dictionary epCompatibilityInfo = null) + { + var modelProto = new ModelProto(); + modelProto.IrVersion = (long)Onnx.Version.IrVersion; + modelProto.Graph = new GraphProto { Name = "test_graph" }; + + var opset = new OperatorSetIdProto(); + opset.Domain = ""; + opset.Version = 13; + modelProto.OpsetImport.Add(opset); + + if (epCompatibilityInfo != null) + { + foreach (var kvp in epCompatibilityInfo) + { + var prop = new StringStringEntryProto(); + prop.Key = "ep_compatibility_info." + kvp.Key; + prop.Value = kvp.Value; + modelProto.MetadataProps.Add(prop); + } + } + + return modelProto.ToByteArray(); + } + [Fact] public void GetEpCompatibility_InvalidArgs() { @@ -45,5 +76,103 @@ public void GetEpCompatibility_SingleDeviceCpuProvider() // CPU defaults to not applicable in this scenario Assert.Equal(OrtCompiledModelCompatibility.EP_NOT_APPLICABLE, status); } + + [Fact] + public void GetCompatibilityInfoFromModel_InvalidArgs() + { + Assert.Throws(() => ortEnvInstance.GetCompatibilityInfoFromModel(null, "TestEP")); + Assert.Throws(() => ortEnvInstance.GetCompatibilityInfoFromModel("", "TestEP")); + Assert.Throws(() => ortEnvInstance.GetCompatibilityInfoFromModel("model.onnx", null)); + Assert.Throws(() => ortEnvInstance.GetCompatibilityInfoFromModel("model.onnx", "")); + } + + [Fact] + public void GetCompatibilityInfoFromModel_FileNotFound() + { + Assert.Throws( + () => ortEnvInstance.GetCompatibilityInfoFromModel("nonexistent_model_path.onnx", "TestEP")); + } + + [Fact] + public void GetCompatibilityInfoFromModelBytes_InvalidArgs() + { + Assert.Throws(() => ortEnvInstance.GetCompatibilityInfoFromModelBytes(null, "TestEP")); + Assert.Throws(() => ortEnvInstance.GetCompatibilityInfoFromModelBytes(new byte[0], "TestEP")); + Assert.Throws(() => ortEnvInstance.GetCompatibilityInfoFromModelBytes(new byte[] { 1, 2, 3 }, null)); + Assert.Throws(() => ortEnvInstance.GetCompatibilityInfoFromModelBytes(new byte[] { 1, 2, 3 }, "")); + } + + [Fact] + public void GetCompatibilityInfoFromModel_WithMetadata() + { + const string epType = "TestCompatEP"; + const string expectedCompatInfo = "test_compat_v1.0_driver_123"; + + byte[] modelData = CreateModelWithCompatibilityMetadata( + new Dictionary { { epType, expectedCompatInfo } }); + + string tempModelPath = System.IO.Path.Combine( + System.IO.Path.GetTempPath(), + System.IO.Path.GetRandomFileName() + ".onnx"); + + System.IO.File.WriteAllBytes(tempModelPath, modelData); + + try + { + string result = ortEnvInstance.GetCompatibilityInfoFromModel(tempModelPath, epType); + Assert.NotNull(result); + Assert.Equal(expectedCompatInfo, result); + } + finally + { + if (System.IO.File.Exists(tempModelPath)) + { + System.IO.File.Delete(tempModelPath); + } + } + } + + [Fact] + public void GetCompatibilityInfoFromModelBytes_InvalidModelData() + { + byte[] invalidData = System.Text.Encoding.UTF8.GetBytes("this is not a valid ONNX model"); + Assert.Throws( + () => ortEnvInstance.GetCompatibilityInfoFromModelBytes(invalidData, "TestEP")); + } + + [Fact] + public void GetCompatibilityInfoFromModelBytes_WithMetadata() + { + const string epType = "TestCompatEP"; + const string expectedCompatInfo = "test_compat_v1.0_driver_123"; + + byte[] modelData = CreateModelWithCompatibilityMetadata( + new Dictionary { { epType, expectedCompatInfo } }); + + string result = ortEnvInstance.GetCompatibilityInfoFromModelBytes(modelData, epType); + Assert.NotNull(result); + Assert.Equal(expectedCompatInfo, result); + } + + [Fact] + public void GetCompatibilityInfoFromModelBytes_NotFound() + { + // Create model with metadata for a different EP + byte[] modelData = CreateModelWithCompatibilityMetadata( + new Dictionary { { "DifferentEP", "some_value" } }); + + string result = ortEnvInstance.GetCompatibilityInfoFromModelBytes(modelData, "NonExistentEP"); + Assert.Null(result); + } + + [Fact] + public void GetCompatibilityInfoFromModelBytes_NoMetadata() + { + // Create model without any compatibility metadata + byte[] modelData = CreateModelWithCompatibilityMetadata(); + + string result = ortEnvInstance.GetCompatibilityInfoFromModelBytes(modelData, "AnyEP"); + Assert.Null(result); + } } #endif diff --git a/js/react_native/android/CMakeLists.txt b/js/react_native/android/CMakeLists.txt index 0bcf552ff9e41..a23f5ba7cd8ab 100644 --- a/js/react_native/android/CMakeLists.txt +++ b/js/react_native/android/CMakeLists.txt @@ -1,5 +1,5 @@ +cmake_minimum_required(VERSION 3.13) project(OnnxruntimeJSI) -cmake_minimum_required(VERSION 3.9.0) set(PACKAGE_NAME "onnxruntime-react-native") set(BUILD_DIR ${CMAKE_SOURCE_DIR}/build) @@ -97,3 +97,6 @@ target_link_libraries( ${log-lib} # <-- Logcat logger android # <-- Android JNI core ) + +# 16KB page size support (Android 15+ requirement) +target_link_options(onnxruntimejsi PRIVATE "-Wl,-z,max-page-size=16384") diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_base.cc b/onnxruntime/contrib_ops/cpu/bert/attention_base.cc deleted file mode 100644 index 651f270230a75..0000000000000 --- a/onnxruntime/contrib_ops/cpu/bert/attention_base.cc +++ /dev/null @@ -1,341 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "contrib_ops/cpu/bert/attention_base.h" -#include "contrib_ops/cpu/bert/multihead_attention_helper.h" -#include "core/providers/common.h" - -namespace onnxruntime { -namespace contrib { - -Status AttentionBase::CheckInputs(const TensorShape& input_shape, - const TensorShape& weights_shape, - const TensorShape& bias_shape, - const Tensor*& mask_index, - const Tensor* past, - const Tensor* attention_bias, - void* parameters, - const Tensor* past_seq_len) const { - // Abbreviation and Meanings: - // B: batch_size - // S: sequence_length (input sequence length of query) - // P: past_sequence_length (past sequence length of key or value) - // L: kv_sequence_length (input sequence length of key or value) - // M: max_sequence_length - // T: total_sequence_length = past_sequence_length + kv_sequence_length - // N: num_heads - // H: head size for Q and K, aka q_head_size or k_head_size or qk_head_size - // H_v: v_head_size - // D_i: input hidden size - // D: hidden size for Q and K (D = N * H), aka q_hidden_size or k_hidden_size or qk_hidden_size - // D_v: v_hidden_size = num_heads * v_head_size - - // When past state is used, Q, K and V should have same hidden size (unless we split it into past_key and past_value). - - // Input shapes: - // input (Q/K/V) : (B, S, D_i) - // weights (Q/K/V) : (D_i, D + D + D_v) - // bias (Q/K/V) : (D + D + D_v) - // mask_index : see below - // past (K/V) : (2, B, N, P, H) or NULL - // attention_bias : (B or 1, N or 1, S, T) or NULL - - // For mask_index, the following shapes are supported: - // NULL, (B, 1), (1, 1) - // (B), (2 * B), (3 * B + 2) - // (B, T) - // (B, S, T) - // (B, 1, M, M) - // - // When a model is pruned (like some attention heads are removed in Q/K/V), input_hidden_size could be larger - // than hidden dimension of Q, K and V. - - if (past != nullptr && attention_bias != nullptr) { - // past is used on GPT-2 model with past state, we don't have a case for attention bias yet - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Attention cannot have both past and attention_bias"); - } - - const auto& dims = input_shape.GetDims(); - if (dims.size() != 3) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'input' is expected to have 3 dimensions, got ", - dims.size()); - } - - auto& batch_size = dims[0]; - auto& sequence_length = dims[1]; - int64_t input_hidden_size = dims[2]; - - const auto& bias_dims = bias_shape.GetDims(); - if (bias_dims.size() != 1) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'bias' is expected to have 1 dimension, got ", - bias_dims.size()); - } - - const auto& weights_dims = weights_shape.GetDims(); - if (weights_dims.size() != 2) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'weights' is expected to have 2 dimensions, got ", - weights_dims.size()); - } - if (weights_dims[0] != input_hidden_size) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 1 dimension 0 should have same length as dimension 2 of input 0"); - } - - if (bias_dims[0] != weights_dims[1]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'bias' dimension 0 should have same length as dimension 1 of input 'weights'"); - } - - int64_t q_hidden_size = bias_dims[0] / static_cast(3); - int64_t k_hidden_size = q_hidden_size; - int64_t v_hidden_size = k_hidden_size; - if (qkv_hidden_sizes_.size() != 0) { - if (qkv_hidden_sizes_.size() != 3) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "qkv_hidden_sizes attribute should have 3 elements"); - } - - for (size_t i = 0; i < qkv_hidden_sizes_.size(); i++) { - if (qkv_hidden_sizes_[i] % num_heads_ != 0) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "hidden_size should be divisible by num_heads:", qkv_hidden_sizes_[i]); - } - } - - q_hidden_size = qkv_hidden_sizes_[0]; - k_hidden_size = qkv_hidden_sizes_[1]; - v_hidden_size = qkv_hidden_sizes_[2]; - } - - int64_t kv_sequence_length = sequence_length; - - if (q_hidden_size != k_hidden_size) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "qkv_hidden_sizes first element should be same as the second"); - } - - if (this->require_same_hidden_size_ && k_hidden_size != v_hidden_size) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Hidden size of Q, K and V shall be same"); - } - - if (bias_dims[0] != q_hidden_size + k_hidden_size + v_hidden_size) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'bias' dimension 0 should have same length as sum of Q/K/V hidden sizes:", - " q_hidden_size=", q_hidden_size, " k_hidden_size=", k_hidden_size, " v_hidden_size=", - v_hidden_size, "bias_dims[0]=", bias_dims[0]); - } - - int64_t past_sequence_length = 0; - if (past != nullptr) { // past is optional - if (k_hidden_size != v_hidden_size) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'past' expect k_hidden_size == v_hidden_size"); - } - - const auto& past_dims = past->Shape().GetDims(); - if (past_dims.size() != 5) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'past' is expected to have 5 dimension, got ", - past_dims.size()); - } - - if (past_dims[0] != 2) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'past' dimension 0 shall have length of 2"); - } - - if (past_dims[1] != batch_size) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Inputs 'past' dimension 1 shall have same length as dimension 0 of input 0"); - } - - if (static_cast(past_dims[2]) != num_heads_) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Inputs 'past' dimension 2 shall have length of num_heads", num_heads_); - } - - if (static_cast(past_dims[4]) != k_hidden_size / num_heads_) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Inputs 'past' dimension 2 shall have length of ", k_hidden_size / num_heads_); - } - - if (!past_present_share_buffer_) { - past_sequence_length = past_dims[3]; - } else { - if (past_seq_len == nullptr || !onnxruntime::IsScalarOr1ElementVector(past_seq_len)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "past_sequence_length tensor must be of one element when past_present_share_buffer is set"); - } - past_sequence_length = *past_seq_len->Data(); - } - } - - int64_t total_sequence_length = kv_sequence_length + past_sequence_length; - if (past != nullptr && past_present_share_buffer_) { - const auto& past_dims = past->Shape().GetDims(); - if (past_dims[3] < total_sequence_length) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "when past_present_share_buffer, past tensor sequence must not smaller than total_sequqnce_length "); - } - } - - int64_t max_sequence_length = -1; - AttentionMaskType mask_type = AttentionMaskType::MASK_NONE; - if (mask_index != nullptr) { // mask_index is optional - mask_type = AttentionMaskType::MASK_UNKNOWN; - auto status = this->CheckMask(mask_index, mask_type, - max_sequence_length, batch_size, sequence_length, total_sequence_length); - if (status != Status::OK()) { - return status; - } - - if (mask_type == AttentionMaskType::MASK_2D_DUMMY) { - mask_index = nullptr; - mask_type = AttentionMaskType::MASK_NONE; - } - } - - gsl::span attention_bias_dims; - if (attention_bias != nullptr) { - attention_bias_dims = attention_bias->Shape().GetDims(); - - ORT_RETURN_IF_ERROR(multihead_attention_helper::CheckAttentionBias( - attention_bias_dims, batch_size, num_heads_, sequence_length, total_sequence_length)); - } - - if (past != nullptr && past_present_share_buffer_) { - if (max_sequence_length <= 0) { - max_sequence_length = past->Shape().GetDims()[3]; - } - if (max_sequence_length != past->Shape().GetDims()[3]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "max_sequence_length not matching from mask and past when past_present_share_buffer_ is set"); - } - } - - if (parameters != nullptr) { - AttentionParameters* output_parameters = reinterpret_cast(parameters); - output_parameters->batch_size = static_cast(batch_size); - output_parameters->sequence_length = static_cast(sequence_length); - output_parameters->past_sequence_length = static_cast(past_sequence_length); - output_parameters->kv_sequence_length = static_cast(kv_sequence_length); - output_parameters->total_sequence_length = static_cast(total_sequence_length); - output_parameters->max_sequence_length = static_cast(max_sequence_length); - output_parameters->input_hidden_size = static_cast(input_hidden_size); - output_parameters->hidden_size = static_cast(q_hidden_size); - output_parameters->v_hidden_size = static_cast(v_hidden_size); - output_parameters->head_size = static_cast(q_hidden_size) / num_heads_; - output_parameters->v_head_size = static_cast(v_hidden_size) / num_heads_; - output_parameters->num_heads = num_heads_; - output_parameters->is_unidirectional = is_unidirectional_; - output_parameters->past_present_share_buffer = (past_present_share_buffer_ != 0 && past != nullptr); - output_parameters->do_rotary = do_rotary_; - output_parameters->rotary_dim = rotary_embedding_ == 0 ? (int)(output_parameters->head_size) : rotary_embedding_; - output_parameters->mask_filter_value = mask_filter_value_; - output_parameters->scale = scale_; - output_parameters->mask_type = mask_type; - output_parameters->broadcast_attn_bias_dim_0 = attention_bias_dims.size() > 0 && attention_bias_dims[0] == 1; - output_parameters->broadcast_attn_bias_dim_1 = attention_bias_dims.size() > 1 && attention_bias_dims[1] == 1; - output_parameters->qkv_format = Q_K_V_BNSH; - } - - return Status::OK(); -} - -Status AttentionBase::CheckMask(const Tensor* mask_index, - AttentionMaskType& mask_type, - int64_t& max_sequence_length, - int64_t batch_size, - int64_t sequence_length, - int64_t total_sequence_length) const { - const auto& mask_dims = mask_index->Shape().GetDims(); - if (mask_dims.size() == 1) { - if (mask_dims[0] != batch_size && mask_dims[0] != 2 * batch_size && mask_dims[0] != 3 * batch_size + 2) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Inputs 'mask_index' with 1D data shall have length of batch_size or 2 * batch_size or 3 * batch_size + 2"); - } - mask_type = (mask_dims[0] == batch_size ? AttentionMaskType::MASK_1D_KEY_SEQ_LEN : mask_dims[0] == 2 * batch_size ? AttentionMaskType::MASK_1D_END_START - : AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START); - } else if (mask_dims.size() == 2) { - if (mask_dims[0] == batch_size && mask_dims[1] == total_sequence_length) { - mask_type = AttentionMaskType::MASK_2D_KEY_PADDING; - } else { - // Add operator supports broadcasting. Here we handle a case with only one element in the 2nd dimension. - if ((mask_dims[0] == batch_size || mask_dims[0] == 1) && mask_dims[1] == 1) { - // Mask will have same value after propagation, which has same effect as no mask. - mask_type = AttentionMaskType::MASK_2D_DUMMY; - } else { - return ORT_MAKE_STATUS( - ONNXRUNTIME, INVALID_ARGUMENT, - "Inputs 'mask_index' with 2D data shall have shape " - "batch_size x total_sequence_length"); - } - } - } else if (mask_dims.size() == 3) { - if (mask_dims[0] != batch_size || mask_dims[1] != sequence_length || mask_dims[2] != total_sequence_length) { - return ORT_MAKE_STATUS( - ONNXRUNTIME, INVALID_ARGUMENT, - "Inputs 'mask_index' with 3D data shall have shape " - "batch_size x sequence_length x total_sequence_length"); - } - mask_type = AttentionMaskType::MASK_3D_ATTENTION; - } else if (mask_dims.size() == 4) { - if (mask_dims[0] != batch_size || mask_dims[1] != 1 || mask_dims[2] != mask_dims[3] || - mask_dims[2] < total_sequence_length) { - return ORT_MAKE_STATUS( - ONNXRUNTIME, INVALID_ARGUMENT, - "Inputs 'mask_index' with 4D data shall have shape " - "batch_size x 1 x max_sequence_length x max_sequence_length)"); - } - max_sequence_length = mask_dims[3]; - mask_type = AttentionMaskType::MASK_4D_MEGATRON; - if (this->is_unidirectional_) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Inputs 'mask_index' with 4D data shall have is_unidirectional set to false"); - } - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'mask_index' is expected to have 1, 2, 3 or 4 dimensions, got ", - mask_dims.size()); - } - - return Status::OK(); -} - -Status AttentionBase::CheckInputs(const TensorShape& input_shape, - const TensorShape& weights_shape, - const TensorShape& bias_shape, - const Tensor*& mask_index, - const Tensor* past, - const Tensor* attention_bias, - void* parameters, - const int max_threads_per_block, - const Tensor* past_seq_len) const { - if (num_heads_ > max_threads_per_block) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "num_heads should be no larger than ", max_threads_per_block); - } - - return CheckInputs(input_shape, weights_shape, bias_shape, mask_index, past, attention_bias, parameters, past_seq_len); -} - -Tensor* AttentionBase::GetPresent(OpKernelContext* context, - const Tensor* past, - int batch_size, - int head_size, - int kv_sequence_length, - int& past_sequence_length) const { - // Input and output shapes: - // past : (2, batch_size, num_heads, past_sequence_length, head_size) - // present : (2, batch_size, num_heads, past_sequence_length + kv_sequence_length, head_size) - - past_sequence_length = (nullptr != past) ? static_cast(past->Shape().GetDims()[3]) : 0; - std::array present_dims{2, batch_size, num_heads_, static_cast(kv_sequence_length) + past_sequence_length, head_size}; - - TensorShape present_shape(present_dims); - Tensor* present = context->Output(1, present_shape); - if (nullptr != past && nullptr == present) { - ORT_THROW("Expect to have present state output when past state input is given"); - } - - return present; -} - -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_base.h index bd7f03379b2f0..2872fcfda5bbf 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_base.h @@ -3,12 +3,19 @@ #pragma once +#include #include #include "core/common/common.h" -#include "core/framework/op_kernel.h" #include "core/providers/cpu/mlas_backend_kernel_selector_config_utils.h" +#ifndef SHARED_PROVIDER +#include "core/framework/op_kernel.h" +#include "core/providers/common.h" +#endif #include "contrib_ops/cpu/bert/attention_common.h" #include "contrib_ops/cpu/bert/attention_parameters.h" +#ifndef SHARED_PROVIDER +#include "contrib_ops/cpu/bert/multihead_attention_helper.h" +#endif namespace onnxruntime { namespace contrib { @@ -25,14 +32,25 @@ class AttentionBase { const int max_threads_per_block, // for CUDA const Tensor* past_seq_len = nullptr) const; +#ifdef SHARED_PROVIDER Tensor* GetPresent(OpKernelContext* context, const Tensor* past, int batch_size, int head_size, int kv_sequence_length, int& past_sequence_length) const; +#else + template + Tensor* GetPresent(TOpKernelContext* context, + const Tensor* past, + int batch_size, + int head_size, + int kv_sequence_length, + int& past_sequence_length) const; +#endif protected: + // Keep the class layout identical in SHARED_PROVIDER and non-SHARED_PROVIDER builds. MLAS_BACKEND_KERNEL_SELECTOR_CONFIG mlas_backend_kernel_selector_config_; template @@ -54,7 +72,9 @@ class AttentionBase { require_same_hidden_size_ = require_same_hidden_size; +#ifndef SHARED_PROVIDER SetupMlasBackendKernelSelectorFromConfigOptions(mlas_backend_kernel_selector_config_, info.GetConfigOptions()); +#endif } Status CheckMask(const Tensor* mask_index, @@ -84,5 +104,299 @@ class AttentionBase { float scale_; // the scale to be used for softmax }; +#ifndef SHARED_PROVIDER +// Inline implementations of out-of-line methods for non-SHARED_PROVIDER builds +// (attention_base.cc definitions are used only in the SHARED_PROVIDER bridge path). +inline Status AttentionBase::CheckMask(const Tensor* mask_index, + AttentionMaskType& mask_type, + int64_t& max_sequence_length, + int64_t batch_size, + int64_t sequence_length, + int64_t total_sequence_length) const { + const auto& mask_dims = mask_index->Shape().GetDims(); + if (mask_dims.size() == 1) { + if (mask_dims[0] != batch_size && mask_dims[0] != 2 * batch_size && mask_dims[0] != 3 * batch_size + 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Inputs 'mask_index' with 1D data shall have length of batch_size or 2 * batch_size or 3 * batch_size + 2"); + } + mask_type = (mask_dims[0] == batch_size ? AttentionMaskType::MASK_1D_KEY_SEQ_LEN : mask_dims[0] == 2 * batch_size ? AttentionMaskType::MASK_1D_END_START + : AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START); + } else if (mask_dims.size() == 2) { + if (mask_dims[0] == batch_size && mask_dims[1] == total_sequence_length) { + mask_type = AttentionMaskType::MASK_2D_KEY_PADDING; + } else { + if ((mask_dims[0] == batch_size || mask_dims[0] == 1) && mask_dims[1] == 1) { + mask_type = AttentionMaskType::MASK_2D_DUMMY; + } else { + return ORT_MAKE_STATUS( + ONNXRUNTIME, INVALID_ARGUMENT, + "Inputs 'mask_index' with 2D data shall have shape " + "batch_size x total_sequence_length"); + } + } + } else if (mask_dims.size() == 3) { + if (mask_dims[0] != batch_size || mask_dims[1] != sequence_length || mask_dims[2] != total_sequence_length) { + return ORT_MAKE_STATUS( + ONNXRUNTIME, INVALID_ARGUMENT, + "Inputs 'mask_index' with 3D data shall have shape " + "batch_size x sequence_length x total_sequence_length"); + } + mask_type = AttentionMaskType::MASK_3D_ATTENTION; + } else if (mask_dims.size() == 4) { + if (mask_dims[0] != batch_size || mask_dims[1] != 1 || mask_dims[2] != mask_dims[3] || + mask_dims[2] < total_sequence_length) { + return ORT_MAKE_STATUS( + ONNXRUNTIME, INVALID_ARGUMENT, + "Inputs 'mask_index' with 4D data shall have shape " + "batch_size x 1 x max_sequence_length x max_sequence_length)"); + } + max_sequence_length = mask_dims[3]; + mask_type = AttentionMaskType::MASK_4D_MEGATRON; + if (this->is_unidirectional_) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Inputs 'mask_index' with 4D data shall have is_unidirectional set to false"); + } + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'mask_index' is expected to have 1, 2, 3 or 4 dimensions, got ", + mask_dims.size()); + } + + return Status::OK(); +} + +inline Status AttentionBase::CheckInputs(const TensorShape& input_shape, + const TensorShape& weights_shape, + const TensorShape& bias_shape, + const Tensor*& mask_index, + const Tensor* past, + const Tensor* attention_bias, + void* parameters, + const Tensor* past_seq_len) const { + if (past != nullptr && attention_bias != nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Attention cannot have both past and attention_bias"); + } + + const auto& dims = input_shape.GetDims(); + if (dims.size() != 3) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'input' is expected to have 3 dimensions, got ", + dims.size()); + } + + auto& batch_size = dims[0]; + auto& sequence_length = dims[1]; + int64_t input_hidden_size = dims[2]; + + const auto& bias_dims = bias_shape.GetDims(); + if (bias_dims.size() != 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'bias' is expected to have 1 dimension, got ", + bias_dims.size()); + } + + const auto& weights_dims = weights_shape.GetDims(); + if (weights_dims.size() != 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'weights' is expected to have 2 dimensions, got ", + weights_dims.size()); + } + if (weights_dims[0] != input_hidden_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 1 dimension 0 should have same length as dimension 2 of input 0"); + } + + if (bias_dims[0] != weights_dims[1]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'bias' dimension 0 should have same length as dimension 1 of input 'weights'"); + } + + int64_t q_hidden_size = bias_dims[0] / static_cast(3); + int64_t k_hidden_size = q_hidden_size; + int64_t v_hidden_size = k_hidden_size; + if (qkv_hidden_sizes_.size() != 0) { + if (qkv_hidden_sizes_.size() != 3) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "qkv_hidden_sizes attribute should have 3 elements"); + } + + for (size_t i = 0; i < qkv_hidden_sizes_.size(); i++) { + if (qkv_hidden_sizes_[i] % num_heads_ != 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "hidden_size should be divisible by num_heads:", qkv_hidden_sizes_[i]); + } + } + + q_hidden_size = qkv_hidden_sizes_[0]; + k_hidden_size = qkv_hidden_sizes_[1]; + v_hidden_size = qkv_hidden_sizes_[2]; + } + + int64_t kv_sequence_length = sequence_length; + + if (q_hidden_size != k_hidden_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "qkv_hidden_sizes first element should be same as the second"); + } + + if (this->require_same_hidden_size_ && k_hidden_size != v_hidden_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Hidden size of Q, K and V shall be same"); + } + + if (bias_dims[0] != q_hidden_size + k_hidden_size + v_hidden_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'bias' dimension 0 should have same length as sum of Q/K/V hidden sizes:", + " q_hidden_size=", q_hidden_size, " k_hidden_size=", k_hidden_size, " v_hidden_size=", + v_hidden_size, "bias_dims[0]=", bias_dims[0]); + } + + int64_t past_sequence_length = 0; + if (past != nullptr) { + if (k_hidden_size != v_hidden_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'past' expect k_hidden_size == v_hidden_size"); + } + + const auto& past_dims = past->Shape().GetDims(); + if (past_dims.size() != 5) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'past' is expected to have 5 dimension, got ", + past_dims.size()); + } + + if (past_dims[0] != 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'past' dimension 0 shall have length of 2"); + } + + if (past_dims[1] != batch_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Inputs 'past' dimension 1 shall have same length as dimension 0 of input 0"); + } + + if (static_cast(past_dims[2]) != num_heads_) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Inputs 'past' dimension 2 shall have length of num_heads", num_heads_); + } + + if (static_cast(past_dims[4]) != k_hidden_size / num_heads_) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Inputs 'past' dimension 2 shall have length of ", k_hidden_size / num_heads_); + } + + if (!past_present_share_buffer_) { + past_sequence_length = past_dims[3]; + } else { + if (past_seq_len == nullptr || !::onnxruntime::IsScalarOr1ElementVector(past_seq_len)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "past_sequence_length tensor must be of one element when past_present_share_buffer is set"); + } + past_sequence_length = *past_seq_len->Data(); + } + } + + int64_t total_sequence_length = kv_sequence_length + past_sequence_length; + if (past != nullptr && past_present_share_buffer_) { + const auto& past_dims = past->Shape().GetDims(); + if (past_dims[3] < total_sequence_length) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "when past_present_share_buffer, past tensor sequence must not smaller than total_sequence_length "); + } + } + + int64_t max_sequence_length = -1; + AttentionMaskType mask_type = AttentionMaskType::MASK_NONE; + if (mask_index != nullptr) { + mask_type = AttentionMaskType::MASK_UNKNOWN; + auto status = this->CheckMask(mask_index, mask_type, + max_sequence_length, batch_size, sequence_length, total_sequence_length); + if (status != Status::OK()) { + return status; + } + + if (mask_type == AttentionMaskType::MASK_2D_DUMMY) { + mask_index = nullptr; + mask_type = AttentionMaskType::MASK_NONE; + } + } + + gsl::span attention_bias_dims; + if (attention_bias != nullptr) { + attention_bias_dims = attention_bias->Shape().GetDims(); + + ORT_RETURN_IF_ERROR(::onnxruntime::contrib::multihead_attention_helper::CheckAttentionBias( + attention_bias_dims, batch_size, num_heads_, sequence_length, total_sequence_length)); + } + + if (past != nullptr && past_present_share_buffer_) { + if (max_sequence_length <= 0) { + max_sequence_length = past->Shape().GetDims()[3]; + } + if (max_sequence_length != past->Shape().GetDims()[3]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "max_sequence_length not matching from mask and past when past_present_share_buffer_ is set"); + } + } + + if (parameters != nullptr) { + AttentionParameters* output_parameters = reinterpret_cast(parameters); + output_parameters->batch_size = static_cast(batch_size); + output_parameters->sequence_length = static_cast(sequence_length); + output_parameters->past_sequence_length = static_cast(past_sequence_length); + output_parameters->kv_sequence_length = static_cast(kv_sequence_length); + output_parameters->total_sequence_length = static_cast(total_sequence_length); + output_parameters->max_sequence_length = static_cast(max_sequence_length); + output_parameters->input_hidden_size = static_cast(input_hidden_size); + output_parameters->hidden_size = static_cast(q_hidden_size); + output_parameters->v_hidden_size = static_cast(v_hidden_size); + output_parameters->head_size = static_cast(q_hidden_size) / num_heads_; + output_parameters->v_head_size = static_cast(v_hidden_size) / num_heads_; + output_parameters->num_heads = num_heads_; + output_parameters->is_unidirectional = is_unidirectional_; + output_parameters->past_present_share_buffer = (past_present_share_buffer_ != 0 && past != nullptr); + output_parameters->do_rotary = do_rotary_; + output_parameters->rotary_dim = rotary_embedding_ == 0 ? (int)(output_parameters->head_size) : rotary_embedding_; + output_parameters->mask_filter_value = mask_filter_value_; + output_parameters->scale = scale_; + output_parameters->mask_type = mask_type; + output_parameters->broadcast_attn_bias_dim_0 = attention_bias_dims.size() > 0 && attention_bias_dims[0] == 1; + output_parameters->broadcast_attn_bias_dim_1 = attention_bias_dims.size() > 1 && attention_bias_dims[1] == 1; + output_parameters->qkv_format = Q_K_V_BNSH; + } + + return Status::OK(); +} + +inline Status AttentionBase::CheckInputs(const TensorShape& input_shape, + const TensorShape& weights_shape, + const TensorShape& bias_shape, + const Tensor*& mask_index, + const Tensor* past, + const Tensor* attention_bias, + void* parameters, + const int max_threads_per_block, + const Tensor* past_seq_len) const { + if (num_heads_ > max_threads_per_block) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "num_heads should be no larger than ", max_threads_per_block); + } + + return CheckInputs(input_shape, weights_shape, bias_shape, mask_index, past, attention_bias, parameters, past_seq_len); +} + +template +inline Tensor* AttentionBase::GetPresent(TOpKernelContext* context, + const Tensor* past, + int batch_size, + int head_size, + int kv_sequence_length, + int& past_sequence_length) const { + past_sequence_length = (nullptr != past) ? static_cast(past->Shape().GetDims()[3]) : 0; + std::array present_dims{2, batch_size, num_heads_, + static_cast(kv_sequence_length) + past_sequence_length, head_size}; + + TensorShape present_shape(present_dims); + Tensor* present = context->Output(1, present_shape); + if (nullptr != past && nullptr == present) { + ORT_THROW("Expect to have present state output when past state input is given"); + } + + return present; +} +#endif // SHARED_PROVIDER + } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_parameters.h b/onnxruntime/contrib_ops/cpu/bert/attention_parameters.h index 9a123e80adc18..f316a0dfdf91c 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_parameters.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_parameters.h @@ -10,33 +10,33 @@ namespace contrib { // Parameters deduced from node attributes and inputs/outputs. struct AttentionParameters { - int batch_size; - int sequence_length; - int kv_sequence_length; // input sequence length of K or V - int past_sequence_length; // sequence length in past state of K or V - int total_sequence_length; // total sequence length of K or V - int max_sequence_length; // max sequence length from 4D mask - int input_hidden_size; // first dimension of weights for input projection - int hidden_size; // hidden size of Q or K - int head_size; // hidden size per head of Q or K - int v_hidden_size; // hidden size of V - int v_head_size; // hidden size per head of V - int num_heads; - int num_splits; // number of splits for splitkv + int batch_size = 0; + int sequence_length = 0; + int kv_sequence_length = 0; // input sequence length of K or V + int past_sequence_length = 0; // sequence length in past state of K or V + int total_sequence_length = 0; // total sequence length of K or V + int max_sequence_length = 0; // max sequence length from 4D mask + int input_hidden_size = 0; // first dimension of weights for input projection + int hidden_size = 0; // hidden size of Q or K + int head_size = 0; // hidden size per head of Q or K + int v_hidden_size = 0; // hidden size of V + int v_head_size = 0; // hidden size per head of V + int num_heads = 0; + int num_splits = 0; // number of splits for splitkv int rotary_dim = 0; // rotary embedding dimension - int beam_width; + int beam_width = 0; bool is_unidirectional = false; bool past_present_share_buffer = false; bool is_packed_qkv = false; // whether qkv is packed bool do_rotary = false; bool broadcast_attn_bias_dim_0 = false; bool broadcast_attn_bias_dim_1 = false; - float mask_filter_value; - float scale; + float mask_filter_value = 0.0f; + float scale = 0.0f; bool use_tf32 = false; bool is_output_bnsh = false; // whether the output format is BNSH - AttentionMaskType mask_type; - AttentionQkvFormat qkv_format; + AttentionMaskType mask_type = AttentionMaskType::MASK_NONE; + AttentionQkvFormat qkv_format = AttentionQkvFormat::Q_K_V_BNSH; }; // Parameters deduced from node attributes and inputs/outputs. diff --git a/onnxruntime/contrib_ops/cpu/bert/longformer_attention_base.cc b/onnxruntime/contrib_ops/cpu/bert/longformer_attention_base.cc deleted file mode 100644 index 97f75d297d789..0000000000000 --- a/onnxruntime/contrib_ops/cpu/bert/longformer_attention_base.cc +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "longformer_attention_base.h" - -namespace onnxruntime { -namespace contrib { - -Status LongformerAttentionBase::CheckInputs(const TensorShape& input_shape, - const TensorShape& weights_shape, - const TensorShape& bias_shape, - const TensorShape& attention_mask_shape, - const TensorShape& global_weights_shape, - const TensorShape& global_bias_shape, - const TensorShape& global_mask_shape) const { - // Input shapes: - // input : (batch_size, sequence_length, hidden_size) - // weights : (hidden_size, 3 * hidden_size) -- format 1 - // (3, hidden_size, hidden_size) -- format 0 - // bias : (3 * hidden_size) -- format 1 (bias for Q, K, V) - // (5 * hidden_size) -- format 0 (bias for Q, K, V, Global_K, Global_V) - // attention_mask : (batch_size, sequence_length) - // global_weights : (hidden_size, 3 * hidden_size) -- format 1 - // (3, hidden_size, hidden_size) -- format 0 - // global_bias : (3 * hidden_size) -- format 1 (bias for Global_Q, Global_K, Global_V) - // (1 * hidden_size) -- format 0 (bias for Global_Q) - // global_attention_mask : (batch_size, sequence_length) - - const auto& dims = input_shape.GetDims(); - if (dims.size() != 3) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'input' is expected to have 3 dimensions, got ", - dims.size()); - } - - int batch_size = static_cast(dims[0]); - int sequence_length = static_cast(dims[1]); - auto hidden_size = dims[2]; - if (sequence_length % (2 * window_) != 0) { - return ORT_MAKE_STATUS( - ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'input' dimension 1 should be divisible by 2W, where W is value of the window attribute."); - } - if (hidden_size % num_heads_ != 0) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'input' dimension 2 should be divisible by value of the num_heads attribute."); - } - - const auto& weights_dims = weights_shape.GetDims(); - bool use_merged_qkv_weights = (weights_shape.NumDimensions() == 2); - if (use_merged_qkv_weights) { - if (weights_dims[0] != hidden_size || weights_dims[1] != 3 * hidden_size) { - return ORT_MAKE_STATUS( - ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'weights' shape should be (hidden_size, 3 * hidden_size) for format 1"); - } - } else { - if (weights_dims.size() != 3 || - weights_dims[0] != 3 || weights_dims[1] != hidden_size || weights_dims[2] != hidden_size) { - return ORT_MAKE_STATUS( - ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'weights' shape should be (3, hidden_size, hidden_size) for format 0"); - } - } - - const auto& bias_dims = bias_shape.GetDims(); - if (bias_dims.size() != 1) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'bias' is expected to have 1 dimension, got ", - bias_dims.size()); - } - - if (use_merged_qkv_weights) { - if (bias_dims[0] != 3 * hidden_size) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'bias' shape should be (3 * hidden_size) for format 1"); - } - } else { - if (bias_dims[0] != 5 * hidden_size) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'bias' shape should be (5 * hidden_size) for format 0"); - } - } - - const auto& mask_dims = attention_mask_shape.GetDims(); - if (mask_dims.size() == 2) { - if (static_cast(mask_dims[0]) != batch_size || static_cast(mask_dims[1]) != sequence_length) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Inputs 'attention_mask' shape shall be (batch_size, sequence_length)"); - } - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'attention_mask' is expected to have 2 dimensions, got ", mask_dims.size()); - } - - const auto& global_weights_dims = global_weights_shape.GetDims(); - if (use_merged_qkv_weights) { - if (global_weights_dims.size() != 2 || - global_weights_dims[0] != hidden_size || global_weights_dims[1] != 3 * hidden_size) { - return ORT_MAKE_STATUS( - ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'global_weights' shape should be (hidden_size, 3 * hidden_size) for format 1"); - } - } else { - if (global_weights_dims.size() != 3 || global_weights_dims[0] != 3 || - global_weights_dims[1] != hidden_size || global_weights_dims[2] != hidden_size) { - return ORT_MAKE_STATUS( - ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'global_weights' shape should be (3, hidden_size, hidden_size) for format 0"); - } - } - - const auto& global_bias_dims = global_bias_shape.GetDims(); - if (global_bias_dims.size() != 1) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'global_bias' is expected to have 1 dimension, got ", - global_bias_dims.size()); - } - - if (use_merged_qkv_weights) { - if (global_bias_dims[0] != 3 * hidden_size) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'global_bias' shape should be (3 * hidden_size) for format 1"); - } - } else { - if (global_bias_dims[0] != hidden_size) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'global_bias' shape should be (hidden_size) for format 0"); - } - } - - const auto& global_mask_dims = global_mask_shape.GetDims(); - if (global_mask_dims.size() != 2 || - static_cast(global_mask_dims[0]) != batch_size || - static_cast(global_mask_dims[1]) != sequence_length) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'global_attention_mask' shape shall be (batch_size, sequence_length)"); - } - - return Status::OK(); -} - -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/bert/longformer_attention_base.h b/onnxruntime/contrib_ops/cpu/bert/longformer_attention_base.h index ac1cccaa83cf9..bb1dfea38ae80 100644 --- a/onnxruntime/contrib_ops/cpu/bert/longformer_attention_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/longformer_attention_base.h @@ -4,7 +4,9 @@ #pragma once #include "core/common/common.h" +#ifndef SHARED_PROVIDER #include "core/framework/op_kernel.h" +#endif namespace onnxruntime { namespace contrib { @@ -20,7 +22,8 @@ class LongformerAttentionBase { const TensorShape& global_attention_mask_shape) const; protected: - LongformerAttentionBase(const OpKernelInfo& info) { + template + LongformerAttentionBase(const KernelInfoType& info) { int64_t num_heads = 0; ORT_ENFORCE(info.GetAttr("num_heads", &num_heads).IsOK() && num_heads > 0); num_heads_ = static_cast(num_heads); @@ -43,5 +46,126 @@ constexpr const char* kUseHalf4 = "ORT_LONGFORMER_USE_HALF4"; } // namespace longformer +#ifndef SHARED_PROVIDER +// Inline implementation of CheckInputs for non-SHARED_PROVIDER builds. +inline Status LongformerAttentionBase::CheckInputs(const TensorShape& input_shape, + const TensorShape& weights_shape, + const TensorShape& bias_shape, + const TensorShape& attention_mask_shape, + const TensorShape& global_weights_shape, + const TensorShape& global_bias_shape, + const TensorShape& global_mask_shape) const { + const auto& dims = input_shape.GetDims(); + if (dims.size() != 3) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'input' is expected to have 3 dimensions, got ", + dims.size()); + } + + int batch_size = static_cast(dims[0]); + int sequence_length = static_cast(dims[1]); + auto hidden_size = dims[2]; + if (sequence_length % (2 * window_) != 0) { + return ORT_MAKE_STATUS( + ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'input' dimension 1 should be divisible by 2W, where W is value of the window attribute."); + } + if (hidden_size % num_heads_ != 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'input' dimension 2 should be divisible by value of the num_heads attribute."); + } + + const auto& weights_dims = weights_shape.GetDims(); + bool use_merged_qkv_weights = (weights_shape.NumDimensions() == 2); + if (use_merged_qkv_weights) { + if (weights_dims[0] != hidden_size || weights_dims[1] != 3 * hidden_size) { + return ORT_MAKE_STATUS( + ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'weights' shape should be (hidden_size, 3 * hidden_size) for format 1"); + } + } else { + if (weights_dims.size() != 3 || + weights_dims[0] != 3 || weights_dims[1] != hidden_size || weights_dims[2] != hidden_size) { + return ORT_MAKE_STATUS( + ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'weights' shape should be (3, hidden_size, hidden_size) for format 0"); + } + } + + const auto& bias_dims = bias_shape.GetDims(); + if (bias_dims.size() != 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'bias' is expected to have 1 dimension, got ", + bias_dims.size()); + } + + if (use_merged_qkv_weights) { + if (bias_dims[0] != 3 * hidden_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'bias' shape should be (3 * hidden_size) for format 1"); + } + } else { + if (bias_dims[0] != 5 * hidden_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'bias' shape should be (5 * hidden_size) for format 0"); + } + } + + const auto& mask_dims = attention_mask_shape.GetDims(); + if (mask_dims.size() == 2) { + if (static_cast(mask_dims[0]) != batch_size || static_cast(mask_dims[1]) != sequence_length) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Inputs 'attention_mask' shape shall be (batch_size, sequence_length)"); + } + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'attention_mask' is expected to have 2 dimensions, got ", mask_dims.size()); + } + + const auto& global_weights_dims = global_weights_shape.GetDims(); + if (use_merged_qkv_weights) { + if (global_weights_dims.size() != 2 || + global_weights_dims[0] != hidden_size || global_weights_dims[1] != 3 * hidden_size) { + return ORT_MAKE_STATUS( + ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'global_weights' shape should be (hidden_size, 3 * hidden_size) for format 1"); + } + } else { + if (global_weights_dims.size() != 3 || global_weights_dims[0] != 3 || + global_weights_dims[1] != hidden_size || global_weights_dims[2] != hidden_size) { + return ORT_MAKE_STATUS( + ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'global_weights' shape should be (3, hidden_size, hidden_size) for format 0"); + } + } + + const auto& global_bias_dims = global_bias_shape.GetDims(); + if (global_bias_dims.size() != 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'global_bias' is expected to have 1 dimension, got ", + global_bias_dims.size()); + } + + if (use_merged_qkv_weights) { + if (global_bias_dims[0] != 3 * hidden_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'global_bias' shape should be (3 * hidden_size) for format 1"); + } + } else { + if (global_bias_dims[0] != hidden_size) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'global_bias' shape should be (hidden_size) for format 0"); + } + } + + const auto& global_mask_dims = global_mask_shape.GetDims(); + if (global_mask_dims.size() != 2 || + static_cast(global_mask_dims[0]) != batch_size || + static_cast(global_mask_dims[1]) != sequence_length) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'global_attention_mask' shape shall be (batch_size, sequence_length)"); + } + + return Status::OK(); +} +#endif // SHARED_PROVIDER + } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/crop.h b/onnxruntime/contrib_ops/cpu/crop.h index 3b72ef429c1f7..97577304e948e 100644 --- a/onnxruntime/contrib_ops/cpu/crop.h +++ b/onnxruntime/contrib_ops/cpu/crop.h @@ -4,7 +4,9 @@ #pragma once #include "core/common/common.h" +#ifndef SHARED_PROVIDER #include "core/framework/op_kernel.h" +#endif #include @@ -13,9 +15,10 @@ namespace contrib { class CropBase { protected: - CropBase(const OpKernelInfo& info) - : border_(info.GetAttrsOrDefault("border")), - scale_(info.GetAttrsOrDefault("scale")) { + template + CropBase(const KernelInfoType& info) + : border_(info.template GetAttrsOrDefault("border")), + scale_(info.template GetAttrsOrDefault("scale")) { } Status ValidateInput(const Tensor* X) const { diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc index a656abb098911..56bff8aa30f68 100644 --- a/onnxruntime/core/framework/allocator.cc +++ b/onnxruntime/core/framework/allocator.cc @@ -237,9 +237,22 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::AMD, device_id), mem_type1); } else if (strcmp(name1, onnxruntime::WEBGPU_BUFFER) == 0 || - strcmp(name1, onnxruntime::WEBNN_TENSOR) == 0) { + strcmp(name1, onnxruntime::WEBNN_TENSOR) == 0 || + // Accept pre-1.25 names "WebGPU_Buffer"/"WebNN_Tensor" for backward compatibility + // with released onnxruntime-genai that still uses the old names. + // Normalize to the current (short) constant so downstream name comparisons work. + // See: https://github.com/microsoft/onnxruntime/pull/27207 + strcmp(name1, "WebGPU_Buffer") == 0 || + strcmp(name1, "WebNN_Tensor") == 0) { + // Map old long names to current short constants to keep downstream name comparisons consistent. + const char* normalized_name = name1; + if (strcmp(name1, "WebGPU_Buffer") == 0) { + normalized_name = onnxruntime::WEBGPU_BUFFER; + } else if (strcmp(name1, "WebNN_Tensor") == 0) { + normalized_name = onnxruntime::WEBNN_TENSOR; + } *out = new OrtMemoryInfo( - name1, type, + normalized_name, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NONE, device_id), mem_type1); diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 6bcbdc401619a..bee7f048b7c6e 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -349,66 +349,112 @@ Status TensorProtoWithExternalDataToTensorProto( return Status::OK(); } -Status ValidateExternalDataPath(const std::filesystem::path& base_dir, - const std::filesystem::path& location, - const std::filesystem::path& model_path) { - // Reject absolute paths - ORT_RETURN_IF(location.is_absolute(), - "Absolute paths not allowed for external data location"); - if (!base_dir.empty()) { - // Resolve and verify the path stays within model directory - auto base_canonical = std::filesystem::weakly_canonical(base_dir); - // If the symlink exists, it resolves to the target path; - // so if the symlink is outside the directory it would be caught here. - auto resolved = std::filesystem::weakly_canonical(base_dir / location); - - // Check that resolved path starts with base directory - auto [base_end, resolved_it] = std::mismatch( - base_canonical.begin(), base_canonical.end(), - resolved.begin(), resolved.end()); - - if (base_end != base_canonical.end()) { - // If validation against logical base_dir fails, we check against the - // real (canonical) path of the model file to support symlinked models - // (e.g. models in Hugging Face Hub local cache). - if (!model_path.empty()) { - auto real_model_dir = std::filesystem::weakly_canonical(model_path).parent_path(); - - auto [real_base_end, real_resolved_it] = std::mismatch( - real_model_dir.begin(), real_model_dir.end(), - resolved.begin(), resolved.end()); - - if (real_base_end == real_model_dir.end()) { - return Status::OK(); - } +// Wraps std::filesystem::weakly_canonical with error_code handling. +static Status WeaklyCanonicalPath(const std::filesystem::path& path, std::filesystem::path& result) { + std::error_code ec; + result = std::filesystem::weakly_canonical(path, ec); + ORT_RETURN_IF(ec, "Failed to get the weakly canonical path: ", path, " - ", ec.message()); + return Status::OK(); +} - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, - "External data path: ", location, " (resolved path: ", resolved, - ") escapes both model directory: ", base_dir, - " and real model directory: ", real_model_dir); - } +// Wraps std::filesystem::exists with error_code handling. +static Status PathExists(const std::filesystem::path& path, bool& exists) { + std::error_code ec; + exists = std::filesystem::exists(path, ec); + ORT_RETURN_IF(ec, "Failed to check existence of path: ", path, " - ", ec.message()); + return Status::OK(); +} - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, - "External data path: ", location, " (resolved path: ", resolved, - ") escapes model directory: ", base_dir); - } - } else { - // The basedir is empty, which occurs when 1) the session loads a model from bytes and 2) the application does not - // set an external file folder path via the session config option - // `kOrtSessionOptionsModelExternalInitializersFileFolderPath`. - - // We conservatively check that the normalized relative path does not contain ".." path components that would allow - // access to arbitrary files outside of the current working directory. Based on ONNX checker validation. - auto norm_location = location.lexically_normal(); - - for (const auto& path_component : norm_location) { - if (path_component == ORT_TSTR("..")) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "External data path: ", location, - " (model loaded from bytes) escapes working directory"); - } +// Checks whether `path` has the given path prefix. +static bool HasPathComponentPrefix(const std::filesystem::path& prefix, const std::filesystem::path& path) { + auto [prefix_end, path_it] = std::mismatch(prefix.begin(), prefix.end(), path.begin(), path.end()); + return prefix_end == prefix.end(); +} + +Status ValidateExternalDataPath(const std::filesystem::path& model_path, + const std::filesystem::path& external_data_path) { + ORT_RETURN_IF(external_data_path.empty(), "Empty external data path not allowed"); + + // Note: Use !root_path().empty() to reject paths like '/some/path` even on Windows. + ORT_RETURN_IF(!external_data_path.root_path().empty(), "Absolute path not allowed for external data location"); + +#if defined(__wasm__) + std::error_code error_code; + std::filesystem::current_path(error_code); + if (error_code) { + // If we can't access the current working directory in a WASM build, we assume that the WASM + // environment does not have a virtual filesystem and defer validation to an ExternalDataLoader for + // a WASM EP. + return Status::OK(); + } +#endif + + // Determine the model directory: use model file's parent directory if provided, + // otherwise use the current working directory. + std::filesystem::path model_dir = model_path.empty() || model_path.parent_path().empty() + ? std::filesystem::path{"."} + : model_path.parent_path(); + + // Resolve the model directory and the external data path to their weakly canonical forms, which + // resolves symlinks but does not require that the paths actually exist yet. + std::filesystem::path model_dir_canonical; + std::filesystem::path external_data_path_canonical; + ORT_RETURN_IF_ERROR(WeaklyCanonicalPath(model_dir, model_dir_canonical)); + ORT_RETURN_IF_ERROR(WeaklyCanonicalPath(model_dir_canonical / external_data_path, external_data_path_canonical)); + + // Check that the external data path is contained by the model directory. + // If it is, check if the external data file actually exists. + if (HasPathComponentPrefix(model_dir_canonical, external_data_path_canonical)) { + bool path_exists = false; + ORT_RETURN_IF_ERROR(PathExists(external_data_path_canonical, path_exists)); + ORT_RETURN_IF(!path_exists, "External data path does not exist: ", external_data_path_canonical); + return Status::OK(); + } + + if (model_path.empty()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "External data path for model loaded from bytes escapes working directory. ", + "External data path: ", external_data_path, " resolved path: ", + external_data_path_canonical, " ", "working directory: ", model_dir); + } + + // The model file itself may be a symlink. Therefore, check against the real/canonical directory of the model + // after resolving all symlinks. + // + // This supports symlinked models (e.g., Hugging Face Hub local cache) where the canonical + // parent of the model file differs from the parent directory of the symlinked model file. + std::error_code ec; + if (!std::filesystem::is_symlink(model_path, ec)) { + // Note: is_symlink returns false if file is not a symlink, file does not exist, or an error + // occurred (e.g., permissions). In any of these cases, we just return an error. + std::string fs_error_msg; + if (ec) { + fs_error_msg = " filesystem::is_symlink error: " + ec.message(); } + + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "External data path for model escapes model directory. ", + "External data path: ", external_data_path, " resolved path: ", + external_data_path_canonical, " ", "model directory: ", model_dir, fs_error_msg); } - return Status::OK(); + + std::filesystem::path real_model_path; + ORT_RETURN_IF_ERROR(WeaklyCanonicalPath(model_path, real_model_path)); + auto real_model_dir = real_model_path.parent_path(); + + // Check that the external data path is contained by the real model directory. + // If it is, check if the external data file actually exists. + if (HasPathComponentPrefix(real_model_dir, external_data_path_canonical)) { + bool path_exists = false; + ORT_RETURN_IF_ERROR(PathExists(external_data_path_canonical, path_exists)); + ORT_RETURN_IF(!path_exists, "External data path does not exist: ", external_data_path_canonical); + return Status::OK(); + } + + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "External data path: ", external_data_path, " (resolved path: ", + external_data_path_canonical, ") escapes both model directory: ", model_dir, + " and real model directory: ", real_model_dir); } Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index f3f33a32b8076..e7649c072416c 100644 --- a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -539,19 +539,33 @@ Status TensorProtoWithExternalDataToTensorProto( ONNX_NAMESPACE::TensorProto& new_tensor_proto); /// -/// Validates if the external data path is under the model directory. -/// If the model is a symlink, it checks against both the logical model directory (base_dir) -/// and the real/canonical directory of the model. -/// If the `base_dir` is empty, the function only ensures that `location` is not an absolute path. +/// Validates that the given external data path is not an absolute path, is under the model directory +/// (after resolving symlinks), and exists. +/// +/// The model path can be empty if the model is loaded from bytes and the application did not specify a directory +/// for external data files. In this case, the external data path must be contained under the current working +/// directory. +/// +/// The model path can point to a non-existing model file if the model is loaded from bytes and the application +/// specified a directory for external data files via the session config entry +/// `kOrtSessionOptionsModelExternalInitializersFileFolderPath`. In this case, the model_path is set to +/// " / virtual_model.onnx" and the external data path +/// must be contained under `kOrtSessionOptionsModelExternalInitializersFileFolderPath`. +/// +/// If the model itself is a symlink, this function checks against both the directory containing the symlink +/// and the real/canonical directory of the model after resolving all symlinks. +/// +/// On WASM builds, this function skips most validation (except checks for non-empty/non-absolute path) if we are +/// unable to query the current working directory, as this indicates that the WASM environment does not have +/// a valid filesystem. If skipped, an ExternalDataLoader will validate the location and contents of the +/// external data file at the time of access. /// -/// Logical model location directory -/// Location string retrieved from TensorProto external data -/// Optional path to the model file, used for canonical path validation if base_dir check fails -/// The function will fail if the resolved full path is not under the logical model directory -/// nor the real directory of the model path -Status ValidateExternalDataPath(const std::filesystem::path& base_dir, - const std::filesystem::path& location, - const std::filesystem::path& model_path = {}); +/// Path to the model file. Can be empty or point to a virtual file. +/// External data file path to be validated. +/// Retrieved from TensorProto external data info +/// The function will fail if the resolved `external_data_path` path is not under the model directory +Status ValidateExternalDataPath(const std::filesystem::path& model_path, + const std::filesystem::path& external_data_path); #endif // !defined(SHARED_PROVIDER) diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 5aa466ecb5bc7..3599edbfcd357 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -3742,10 +3742,7 @@ Status Graph::ConvertInitializersIntoOrtValues() { FindAllSubgraphs(all_subgraphs); const auto& model_path = GetModel().ModelPath(); - PathString model_dir; - if (!model_path.empty()) { - ORT_RETURN_IF_ERROR(GetDirNameFromFilePath(model_path, model_dir)); - } + std::unordered_set validated_external_data_paths; auto put_weights_maybe_in_memory_func = [&](Graph& graph) -> Status { // if we have any initializers that are not in memory, put them there. @@ -3771,11 +3768,17 @@ Status Graph::ConvertInitializersIntoOrtValues() { std::unique_ptr external_data_info; ORT_RETURN_IF_ERROR(onnxruntime::ExternalDataInfo::Create(tensor_proto.external_data(), external_data_info)); const auto& location = external_data_info->GetRelPath(); - auto st = utils::ValidateExternalDataPath(model_dir, location, model_path); - if (!st.IsOK()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, - "External data path validation failed for initializer: ", tensor_proto.name(), - ". Error: ", st.ErrorMessage()); + + if (validated_external_data_paths.count(location) == 0) { + auto st = utils::ValidateExternalDataPath(model_path, location); + + if (!st.IsOK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "External data path validation failed for initializer: ", tensor_proto.name(), + ". Error: ", st.ErrorMessage()); + } + + validated_external_data_paths.insert(location); } } continue; diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc index da2e8fc37382a..fdc0818e8437b 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc @@ -43,7 +43,7 @@ bool IsDQWeightSigned(int32_t dt_weight) { } // Holds transposed weight/scale/zp tensors and their TensorProtos for MatMulNBits. -// Used by both DQMatMulToMatMulNBitsAction and DQCastMatMulToMatMulNBitsAction. +// Used by DQMatMulToMatMulNBitsAction. struct TransposedQuantizedTensors { Tensor weight; Tensor scale; @@ -486,146 +486,6 @@ Status DQMatMulToMatMulNBitsAction::ProcessNewNode(Graph& graph, return Status::OK(); } -DQCastMatMulToMatMulNBitsAction::DQCastMatMulToMatMulNBitsAction( - int64_t accuracy_level, - concurrency::ThreadPool* intra_op_thread_pool) - : accuracy_level_{accuracy_level}, - intra_op_thread_pool_{intra_op_thread_pool} { - ORT_ENFORCE(accuracy_level_ >= 0 && accuracy_level_ <= 4, "MatMulNBits accuracy level must be between 0 and 4"); -} - -Status DQCastMatMulToMatMulNBitsAction::Run(Graph& graph, const NodesToOptimize& selected_nodes) const { - // Selected nodes layout (from DQCastMatMulToMatMulNBitsSelector): - // Input(0) = DQ node - // Input(1) = Cast on input B (between DQ and MatMul) - // Target() = MatMul node - auto* dq_node = selected_nodes.Input(0); - auto* cast_b_node = selected_nodes.Input(1); - auto& matmul_node = selected_nodes.Target(); - - // --- Transpose DQ weights/scales/zp via shared helper --- - TransposedQuantizedTensors transposed; - ORT_RETURN_IF_ERROR(TransposeDQWeightsForMatMulNBits( - graph, *dq_node, "fused_DQ_Cast_MatMul", intra_op_thread_pool_, transposed)); - - // MatMulNBits operates in the DQ scale dtype. - // Always insert Cast on input A (to DQ dtype) and Cast on output (DQ dtype to MatMul output dtype). - // ORT's redundant cast elimination optimizer will clean up unnecessary casts later. - - // Determine DQ output element type (e.g., fp16) - int32_t dq_output_dtype = cast_b_node->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); - // Determine MatMul output element type (e.g., fp32) - int32_t matmul_output_dtype = matmul_node.OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); - - const auto& dq_attrs = dq_node->GetAttributes(); - const auto* weight_arg = dq_node->InputDefs()[0]; - auto K = weight_arg->Shape()->dim(0).dim_value(); - auto N = weight_arg->Shape()->dim(1).dim_value(); - auto block_size = dq_attrs.at("block_size").i(); - int32_t dt_weight = weight_arg->TypeAsProto()->tensor_type().elem_type(); - auto bits = DQWeightBits(dt_weight); - - // --- Create fp16 NodeArg for MatMulNBits input A --- - NodeArg* matmul_input_a = matmul_node.MutableInputDefs()[0]; - ONNX_NAMESPACE::TypeProto input_a_fp16_type; - input_a_fp16_type.mutable_tensor_type()->set_elem_type(dq_output_dtype); - if (matmul_input_a->Shape()) { - *input_a_fp16_type.mutable_tensor_type()->mutable_shape() = - matmul_input_a->TypeAsProto()->tensor_type().shape(); - } - auto cast_a_out_name = graph.GenerateNodeArgName(matmul_node.Name() + "_input_a_cast"); - NodeArg* input_a_arg = &graph.GetOrCreateNodeArg(cast_a_out_name, &input_a_fp16_type); - - // --- Create fp16 NodeArg for MatMulNBits output --- - ONNX_NAMESPACE::TypeProto output_fp16_type; - output_fp16_type.mutable_tensor_type()->set_elem_type(dq_output_dtype); - if (matmul_node.OutputDefs()[0]->Shape()) { - *output_fp16_type.mutable_tensor_type()->mutable_shape() = - matmul_node.OutputDefs()[0]->TypeAsProto()->tensor_type().shape(); - } - auto mnb_out_name = graph.GenerateNodeArgName(matmul_node.Name() + "_matmulnbits_out"); - NodeArg* mnb_output_arg = &graph.GetOrCreateNodeArg(mnb_out_name, &output_fp16_type); - - // --- Create MatMulNBits node --- - NodeAttributes attrs; - utils::SetNodeAttribute(utils::MakeAttribute("K", K), attrs); - utils::SetNodeAttribute(utils::MakeAttribute("N", N), attrs); - utils::SetNodeAttribute(utils::MakeAttribute("bits", bits), attrs); - utils::SetNodeAttribute(utils::MakeAttribute("block_size", block_size), attrs); - utils::SetNodeAttribute(utils::MakeAttribute("accuracy_level", accuracy_level_), attrs); - - auto& new_node = graph.AddNode( - graph.GenerateNodeName(matmul_node.Name() + "_MatMulNBits"), - "MatMulNBits", - "Fused DQ+Cast+MatMul to MatMulNBits", - {input_a_arg}, - {mnb_output_arg}, - &attrs, - kMSDomain); - - const auto& target_provider = matmul_node.GetExecutionProviderType(); - new_node.SetExecutionProviderType(target_provider.empty() ? kCpuExecutionProvider : target_provider); - - // Add transposed weight, scale, zp to inputs - auto& input_defs = new_node.MutableInputDefs(); - input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, transposed.weight_proto, std::move(transposed.weight))); - new_node.MutableInputArgsCount().push_back(1); - - input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, transposed.scale_proto, std::move(transposed.scale))); - new_node.MutableInputArgsCount().push_back(1); - - if (transposed.zero_point_proto) { - input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, *transposed.zero_point_proto, std::move(*transposed.zero_point))); - new_node.MutableInputArgsCount().push_back(1); - } - - // --- Insert Cast on input A: matmul_input_dtype -> dq_output_dtype --- - { - NodeAttributes cast_attrs; - utils::SetNodeAttribute( - utils::MakeAttribute("to", static_cast(dq_output_dtype)), - cast_attrs); - auto& cast_node = graph.AddNode( - graph.GenerateNodeName(matmul_node.Name() + "_Cast_input_a"), - "Cast", "", - {matmul_input_a}, - {input_a_arg}, - &cast_attrs, - kOnnxDomain); - cast_node.SetExecutionProviderType(new_node.GetExecutionProviderType()); - } - - // --- Insert Cast on output: dq_output_dtype -> matmul_output_dtype --- - { - NodeAttributes cast_attrs; - utils::SetNodeAttribute( - utils::MakeAttribute("to", static_cast(matmul_output_dtype)), - cast_attrs); - auto& cast_node = graph.AddNode( - graph.GenerateNodeName(matmul_node.Name() + "_Cast_output"), - "Cast", "", - {mnb_output_arg}, - {const_cast(matmul_node.OutputDefs()[0])}, - &cast_attrs, - kOnnxDomain); - cast_node.SetExecutionProviderType(new_node.GetExecutionProviderType()); - } - - // --- Remove original nodes --- - auto remove_node = [&graph](Node* node) { - if (node) { - graph_utils::RemoveNodeOutputEdges(graph, *node); - graph.RemoveNode(node->Index()); - } - }; - - remove_node(&matmul_node); - remove_node(cast_b_node); - remove_node(dq_node); - - return Status::OK(); -} - static std::vector GetGemmMoveInfo(bool does_q_node_exist) { NTO::NodeLocation dq_A{NTO::NodeType::kInput, 0}; NTO::NodeLocation dq_B{NTO::NodeType::kInput, 1}; diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h index e112959cc58da..02a8353707599 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h @@ -107,20 +107,6 @@ struct DQMatMulToMatMulNBitsAction : public ReplaceWithNew { concurrency::ThreadPool* intra_op_thread_pool_; }; -// Used together with DQCastMatMulToMatMulNBitsSelector. -// Handles DQ -> Cast(fp16->fp32) -> MatMul fusion to MatMulNBits, -// including optional Cast on input A and output type alignment. -struct DQCastMatMulToMatMulNBitsAction : public Action { - DQCastMatMulToMatMulNBitsAction(int64_t accuracy_level, - concurrency::ThreadPool* intra_op_thread_pool); - - Status Run(Graph&, const NodesToOptimize& selected_nodes) const override; - - private: - int64_t accuracy_level_; - concurrency::ThreadPool* intra_op_thread_pool_; -}; - struct GemmReplaceWithQuant : public Action { GemmReplaceWithQuant(); diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc index 0b04445692c9b..8cab6911646f2 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc @@ -7,6 +7,7 @@ #include #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h" + #include "core/mlas/inc/mlas.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h" @@ -306,7 +307,12 @@ void DQMatMulToMatMulNBitsRules(SelectorActionRegistry& qdq_selector_action_regi intra_op_thread_pool); #if !defined(ORT_MINIMAL_BUILD) - std::vector providers = {kCpuExecutionProvider, kCudaExecutionProvider, kDmlExecutionProvider}; + // Include "" (empty string) to match nodes not yet assigned to an EP. + // For FP16 models on CPU EP, FP16 MatMul nodes are not claimed during partitioning + // (no FP16 MatMul kernel on CPU), leaving their EP unassigned. The DQ->MatMul fusion + // should still apply; the action assigns kCpuExecutionProvider to the resulting + // MatMulNBits node (which has both float and float16 CPU kernels). + std::vector providers = {kCpuExecutionProvider, kCudaExecutionProvider, kDmlExecutionProvider, ""}; std::unique_ptr selector = std::make_unique(providers); qdq_selector_action_registry.RegisterSelectorAndAction(action_name, {{"MatMul", {}}}, @@ -316,25 +322,6 @@ void DQMatMulToMatMulNBitsRules(SelectorActionRegistry& qdq_selector_action_regi #else qdq_selector_action_registry.RegisterAction(action_name, std::move(action)); #endif - - // DQ -> Cast(fp16->fp32) -> MatMul pattern. - // Handles FP16 models where Cast nodes are inserted between DQ and MatMul. - const std::string cast_action_name{"DQCastMatMulToMatMulNBits"}; - - std::unique_ptr cast_action = - std::make_unique(qdq_matmulnbits_accuracy_level, - intra_op_thread_pool); - -#if !defined(ORT_MINIMAL_BUILD) - std::unique_ptr cast_selector = - std::make_unique(providers); - qdq_selector_action_registry.RegisterSelectorAndAction(cast_action_name, - {{"MatMul", {}}}, - std::move(cast_selector), - std::move(cast_action)); -#else - qdq_selector_action_registry.RegisterAction(cast_action_name, std::move(cast_action)); -#endif } void GemmQDQRules(SelectorActionRegistry& qdq_selector_action_registry) { @@ -416,7 +403,9 @@ QDQSelectorActionTransformer::QDQSelectorActionTransformer( apply_context, // this transformer is compatible with CPU, DML, ACL and CUDA EP. // There is further EP control on the rule level. - {kCpuExecutionProvider, kDmlExecutionProvider, kAclExecutionProvider, kCudaExecutionProvider}} { + // Also accept nodes with empty EP (unassigned) so that individual selectors + // that include "" in their compatible providers can match unassigned nodes. + {kCpuExecutionProvider, kDmlExecutionProvider, kAclExecutionProvider, kCudaExecutionProvider, ""}} { } } // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc index c39dfeb082e35..8a00fe11ff3fd 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc @@ -651,75 +651,6 @@ bool DQMatMulNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Nod return ValidateBlockwiseDQForMatMulNBits(graph, *dq_nodes[0]); } -std::optional -DQCastMatMulToMatMulNBitsSelector::Select(const GraphViewer& graph_viewer, const Node& node) const { - // Check EP compatibility - const std::string_view node_ep = node.GetExecutionProviderType(); - if (!compatible_providers_.empty() && - std::find(compatible_providers_.begin(), compatible_providers_.end(), node_ep) == compatible_providers_.end()) { - return std::nullopt; - } - - const auto& graph = graph_viewer.GetGraph(); - - // node must be MatMul - if (node.OpType() != "MatMul") { - return std::nullopt; - } - - if (node.InputDefs().size() < 2) { - return std::nullopt; - } - - // Check input B: must be Cast(fp16->fp32) - const Node* cast_b = graph_viewer.GetProducerNode(node.InputDefs()[1]->Name()); - if (!cast_b || cast_b->OpType() != "Cast") { - return std::nullopt; - } - - const auto& cast_b_attrs = cast_b->GetAttributes(); - auto to_iter = cast_b_attrs.find("to"); - if (to_iter == cast_b_attrs.end() || - to_iter->second.i() != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) { - return std::nullopt; - } - - // Cast B input must be fp16 - if (!cast_b->InputDefs()[0]->TypeAsProto() || - cast_b->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type() != - ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16) { - return std::nullopt; - } - - // Cast B must have exactly 1 output edge (to MatMul) and not be a graph output - if (!optimizer_utils::CheckOutputEdges(graph, *cast_b, 1)) { - return std::nullopt; - } - - // Cast B's input must come from a DQ node - const Node* dq_node = graph_viewer.GetProducerNode(cast_b->InputDefs()[0]->Name()); - if (!dq_node || dq_node->OpType() != QDQ::DQOpName) { - return std::nullopt; - } - - // DQ must have exactly 1 output edge (to Cast B) and not be a graph output - if (!optimizer_utils::CheckOutputEdges(graph, *dq_node, 1)) { - return std::nullopt; - } - - if (!ValidateBlockwiseDQForMatMulNBits(graph, *dq_node)) { - return std::nullopt; - } - - // Build selection - NodesToOptimizeIndicesBuilder builder; - builder.input_nodes.push_back(dq_node->Index()); - builder.input_nodes.push_back(cast_b->Index()); - builder.target_node = node.Index(); - - return builder.Build(); -} - bool GemmNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& node, const Node* redundant_clip_node, const std::vector& dq_nodes, const std::vector& q_nodes) const { diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h index 5c10668733785..79c374b301442 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h @@ -461,27 +461,6 @@ class DQMatMulToMatMulNBitsSelector : public BaseSelector { : BaseSelector(std::make_unique(), compatible_providers) {} }; -// Convert "DQ -> Cast(fp16->fp32) -> MatMul" to "MatMulNBits". -// Handles Cast(fp16->fp32) between DQ and MatMul on input B, and optionally on input A. -// Selection layout: -// input_nodes[0] = DQ node -// input_nodes[1] = Cast on input B (between DQ and MatMul) -// target_node = MatMul -// output_nodes = {} -class DQCastMatMulToMatMulNBitsSelector : public NodeSelector { - public: - explicit DQCastMatMulToMatMulNBitsSelector(gsl::span compatible_providers = {}) - : compatible_providers_(compatible_providers.begin(), compatible_providers.end()) {} - - DQCastMatMulToMatMulNBitsSelector(DQCastMatMulToMatMulNBitsSelector&& rhs) noexcept - : compatible_providers_(std::move(rhs.compatible_providers_)) {} - - std::optional Select(const GraphViewer& graph_viewer, const Node& node) const override; - - private: - std::vector compatible_providers_; -}; - // Input: DQ nodes for A, B and optional C // Output: optional Q node for Y class GemmSelector : public BaseSelector { diff --git a/onnxruntime/core/providers/cpu/math/cumsum.cc b/onnxruntime/core/providers/cpu/math/cumsum.cc index 8321b81021d19..14ea6712f7f46 100644 --- a/onnxruntime/core/providers/cpu/math/cumsum.cc +++ b/onnxruntime/core/providers/cpu/math/cumsum.cc @@ -13,29 +13,6 @@ using namespace onnxruntime; namespace onnxruntime { -namespace cumsum_op { -Status GetAxis(const Tensor* axis_tensor, int64_t input_rank, int64_t& axis_out) { - if (!axis_tensor) - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor must be provided to the CumSum op"); - - if (axis_tensor->Shape().NumDimensions() > 1) - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor should be 0D or 1D"); - - if (axis_tensor->IsDataType()) { - axis_out = static_cast(axis_tensor->Data()[0]); - } else if (axis_tensor->IsDataType()) { - axis_out = axis_tensor->Data()[0]; - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor should be of type `int32_t` or `int64_t`"); - } - - axis_out = HandleNegativeAxis(axis_out, input_rank); - - return Status::OK(); -} - -} // namespace cumsum_op - ONNX_CPU_OPERATOR_VERSIONED_TYPED_KERNEL( CumSum, 11, diff --git a/onnxruntime/core/providers/cpu/math/cumsum.h b/onnxruntime/core/providers/cpu/math/cumsum.h index fa1c1ceb0df10..b7443ada40861 100644 --- a/onnxruntime/core/providers/cpu/math/cumsum.h +++ b/onnxruntime/core/providers/cpu/math/cumsum.h @@ -1,11 +1,17 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#pragma once #include "core/common/common.h" +#include "core/providers/common.h" + +#ifndef SHARED_PROVIDER #include "core/framework/op_kernel.h" +#endif namespace onnxruntime { +#ifndef SHARED_PROVIDER template class CumSum final : public OpKernel { public: @@ -17,10 +23,33 @@ class CumSum final : public OpKernel { int64_t exclusive_; int64_t reverse_; }; +#endif namespace cumsum_op { +#ifdef SHARED_PROVIDER Status GetAxis(const Tensor* axis_tensor, int64_t input_rank, int64_t& axis_out); +#else +inline Status GetAxis(const Tensor* axis_tensor, int64_t input_rank, int64_t& axis_out) { + if (!axis_tensor) + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor must be provided to the CumSum op"); + + if (axis_tensor->Shape().NumDimensions() > 1) + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor should be 0D or 1D"); + + if (axis_tensor->IsDataType()) { + axis_out = static_cast(axis_tensor->Data()[0]); + } else if (axis_tensor->IsDataType()) { + axis_out = axis_tensor->Data()[0]; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor should be of type `int32_t` or `int64_t`"); + } + + axis_out = HandleNegativeAxis(axis_out, input_rank); + + return Status::OK(); +} +#endif // SHARED_PROVIDER } // namespace cumsum_op } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.cc b/onnxruntime/core/providers/cpu/object_detection/roialign.cc index 87958a9f7e2dd..0680be3aea49c 100644 --- a/onnxruntime/core/providers/cpu/object_detection/roialign.cc +++ b/onnxruntime/core/providers/cpu/object_detection/roialign.cc @@ -258,76 +258,6 @@ void RoiAlignForward(const TensorShape& output_shape, const T* bottom_data, floa } } // namespace -Status CheckROIAlignValidInput(const Tensor* X_ptr, const Tensor* rois_ptr, const Tensor* batch_indices_ptr) { - constexpr int64_t EXPECTED_NUM_ROI_DIMS = 2; - constexpr int64_t EXPECTED_SECOND_ROI_DIM = 4; - if (!X_ptr) { - return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Null input X ptr"); - } - if (!rois_ptr) { - return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Null rois_ptr"); - } - if (!batch_indices_ptr) { - return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Null batch_indices_ptr"); - } - - const auto& rois_dims = rois_ptr->Shape(); - const auto& batch_indices_dims = batch_indices_ptr->Shape(); - - if (batch_indices_dims.NumDimensions() != 1) { - return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, - "Number of dimensions for batch indices should be exactly 1"); - } - - // validate rois_dims - if (rois_dims.NumDimensions() != EXPECTED_NUM_ROI_DIMS) { - return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, - "Number of dimensions for rois should be exactly " + std::to_string(EXPECTED_NUM_ROI_DIMS)); - } - if (rois_dims[1] != EXPECTED_SECOND_ROI_DIM) { - return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, - "Second dimension for rois should be exactly " + std::to_string(EXPECTED_SECOND_ROI_DIM)); - } - - // first dimension of batch_indices and rois should match - if (batch_indices_dims[0] != rois_dims[0]) { - return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, - "First dimension (num_rois) of batch_indices and rois don't match"); - } - - if (batch_indices_ptr->Location().device.Type() == OrtDevice::CPU) { - // Validate batch_indices values are within [0, batch_size) when the tensor - // data is accessible from the host (CPU). - const int64_t batch_size = X_ptr->Shape()[0]; - const int64_t num_rois = batch_indices_dims[0]; - - auto check_bounds = [batch_size, num_rois](const auto* batch_indices_data) -> Status { - for (int64_t i = 0; i < num_rois; ++i) { - if (batch_indices_data[i] < 0 || batch_indices_data[i] >= batch_size) { - return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, - "batch_indices value " + std::to_string(batch_indices_data[i]) + - " at index " + std::to_string(i) + - " is out of range [0, " + std::to_string(batch_size) + ")"); - } - } - return Status::OK(); - }; - - if (batch_indices_ptr->IsDataType()) { - auto status = check_bounds(batch_indices_ptr->Data()); - if (!status.IsOK()) return status; - } else if (batch_indices_ptr->IsDataType()) { - auto status = check_bounds(batch_indices_ptr->Data()); - if (!status.IsOK()) return status; - } else { - return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, - "batch_indices must be of type int64_t or int32_t"); - } - } - - return Status::OK(); -} - template Status RoiAlign::Compute(OpKernelContext* context) const { const auto* X_ptr = context->Input(0); diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.h b/onnxruntime/core/providers/cpu/object_detection/roialign.h index 1bb8bd34c5cb2..bb97de158369b 100644 --- a/onnxruntime/core/providers/cpu/object_detection/roialign.h +++ b/onnxruntime/core/providers/cpu/object_detection/roialign.h @@ -3,12 +3,86 @@ #pragma once -#include "core/framework/op_kernel.h" +#include #include +#include + +#include "core/common/common.h" +#ifndef SHARED_PROVIDER +#include "core/framework/op_kernel.h" +#endif namespace onnxruntime { +#ifdef SHARED_PROVIDER Status CheckROIAlignValidInput(const Tensor* X_ptr, const Tensor* rois_ptr, const Tensor* batch_indices_ptr); +#else +inline Status CheckROIAlignValidInput(const Tensor* X_ptr, const Tensor* rois_ptr, const Tensor* batch_indices_ptr) { + constexpr int64_t EXPECTED_NUM_ROI_DIMS = 2; + constexpr int64_t EXPECTED_SECOND_ROI_DIM = 4; + if (!X_ptr) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Null input X ptr"); + } + if (!rois_ptr) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Null rois_ptr"); + } + if (!batch_indices_ptr) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Null batch_indices_ptr"); + } + + const auto& rois_dims = rois_ptr->Shape(); + const auto& batch_indices_dims = batch_indices_ptr->Shape(); + + if (batch_indices_dims.NumDimensions() != 1) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "Number of dimensions for batch indices should be exactly 1"); + } + + if (rois_dims.NumDimensions() != EXPECTED_NUM_ROI_DIMS) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "Number of dimensions for rois should be exactly " + std::to_string(EXPECTED_NUM_ROI_DIMS)); + } + if (rois_dims[1] != EXPECTED_SECOND_ROI_DIM) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "Second dimension for rois should be exactly " + std::to_string(EXPECTED_SECOND_ROI_DIM)); + } + + if (batch_indices_dims[0] != rois_dims[0]) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "First dimension (num_rois) of batch_indices and rois don't match"); + } + + if (batch_indices_ptr->Location().device.Type() == OrtDevice::CPU) { + const int64_t batch_size = X_ptr->Shape()[0]; + const int64_t num_rois = batch_indices_dims[0]; + + auto check_bounds = [batch_size, num_rois](const auto* batch_indices_data) -> Status { + for (int64_t i = 0; i < num_rois; ++i) { + if (batch_indices_data[i] < 0 || batch_indices_data[i] >= batch_size) { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "batch_indices value " + std::to_string(batch_indices_data[i]) + + " at index " + std::to_string(i) + + " is out of range [0, " + std::to_string(batch_size) + ")"); + } + } + return Status::OK(); + }; + + if (batch_indices_ptr->IsDataType()) { + auto status = check_bounds(batch_indices_ptr->Data()); + if (!status.IsOK()) return status; + } else if (batch_indices_ptr->IsDataType()) { + auto status = check_bounds(batch_indices_ptr->Data()); + if (!status.IsOK()) return status; + } else { + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, + "batch_indices must be of type int64_t or int32_t"); + } + } + + return Status::OK(); +} +#endif enum struct RoiAlignMode { avg = 0, @@ -17,10 +91,10 @@ enum struct RoiAlignMode { class RoiAlignBase { public: - explicit RoiAlignBase(const OpKernelInfo& info) { - // mode + template + explicit RoiAlignBase(const TKernelInfo& info) { std::string mode; - if (info.GetAttr("mode", &mode).IsOK()) { + if (info.template GetAttr("mode", &mode).IsOK()) { std::transform(mode.begin(), mode.end(), mode.begin(), [](char i) { return static_cast(::tolower(i)); }); if (mode == "avg") { mode_ = RoiAlignMode::avg; @@ -31,41 +105,33 @@ class RoiAlignBase { } } - // output_height int64_t output_height_tmp; - if (info.GetAttr("output_height", &output_height_tmp).IsOK()) { + if (info.template GetAttr("output_height", &output_height_tmp).IsOK()) { output_height_ = output_height_tmp; } - // output_width int64_t output_width_tmp; - if (info.GetAttr("output_width", &output_width_tmp).IsOK()) { + if (info.template GetAttr("output_width", &output_width_tmp).IsOK()) { output_width_ = output_width_tmp; } - // sampling_ratio int64_t sampling_ratio_tmp; - if (info.GetAttr("sampling_ratio", &sampling_ratio_tmp).IsOK()) { + if (info.template GetAttr("sampling_ratio", &sampling_ratio_tmp).IsOK()) { sampling_ratio_ = sampling_ratio_tmp; ORT_ENFORCE(sampling_ratio_ >= 0, "Sampling ratio should be >=0, but it was ", sampling_ratio_); } - // spatial_scale float spatial_scale_tmp; - if (info.GetAttr("spatial_scale", &spatial_scale_tmp).IsOK()) { + if (info.template GetAttr("spatial_scale", &spatial_scale_tmp).IsOK()) { spatial_scale_ = spatial_scale_tmp; } std::string coordinate_transformation_mode; - if (info.GetAttr("coordinate_transformation_mode", &coordinate_transformation_mode).IsOK()) { - if (coordinate_transformation_mode == "half_pixel") - half_pixel_ = true; - else - half_pixel_ = false; + if (info.template GetAttr("coordinate_transformation_mode", &coordinate_transformation_mode).IsOK()) { + half_pixel_ = coordinate_transformation_mode == "half_pixel"; } if (mode_ == RoiAlignMode::max && sampling_ratio_ != 1) { - // TODO(fdwr): Issue #6146. ORT 1.13 will correct the incorrect summation of max mode with PR #7354. LOGS_DEFAULT(WARNING) << "The existing summation for max mode and sampling ratios besides 1 is incorrect " << "and will be fixed in the next ORT 1.13 release. Thus the results of RoiAlign " << "will be different."; diff --git a/onnxruntime/core/providers/cpu/tensor/concat.cc b/onnxruntime/core/providers/cpu/tensor/concat.cc index e3d5c0600420f..98d61ed1d3127 100644 --- a/onnxruntime/core/providers/cpu/tensor/concat.cc +++ b/onnxruntime/core/providers/cpu/tensor/concat.cc @@ -49,14 +49,6 @@ using EnabledDataTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(kCpuExec Concat, Input, 0); } // namespace -// this method will be shared between 'Concat' (CPU and GPU) and -// 'ConcatFromSequence' ('concat' and 'stack' modes) to validate inputs -Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, - const InlinedTensorsVector& input_tensors, - Prepare& p) const { - return PrepareForComputeImpl(ctx, input_tensors, p); -} - namespace { TensorShapeVector StridesForStack(const TensorShapeVector& full_strides, uint64_t axis) { // if we are stacking, skip the dimension that will be stacked along in the output strides diff --git a/onnxruntime/core/providers/cpu/tensor/concatbase.h b/onnxruntime/core/providers/cpu/tensor/concatbase.h index b9085b2a9318b..df2eb78c61180 100644 --- a/onnxruntime/core/providers/cpu/tensor/concatbase.h +++ b/onnxruntime/core/providers/cpu/tensor/concatbase.h @@ -209,8 +209,16 @@ class ConcatBase { return Status::OK(); } +#ifdef SHARED_PROVIDER Status PrepareForCompute(OpKernelContext* ctx, const InlinedTensorsVector& input_tensors, Prepare& p) const; +#else + template + inline Status PrepareForCompute(KernelContextType* ctx, const InlinedTensorsVector& input_tensors, + Prepare& p) const { + return PrepareForComputeImpl(ctx, input_tensors, p); + } +#endif protected: template diff --git a/onnxruntime/core/providers/cpu/tensor/gather.cc b/onnxruntime/core/providers/cpu/tensor/gather.cc index f171b33ee5f4f..3b3c67e7d818b 100644 --- a/onnxruntime/core/providers/cpu/tensor/gather.cc +++ b/onnxruntime/core/providers/cpu/tensor/gather.cc @@ -56,10 +56,6 @@ ONNX_CPU_OPERATOR_KERNEL( .TypeConstraint("Tind", BuildKernelDefConstraintsFromTypeList()), Gather); -Status GatherBase::PrepareForCompute(OpKernelContext* context, Prepare& p) const { - return PrepareForComputeImpl(context, p); -} - template Status GatherCopyData(const Tensor* indices_tensor, const uint8_t* src_base, uint8_t* dst_base, bool is_string_type, const size_t element_bytes, const int64_t block_size, const int64_t M, diff --git a/onnxruntime/core/providers/cpu/tensor/gatherbase.h b/onnxruntime/core/providers/cpu/tensor/gatherbase.h index 1f5e85c554a78..fc29c04290883 100644 --- a/onnxruntime/core/providers/cpu/tensor/gatherbase.h +++ b/onnxruntime/core/providers/cpu/tensor/gatherbase.h @@ -46,7 +46,14 @@ class GatherBase { return Status::OK(); } +#ifdef SHARED_PROVIDER Status PrepareForCompute(OpKernelContext* context, Prepare& p) const; +#else + template + inline Status PrepareForCompute(KernelContextType* context, Prepare& p) const { + return PrepareForComputeImpl(context, p); + } +#endif protected: template diff --git a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h index 3218c8952d6ec..14a22fa7be0af 100644 --- a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h +++ b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h @@ -9,8 +9,9 @@ namespace onnxruntime { class SpaceDepthBase { protected: - explicit SpaceDepthBase(const OpKernelInfo& info) { - ORT_ENFORCE(info.GetAttr("blocksize", &blocksize_).IsOK(), + template + explicit SpaceDepthBase(const KernelInfoType& info) { + ORT_ENFORCE(info.template GetAttr("blocksize", &blocksize_).IsOK(), "Attribute blocksize is not set."); } diff --git a/onnxruntime/core/providers/cpu/tensor/unsqueeze.cc b/onnxruntime/core/providers/cpu/tensor/unsqueeze.cc index 1b6ee02061d34..5fdb57b1a5e35 100644 --- a/onnxruntime/core/providers/cpu/tensor/unsqueeze.cc +++ b/onnxruntime/core/providers/cpu/tensor/unsqueeze.cc @@ -77,57 +77,6 @@ ONNX_CPU_OPERATOR_KERNEL( .TypeConstraint("T", DataTypeImpl::AllTensorTypes()), Unsqueeze); -Status UnsqueezeBase::PrepareCompute(OpKernelContext* ctx, Prepare& p) const { - const auto* X = ctx->Input(0); - ORT_ENFORCE(X != nullptr); - auto& input_tensor = *X; - - TensorShapeVector axes; - size_t num_inputs = ctx->InputCount(); - if (num_inputs == 2) { // axes is an input - const Tensor* axes_tensor = ctx->Input(1); - ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null"); - ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 0 || - axes_tensor->Shape().NumDimensions() == 1, - "An axes tensor must be a scalar or a 1-D tensor."); - auto data_span = axes_tensor->template DataAsSpan(); - axes.assign(data_span.begin(), data_span.end()); - } else { - axes.assign(axes_.begin(), axes_.end()); - } - - // New dimension count is the current dimensions + the number of entries in axes - // Initialize output_dims to 0 in each axis initially - TensorShapeVector output_dims(axes.size() + input_tensor.Shape().NumDimensions(), 0); - - // Set all axes indices to 1 in output_dims and check for duplicates - for (int64_t axis : axes) { - // Valid axis range is [0, output_rank - 1] - axis = HandleNegativeAxis(axis, onnxruntime::narrow(output_dims.size())); - if (axis < 0 || axis >= static_cast(output_dims.size())) - return Status(ONNXRUNTIME, INVALID_ARGUMENT, "'axes' has an out of range axis"); - if (output_dims[onnxruntime::narrow(axis)] != 0) - return Status(ONNXRUNTIME, INVALID_ARGUMENT, "'axes' has a duplicate axis"); - output_dims[onnxruntime::narrow(axis)] = 1; - } - - // Now fill in the zero entries with the existing shape - { - auto begin = input_tensor.Shape().GetDims().begin(); - for (auto& axisSize : output_dims) { - if (axisSize == 0) - axisSize = *begin++; - } - assert(begin == input_tensor.Shape().GetDims().end()); - } - - TensorShape output_shape(output_dims); - p.output_tensor = ctx->Output(0, output_shape); - ORT_ENFORCE(nullptr != p.output_tensor); - p.input_tensor = &input_tensor; - return Status::OK(); -} - Status Unsqueeze::Compute(OpKernelContext* ctx) const { Prepare p; ORT_RETURN_IF_ERROR(PrepareCompute(ctx, p)); diff --git a/onnxruntime/core/providers/cpu/tensor/unsqueeze.h b/onnxruntime/core/providers/cpu/tensor/unsqueeze.h index 5a8a318923da5..09a77c113e022 100644 --- a/onnxruntime/core/providers/cpu/tensor/unsqueeze.h +++ b/onnxruntime/core/providers/cpu/tensor/unsqueeze.h @@ -19,7 +19,57 @@ class UnsqueezeBase { Tensor* output_tensor = nullptr; }; +#ifdef SHARED_PROVIDER Status PrepareCompute(OpKernelContext* context, Prepare& p) const; +#else + template + inline Status PrepareCompute(KernelContextType* ctx, Prepare& p) const { + const auto* X = ctx->template Input(0); + ORT_ENFORCE(X != nullptr); + auto& input_tensor = *X; + + TensorShapeVector axes; + size_t num_inputs = ctx->InputCount(); + if (num_inputs == 2) { + const Tensor* axes_tensor = ctx->template Input(1); + ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null"); + ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 0 || + axes_tensor->Shape().NumDimensions() == 1, + "An axes tensor must be a scalar or a 1-D tensor."); + auto data_span = axes_tensor->template DataAsSpan(); + axes.assign(data_span.begin(), data_span.end()); + } else { + axes.assign(axes_.begin(), axes_.end()); + } + + TensorShapeVector output_dims(axes.size() + input_tensor.Shape().NumDimensions(), 0); + + for (int64_t axis : axes) { + axis = HandleNegativeAxis(axis, onnxruntime::narrow(output_dims.size())); + if (axis < 0 || axis >= static_cast(output_dims.size())) + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "'axes' has an out of range axis"); + if (output_dims[onnxruntime::narrow(axis)] != 0) + return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "'axes' has a duplicate axis"); + output_dims[onnxruntime::narrow(axis)] = 1; + } + + { + auto begin = input_tensor.Shape().GetDims().begin(); + for (auto& axis_size : output_dims) { + if (axis_size == 0) + axis_size = *begin++; + } + assert(begin == input_tensor.Shape().GetDims().end()); + } + + TensorShape output_shape(output_dims); + p.output_tensor = ctx->Output(0, output_shape); + ORT_ENFORCE(nullptr != p.output_tensor); + p.input_tensor = &input_tensor; + return Status::OK(); + } +#endif + static TensorShapeVector ComputeOutputShape( const TensorShape& input_shape, const TensorShapeVector& axes) { diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.cc b/onnxruntime/core/providers/cpu/tensor/upsample.cc index b533f1b7dc80b..87ba56bc45dad 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsample.cc +++ b/onnxruntime/core/providers/cpu/tensor/upsample.cc @@ -2,10 +2,6 @@ // Licensed under the MIT License. #include "core/providers/cpu/tensor/upsample.h" - -#include - -#include "core/common/inlined_containers.h" #include "core/common/safeint.h" #include "core/platform/threadpool.h" #include "core/providers/cpu/tensor/upsample_antialias.h" @@ -35,46 +31,6 @@ REGISTER_VERSIONED_TYPED_KERNEL(int32_t, 9, 9); REGISTER_VERSIONED_TYPED_KERNEL(int8_t, 9, 9); REGISTER_VERSIONED_TYPED_KERNEL(uint8_t, 9, 9); -void UpsampleBase::AdjustOutputSizeAsPolicy(TensorShapeVector& output_dims, gsl::span input_dims, - InlinedVector& scales) const { - // AspectRatioPolicy::STRETCH is default policy when opset < 18 - if (keep_aspect_ratio_policy_ == AspectRatioPolicy::STRETCH) { - return; - } - - InlinedHashSet axes_set(axes_.begin(), axes_.end()); - - float scale_in_policy = 0.0f; - if (keep_aspect_ratio_policy_ == AspectRatioPolicy ::NOT_LARGER) { - scale_in_policy = std::numeric_limits::max(); - - for (size_t i = 0; i < scales.size(); i++) { - if (axes_set.empty() || axes_set.count(i) > 0) { - scale_in_policy = std::min(scale_in_policy, scales[i]); - } - } - } else if (keep_aspect_ratio_policy_ == AspectRatioPolicy ::NOT_SMALLER) { - scale_in_policy = std::numeric_limits::min(); - - for (size_t i = 0; i < scales.size(); i++) { - if (axes_set.empty() || axes_set.count(i) > 0) { - scale_in_policy = std::max(scale_in_policy, scales[i]); - } - } - } - - for (size_t i = 0; i < scales.size(); i++) { - // if axes is not specified (AKA axes_set.empty()), we apply the policy to all axes - if (axes_set.empty() || axes_set.count(i) > 0) { - scales[i] = scale_in_policy; - output_dims[i] = static_cast(std::round(scales[i] * input_dims[i])); - } else { - scales[i] = 1.0f; - output_dims[i] = input_dims[i]; - } - } -} - template void UpsampleNearest2x(int64_t batch_size, int64_t num_channels, diff --git a/onnxruntime/core/providers/cpu/tensor/upsamplebase.h b/onnxruntime/core/providers/cpu/tensor/upsamplebase.h index b0e309a70444f..7dcf88133e967 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsamplebase.h +++ b/onnxruntime/core/providers/cpu/tensor/upsamplebase.h @@ -4,9 +4,12 @@ #pragma once #include +#include +#include #include #include #include +#include #include #include @@ -120,6 +123,49 @@ void PrintAntiAliasBuffers(std::ostream& os, gsl::span bounds, gsl::spa os << std::endl; } +namespace upsamplebase_helper { + +inline void AdjustOutputSizeAsPolicy(TensorShapeVector& output_dims, gsl::span input_dims, + InlinedVector& scales, AspectRatioPolicy keep_aspect_ratio_policy, + const TensorShapeVector& axes) { + if (keep_aspect_ratio_policy == AspectRatioPolicy::STRETCH) { + return; + } + + std::unordered_set axes_set(axes.begin(), axes.end()); + + float scale_in_policy = 0.0f; + if (keep_aspect_ratio_policy == AspectRatioPolicy::NOT_LARGER) { + scale_in_policy = std::numeric_limits::max(); + + for (size_t i = 0; i < scales.size(); ++i) { + if (axes_set.empty() || axes_set.count(static_cast(i)) > 0) { + scale_in_policy = std::min(scale_in_policy, scales[i]); + } + } + } else if (keep_aspect_ratio_policy == AspectRatioPolicy::NOT_SMALLER) { + scale_in_policy = std::numeric_limits::min(); + + for (size_t i = 0; i < scales.size(); ++i) { + if (axes_set.empty() || axes_set.count(static_cast(i)) > 0) { + scale_in_policy = std::max(scale_in_policy, scales[i]); + } + } + } + + for (size_t i = 0; i < scales.size(); ++i) { + if (axes_set.empty() || axes_set.count(static_cast(i)) > 0) { + scales[i] = scale_in_policy; + output_dims[i] = static_cast(std::round(scales[i] * input_dims[i])); + } else { + scales[i] = 1.0f; + output_dims[i] = input_dims[i]; + } + } +} + +} // namespace upsamplebase_helper + class UpsampleBase { public: // Make this available in other EP via provider bridge @@ -597,6 +643,13 @@ class UpsampleBase { } }; // UpsampleBase +#ifndef SHARED_PROVIDER +inline void UpsampleBase::AdjustOutputSizeAsPolicy(TensorShapeVector& output_dims, gsl::span input_dims, + InlinedVector& scales) const { + upsamplebase_helper::AdjustOutputSizeAsPolicy(output_dims, input_dims, scales, keep_aspect_ratio_policy_, axes_); +} +#endif + } // namespace onnxruntime #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(pop) diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 60ac16018f539..9cfc38c8f292f 100755 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -944,8 +944,11 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, int32_t, Resize); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, uint8_t, Resize); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, ReverseSequence); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, RoiAlign); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, RoiAlign); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 15, float, RoiAlign); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 15, double, RoiAlign); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 21, float, RoiAlign); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 21, double, RoiAlign); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 21, MLFloat16, RoiAlign); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, int32_t, Slice); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, int64_t, Slice); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, ThresholdedRelu); @@ -1424,6 +1427,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 17, ScatterElements); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 17, ScatterND); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 19, float, GridSample); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 21, MLFloat16, RoiAlign); // Opset 17 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, float, LayerNormalization); @@ -1592,6 +1596,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, MLFloat16, HardSwish); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, BFloat16, HardSwish); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, float, GridSample); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, float, RoiAlign); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, double, RoiAlign); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, MLFloat16, RoiAlign); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, BFloat16, RoiAlign); // Opset 23. class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, 23, float, Attention); @@ -2027,8 +2035,11 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2676,6 +2687,10 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // Opset 23 BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/object_detection/roialign.cc b/onnxruntime/core/providers/cuda/object_detection/roialign.cc index 71fb066c2898f..b4f63b3aa04f2 100644 --- a/onnxruntime/core/providers/cuda/object_detection/roialign.cc +++ b/onnxruntime/core/providers/cuda/object_detection/roialign.cc @@ -7,11 +7,37 @@ namespace onnxruntime { namespace cuda { -#define REGISTER_KERNEL_TYPED(T) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ +#define ADD_VERSIONED_TYPED_ROIALIGN_OP_10(T) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ RoiAlign, \ kOnnxDomain, \ 10, \ + 15, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ + RoiAlign); + +#define ADD_VERSIONED_TYPED_ROIALIGN_OP_16(T) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + RoiAlign, \ + kOnnxDomain, \ + 16, \ + 21, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ + RoiAlign); + +#define ADD_TYPED_ROIALIGN_OP_22(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + RoiAlign, \ + kOnnxDomain, \ + 22, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()) \ @@ -67,13 +93,19 @@ Status RoiAlign::ComputeInternal(OpKernelContext* context) const { return Status::OK(); } -#define SPECIALIZED_COMPUTE(T) \ - REGISTER_KERNEL_TYPED(T) \ +#define SPECIALIZED_COMPUTE(T) \ + ADD_VERSIONED_TYPED_ROIALIGN_OP_10(T) \ + ADD_VERSIONED_TYPED_ROIALIGN_OP_16(T) \ + ADD_TYPED_ROIALIGN_OP_22(T) \ template Status RoiAlign::ComputeInternal(OpKernelContext* ctx) const; SPECIALIZED_COMPUTE(float) SPECIALIZED_COMPUTE(double) -// SPECIALIZED_COMPUTE(MLFloat16) +SPECIALIZED_COMPUTE(MLFloat16) + +// BFloat16 is available for RoiAlign op from version 22: +ADD_TYPED_ROIALIGN_OP_22(BFloat16) +template Status RoiAlign::ComputeInternal(OpKernelContext* ctx) const; } // namespace cuda }; // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu index 7acfd9d075461..76f6f26fd8a02 100644 --- a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu +++ b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu @@ -17,12 +17,13 @@ #include "roialign_impl.h" #include "core/providers/cuda/cu_inc/common.cuh" +#include "core/providers/cuda/shared_inc/accumulation_type.h" namespace onnxruntime { namespace cuda { template -__device__ T bilinear_interpolate( +__device__ AccumulationType_t bilinear_interpolate( const T* bottom_data, const int height, const int width, @@ -30,51 +31,61 @@ __device__ T bilinear_interpolate( T x, const bool is_mode_avg, const int index /* index for debug only*/) { + using TAcc = AccumulationType_t; + + TAcc y_acc = static_cast(y); + TAcc x_acc = static_cast(x); + // deal with cases that inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { + if (y_acc < static_cast(-1.0f) || y_acc > static_cast(height) || + x_acc < static_cast(-1.0f) || x_acc > static_cast(width)) { // empty - return 0; + return static_cast(0.0f); } - if (y <= 0) { - y = 0; + if (y_acc <= static_cast(0.0f)) { + y_acc = static_cast(0.0f); } - if (x <= 0) { - x = 0; + if (x_acc <= static_cast(0.0f)) { + x_acc = static_cast(0.0f); } - int y_low = (int)y; - int x_low = (int)x; + int y_low = static_cast(y_acc); + int x_low = static_cast(x_acc); int y_high; int x_high; if (y_low >= height - 1) { y_high = y_low = height - 1; - y = (T)y_low; + y_acc = static_cast(y_low); } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; - x = (T)x_low; + x_acc = static_cast(x_low); } else { x_high = x_low + 1; } - T ly = y - y_low; - T lx = x - x_low; - T hy = 1. - ly, hx = 1. - lx; + TAcc ly = y_acc - static_cast(y_low); + TAcc lx = x_acc - static_cast(x_low); + TAcc hy = static_cast(1.0f) - ly; + TAcc hx = static_cast(1.0f) - lx; // do bilinear interpolation - T v1 = bottom_data[y_low * width + x_low]; - T v2 = bottom_data[y_low * width + x_high]; - T v3 = bottom_data[y_high * width + x_low]; - T v4 = bottom_data[y_high * width + x_high]; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + TAcc v1 = static_cast(bottom_data[y_low * width + x_low]); + TAcc v2 = static_cast(bottom_data[y_low * width + x_high]); + TAcc v3 = static_cast(bottom_data[y_high * width + x_low]); + TAcc v4 = static_cast(bottom_data[y_high * width + x_high]); + TAcc w1 = hy * hx; + TAcc w2 = hy * lx; + TAcc w3 = ly * hx; + TAcc w4 = ly * lx; - T val = is_mode_avg - ? (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4) // mode Avg - : max(max(max(w1 * v1, w2 * v2), w3 * v3), w4 * v4); // mode Max + TAcc val = is_mode_avg + ? (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4) // mode Avg + : max(max(max(w1 * v1, w2 * v2), w3 * v3), w4 * v4); // mode Max return val; } @@ -97,6 +108,8 @@ __global__ void RoIAlignForward( const bool half_pixel, const int64_t* batch_indices_ptr, const int64_t batch_size) { + using TAcc = AccumulationType_t; + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; @@ -111,26 +124,27 @@ __global__ void RoIAlignForward( // If the index is out of range, we set the output to 0 for this RoI element. if (roi_batch_ind < 0 || roi_batch_ind >= batch_size) { CUDA_KERNEL_ASSERT(false && "batch_indices values are out of range"); - top_data[index] = 0; + top_data[index] = static_cast(0.0f); continue; } // Do not using rounding; this implementation detail is critical - T roi_offset = half_pixel ? T(0.5) : T(0); - T roi_start_w = offset_bottom_rois[0] * spatial_scale - roi_offset; - T roi_start_h = offset_bottom_rois[1] * spatial_scale - roi_offset; - T roi_end_w = offset_bottom_rois[2] * spatial_scale - roi_offset; - T roi_end_h = offset_bottom_rois[3] * spatial_scale - roi_offset; - - T roi_width = roi_end_w - roi_start_w; - T roi_height = roi_end_h - roi_start_h; + const TAcc spatial_scale_acc = static_cast(spatial_scale); + const TAcc roi_offset = half_pixel ? static_cast(0.5f) : static_cast(0.0f); + TAcc roi_start_w = static_cast(offset_bottom_rois[0]) * spatial_scale_acc - roi_offset; + TAcc roi_start_h = static_cast(offset_bottom_rois[1]) * spatial_scale_acc - roi_offset; + TAcc roi_end_w = static_cast(offset_bottom_rois[2]) * spatial_scale_acc - roi_offset; + TAcc roi_end_h = static_cast(offset_bottom_rois[3]) * spatial_scale_acc - roi_offset; + + TAcc roi_width = roi_end_w - roi_start_w; + TAcc roi_height = roi_end_h - roi_start_h; if (!half_pixel) { // backward compatibility // Force malformed ROIs to be 1x1 - roi_width = max(roi_width, (T)1.); - roi_height = max(roi_height, (T)1.); + roi_width = max(roi_width, static_cast(1.0f)); + roi_height = max(roi_height, static_cast(1.0f)); } - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + const TAcc bin_size_h = roi_height / static_cast(pooled_height); + const TAcc bin_size_w = roi_width / static_cast(pooled_width); const T* offset_bottom_data = bottom_data + static_cast((roi_batch_ind * channels + c) * height * width); @@ -138,27 +152,27 @@ __global__ void RoIAlignForward( // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio - : _Ceil(roi_height / pooled_height); // e.g., = 2 + : static_cast(_Ceil(roi_height / static_cast(pooled_height))); // e.g., = 2 int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : _Ceil(roi_width / pooled_width); + (sampling_ratio > 0) ? sampling_ratio : static_cast(_Ceil(roi_width / static_cast(pooled_width))); // We do average (integral) pooling inside a bin - const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + const TAcc count = static_cast(roi_bin_grid_h * roi_bin_grid_w); // e.g. = 4 - T output_val = 0.; + TAcc output_val = static_cast(0.0f); bool max_flag = false; for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 { - const T y = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + const TAcc y = roi_start_h + static_cast(ph) * bin_size_h + + (static_cast(iy) + static_cast(0.5f)) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); + const TAcc x = roi_start_w + static_cast(pw) * bin_size_w + + (static_cast(ix) + static_cast(0.5f)) * bin_size_w / + static_cast(roi_bin_grid_w); - T val = bilinear_interpolate( - offset_bottom_data, height, width, y, x, is_mode_avg, index); + const TAcc val = bilinear_interpolate( + offset_bottom_data, height, width, static_cast(y), static_cast(x), is_mode_avg, index); if (is_mode_avg) { output_val += val; @@ -176,7 +190,7 @@ __global__ void RoIAlignForward( output_val /= count; } - top_data[index] = output_val; + top_data[index] = static_cast(output_val); } } @@ -241,6 +255,8 @@ void RoiAlignImpl( SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) +SPECIALIZED_IMPL(half) +SPECIALIZED_IMPL(BFloat16) } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/grid_sample.cc b/onnxruntime/core/providers/cuda/tensor/grid_sample.cc index b9d47a27e8e83..d97d5fcbb0b5b 100755 --- a/onnxruntime/core/providers/cuda/tensor/grid_sample.cc +++ b/onnxruntime/core/providers/cuda/tensor/grid_sample.cc @@ -51,7 +51,6 @@ template GridSample::GridSample(const OpKernelInfo& info) : CudaKernel(info) { opset_start_version_ = info.node().SinceVersion(); - std::string mode_str = info.GetAttrOrDefault("mode", "bilinear"); std::string padding_mode_str = info.GetAttrOrDefault("padding_mode", "zeros"); align_corners_ = static_cast(info.GetAttrOrDefault("align_corners", 0)); diff --git a/onnxruntime/core/providers/cuda/tensor/upsample.cc b/onnxruntime/core/providers/cuda/tensor/upsample.cc index e7032d5880581..e2c08618264dd 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample.cc +++ b/onnxruntime/core/providers/cuda/tensor/upsample.cc @@ -380,5 +380,11 @@ Status Upsample::ComputeInternal(OpKernelContext* context) const { return BaseCompute(context, roi_array, scales_array, output_dims); } +template class Upsample; +template class Upsample; +template class Upsample; +template class Upsample; +template class Upsample; + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_utils.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_utils.h index 93f0e47005050..c57322e46bfe8 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_utils.h +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_utils.h @@ -45,332 +45,6 @@ static int GetNumProfiles(std::unordered_map>>& shape_ranges) { - // Serialize profile - flexbuffers::Builder builder; - auto profile_start = builder.StartMap(); - for (auto outer_it = shape_ranges.begin(); outer_it != shape_ranges.end(); ++outer_it) { - builder.TypedVector(outer_it->first.c_str(), [&] { - for (auto inner_it = outer_it->second.begin(); inner_it != outer_it->second.end(); ++inner_it) { - builder.Int(inner_it->first); - builder.Int(inner_it->second.first); - builder.Int(inner_it->second.second); - } - }); - } - builder.EndMap(profile_start); - builder.Finish(); - - // Save flexbuffer - std::ofstream file(file_name, std::ios::binary | std::ios::out); - auto buf = builder.GetBuffer(); - size_t size = builder.GetSize(); - file.write(reinterpret_cast(&buf[0]), size); - file.close(); -} - -// Deserialize engine profile -// [Deprecated] Use DeserializeProfileV2 -static std::unordered_map>> DeserializeProfile(std::ifstream& infile) { - // Load flexbuffer - infile.seekg(0, std::ios::end); - size_t length = infile.tellg(); - infile.seekg(0, std::ios::beg); - std::unique_ptr data{new char[length]}; - infile.read((char*)data.get(), length); - infile.close(); - - // Deserialize profile - std::unordered_map>> shape_ranges; - auto tensors_range_entries = flexbuffers::GetRoot((const uint8_t*)data.get(), length).AsMap(); - auto keys = tensors_range_entries.Keys(); - auto values = tensors_range_entries.Values(); - for (size_t i = 0, i_end = keys.size(); i < i_end; ++i) { - auto dim_range_vectors = values[i].AsTypedVector(); - std::unordered_map> inner_map; - for (size_t j = 0, j_end = dim_range_vectors.size() / 3; j < j_end; ++j) { - size_t idx = 3 * j; - inner_map[dim_range_vectors[idx].AsInt64()] = std::make_pair(dim_range_vectors[idx + 1].AsInt64(), dim_range_vectors[idx + 2].AsInt64()); - } - shape_ranges[keys[i].AsString().c_str()] = inner_map; - } - return shape_ranges; -} - -/* - * Seralize engine profile. (This function starts from ORT 1.15) - * - * - * (1) Single profile case: - * Assume tensor_a has two dynamic shape dimensions: dim_0 and dim_2, - * and tensor_b has one dynamic shape dimension: dim_1. - * - * The data before serialization will be: - * { - * tensor_a: { - * dim_0: [[min_shape_0, max_shape_0, opt_shape_0]], - * dim_2: [[min_shape_2, max_shape_2, opt_shape_2]] - * }, - * tensor_b: { - * dim_1: [[min_shape_1, max_shape_1, opt_shape_1]] - * } - * } - * - * The data after serialization will be: - * { - * tensor_a: [dim_0, min_shape_0, max_shape_0, opt_shape_0, dim_2, min_shape_2, max_shape_2, opt_shape_2] - * tensor_b: [dim_1, min_shape_1, max_shape_1, opt_shape_1] - * } - * - * - * (2) Multiple profiles case: - * For example, if the data before serialization is: - * { - * tensor_a: { - * dim_0: [[min_shape_0, max_shape_0, opt_shape_0], [min_shape_1, max_shape_1, opt_shape_1]] - * }, - * tensor_b: { - * dim_1: [[min_shape_2, max_shape_2, opt_shape_2], [min_shape_3, max_shape_3, opt_shape_3]] - * } - * } - * - * The data after serialization will be: - * { - * tensor_a: [dim_0, min_shape_0, max_shape_0, opt_shape_0, dim_0, min_shape_1, max_shape_1, opt_shape_1] - * | | | | - * ---------------- profile 0 ----------------- ---------------- profile 1 ----------------- - * - * tensor_b: [dim_1, min_shape_2, max_shape_2, opt_shape_2, dim_1, min_shape_3, max_shape_3, opt_shape_3] - * | | | | - * ---------------- profile 0 ----------------- ---------------- profile 1 ----------------- - * } - * - */ -static void SerializeProfileV2(const std::string& file_name, std::unordered_map>>>& shape_ranges) { - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] In SerializeProfileV2()"; - // Serialize profile - flexbuffers::Builder builder; - auto tensor_map_start = builder.StartMap(); - for (auto tensor_it = shape_ranges.begin(); tensor_it != shape_ranges.end(); tensor_it++) { // iterate tensors - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] input tensor is '" << tensor_it->first.c_str() << "'"; - builder.TypedVector(tensor_it->first.c_str(), [&] { - for (auto dim_it = tensor_it->second.begin(); dim_it != tensor_it->second.end(); dim_it++) { - size_t num_profiles = dim_it->second.size(); - for (size_t i = 0; i < num_profiles; i++) { - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] profile #" << i << ", dim is " << dim_it->first; - builder.Int(dim_it->first); - builder.Int(dim_it->second[i][0]); - builder.Int(dim_it->second[i][1]); - builder.Int(dim_it->second[i][2]); - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] " << dim_it->first << ", " << dim_it->second[i][0] << ", " << dim_it->second[i][1] << ", " << dim_it->second[i][2]; - } - } - }); - } - builder.EndMap(tensor_map_start); - builder.Finish(); - - // Save flexbuffer - std::ofstream file(file_name, std::ios::binary | std::ios::out); - auto buf = builder.GetBuffer(); - size_t size = builder.GetSize(); - file.write(reinterpret_cast(&buf[0]), size); - file.close(); -} - -/* - * Deserialize engine profile. (This function starts from ORT 1.15) - * - * - * (1) Single profile case: - * Assume tensor_a has two dynamic shape dimensions: dim_0 and dim_2, - * and tensor_b has one dynamic shape dimension: dim_1. - * - * The data in profile file will be: - * { - * tensor_a: [dim_0, min_shape_0, max_shape_0, opt_shape_0, dim_2, min_shape_2, max_shape_2, opt_shape_2] - * tensor_b: [dim_1, min_shape_1, max_shape_1, opt_shape_1] - * } - * - * The data after deserialization will be: - * { - * tensor_a: { - * dim_0: [[min_shape_0, max_shape_0, opt_shape_0]], - * dim_2: [[min_shape_2, max_shape_2, opt_shape_2]] - * }, - * tensor_b: { - * dim_1: [[min_shape_1, max_shape_1, opt_shape_1]] - * } - * } - * - * - * (2) Multiple profiles case: - * For example, if the data in profile file is: - * { - * tensor_a: [dim_0, min_shape_0, max_shape_0, opt_shape_0, dim_0, min_shape_1, max_shape_1, opt_shape_1] - * | | | | - * ---------------- profile 0 ----------------- ---------------- profile 1 ----------------- - * - * tensor_b: [dim_1, min_shape_2, max_shape_2, opt_shape_2, dim_1, min_shape_3, max_shape_3, opt_shape_3] - * | | | | - * ---------------- profile 0 ----------------- ---------------- profile 1 ----------------- - * } - * - * The data after deserialization will be: - * { - * tensor_a: { - * dim_0: [[min_shape_0, max_shape_0, opt_shape_0], [min_shape_1, max_shape_1, opt_shape_1]] - * }, - * tensor_b: { - * dim_1: [[min_shape_2, max_shape_2, opt_shape_2], [min_shape_3, max_shape_3, opt_shape_3]] - * } - * } - */ -static std::unordered_map>>> DeserializeProfileV2(std::ifstream& infile) { - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] In DeserializeProfileV2()"; - // Load flexbuffer - infile.seekg(0, std::ios::end); - size_t length = infile.tellg(); - infile.seekg(0, std::ios::beg); - std::unique_ptr data{new char[length]}; - infile.read((char*)data.get(), length); - infile.close(); - - // Deserialize profile - std::unordered_map>>> shape_ranges; - auto tensors_range_entries = flexbuffers::GetRoot((const uint8_t*)data.get(), length).AsMap(); - auto keys = tensors_range_entries.Keys(); - auto values = tensors_range_entries.Values(); - for (size_t i = 0, end = keys.size(); i < end; ++i) { // iterate tensors - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] input tensor is '" << keys[i].AsString().c_str() << "'"; - auto dim_range_vector = values[i].AsTypedVector(); - std::unordered_map>> inner_map; - std::vector> profile_vector; - - for (size_t k = 0; k < (dim_range_vector.size() / 4); k++) { // iterate dim, min, max, opt for all profiles - std::vector shape_vector; - auto idx = 4 * k; - auto dim = dim_range_vector[idx].AsInt64(); - shape_vector.push_back(dim_range_vector[idx + 1].AsInt64()); // min shape - shape_vector.push_back(dim_range_vector[idx + 2].AsInt64()); // max shape - shape_vector.push_back(dim_range_vector[idx + 3].AsInt64()); // opt shape - - if (inner_map.find(dim) == inner_map.end()) { - inner_map[dim] = profile_vector; - } - inner_map[dim].push_back(shape_vector); - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] " << dim << ", " << shape_vector[0] << ", " << shape_vector[1] << ", " << shape_vector[2]; - } - shape_ranges[keys[i].AsString().c_str()] = inner_map; - } - return shape_ranges; -} - -/* - * Compare profile shapes from profile file (.profile) with explicit profile min/max/opt shapes. - * Return false meaning no need to rebuild engine if everything is same. - * Otherwise return true and engine needs to be rebuilt. - */ -static bool CompareProfiles(const std::string& file_name, - std::unordered_map>>& profile_min_shapes, - std::unordered_map>>& profile_max_shapes, - std::unordered_map>>& profile_opt_shapes) { - std::ifstream profile_file(file_name, std::ios::binary | std::ios::in); - if (!profile_file) { - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] " << file_name << " doesn't exist."; - return true; - } - - std::unordered_map>>> shape_ranges; - shape_ranges = DeserializeProfileV2(profile_file); - - /* The format of the two data structures are below, for example: - * - * shape_ranges: - * { - * tensor_a: { - * dim_0: [[min_shape, max_shape, opt_shape]], - * dim_2: [[min_shape, max_shape, opt_shape]] - * }, - * tensor_b: { - * dim_1: [[min_shape, max_shape, opt_shape]] - * } - * } - * - * profile_min_shapes: - * { - * tensor_a: [[dim_0_value_0, dim_1_value_1, dim_2_value_2]], - * tensor_b: [[dim_0_value_3, dim_1_value_4, dim_2_value_5]] - * } - * - */ - - // Check number of dynamic shape inputs - if (profile_min_shapes.size() != shape_ranges.size()) { - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] Numbers of dynamic shape inputs are not the same."; - return true; - } - - // Iterate through shape_ranges map - for (auto tensor_it = shape_ranges.begin(); tensor_it != shape_ranges.end(); tensor_it++) { // iterate tensors - auto tensor_name = tensor_it->first; - if (profile_min_shapes.find(tensor_name) == profile_min_shapes.end()) { - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] Tensor name '" << tensor_name << "' doesn't exist in trt_profile_min_shapes."; - return true; - } - - for (auto dim_it = tensor_it->second.begin(); dim_it != tensor_it->second.end(); dim_it++) { // iterate dimensions - auto dim = dim_it->first; - auto num_profiles = GetNumProfiles(profile_min_shapes); - - if (dim_it->second.size() != static_cast(num_profiles)) { - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] Numbers of profiles are not the same."; - return true; - } - - for (size_t i = 0; i < dim_it->second.size(); i++) { // iterate (multiple) profile(s) - auto shape_values = dim_it->second[i]; - if (dim > (profile_min_shapes[tensor_name][i].size() - 1)) { - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] dimension " << dim << " of '" << tensor_name << "' in " << file_name << " exceeds the total dimension of trt_profile_min_shapes."; - return true; - } - - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] min shape value of dimension " << dim << " of '" << tensor_name << "' is " << profile_min_shapes[tensor_name][i][dim]; - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] min shape value of dimension " << dim << " of '" << tensor_name << "' is " << shape_values[0] << " in " << file_name; - if (profile_min_shapes[tensor_name][i][dim] != shape_values[0]) { - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] min shape values of dimension " << dim << " of '" << tensor_name << "' are not the same"; - return true; - } - - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] max shape value of dimension " << dim << " of '" << tensor_name << "' is " << profile_max_shapes[tensor_name][i][dim]; - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] max shape value of dimension " << dim << " of '" << tensor_name << "' is " << shape_values[1] << " in " << file_name; - if (profile_max_shapes[tensor_name][i][dim] != shape_values[1]) { - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] max shape values of dimension " << dim << " of '" << tensor_name << "' are not the same"; - return true; - } - - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] opt shape value of dimension " << dim << " of '" << tensor_name << "' is " << profile_opt_shapes[tensor_name][i][dim]; - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] opt shape value of dimension " << dim << " of '" << tensor_name << "' is " << shape_values[2] << " in " << file_name; - if (profile_opt_shapes[tensor_name][i][dim] != shape_values[2]) { - LOGS_DEFAULT(VERBOSE) << "[NvTensorRTRTX EP] opt shape values of dimension " << dim << " of '" << tensor_name << "' are not the same"; - return true; - } - } - } - } - return false; -} - /* * Get cache by name * @@ -394,37 +68,6 @@ static std::string GetComputeCapability(const cudaDeviceProp& prop) { return compute_capability; } -/* - * Get cache by type - * - * \param root root path of the cache - * \param file_extension It could be ".engine", ".profile" or ".timing" - */ -static std::vector GetCachesByType(const std::string& root, std::string file_extension) { - std::vector cache_files; - for (const auto& entry : fs::directory_iterator(root)) { - if (fs::path(file_extension) == fs::path(entry).extension()) { - cache_files.push_back(fs::path(entry)); - } - } - return cache_files; -} - -static bool IsCacheExistedByType(const std::string& root, std::string file_extension) { - auto cache_files = GetCachesByType(root, file_extension); - if (cache_files.size() == 0) { - return false; - } - return true; -} - -static void RemoveCachesByType(const std::string& root, std::string file_extension) { - auto cache_files = GetCachesByType(root, file_extension); - for (const auto& entry : cache_files) { - fs::remove(entry); - } -} - /** * * Helper class to generate engine id via model name/model content/env metadata @@ -631,51 +274,6 @@ static bool ParseProfileShapes(std::string profile_shapes_string, std::unordered return true; } -static std::vector split(const std::string& str, char delimiter) { - std::vector tokens; - std::string token; - std::istringstream tokenStream(str); - while (std::getline(tokenStream, token, delimiter)) { - tokens.push_back(token); - } - return tokens; -} - -static std::string join(const std::vector& vec, const std::string& delimiter) { - std::string result; - for (size_t i = 0; i < vec.size(); ++i) { - result += vec[i]; - if (i < vec.size() - 1) { - result += delimiter; - } - } - return result; -} - -/* - * Parse engine cache name suffix when user customizes prefix for engine cache name - * - * For example: - * When default subgraph name is "NvExecutionProvider_TRTKernel_graph_torch-jit-export_2068723788287043730_189_189_fp16" - * This func will generate the suffix "2068723788287043730_189_fp16" - * - */ -static std::string GetCacheSuffix(const std::string& fused_node_name, const std::string& trt_node_name_with_precision) { - std::vector split_fused_node_name = split(fused_node_name, '_'); - if (split_fused_node_name.size() >= 3) { - // Get index of model hash from fused_node_name - std::string model_hash = split_fused_node_name[split_fused_node_name.size() - 3]; - size_t index = fused_node_name.find(model_hash); - // Parse suffix from trt_node_name_with_precision, as it has additional precision info - std::vector suffix_group = split(trt_node_name_with_precision.substr(index), '_'); - if (suffix_group.size() > 2) { - suffix_group.erase(suffix_group.begin() + 2); - } - return join(suffix_group, "_"); - } - return ""; -} - /* * Checks if there is a an element with value `-1` in nvinfer1::Dims */ @@ -700,37 +298,4 @@ static bool checkTrtTensorIsDynamic(nvinfer1::ITensor* tensor) { return checkTrtDimIsDynamic(tensor->getDimensions()); } } - -struct ScopedContext { - explicit ScopedContext(int device_id) : pushed_(true) { - CUcontext cu_context = 0; - CU_CALL_THROW(cuCtxGetCurrent(&cu_context)); - if (!cu_context) { - // cuCtxGetCurrent succeeded but returned nullptr, which indicates that no CUDA context - // is currently set for this thread. This implicates that there is not user created context. - // We use runtime API to initialize a context for the specified device. - CUDA_CALL_THROW(cudaSetDevice(device_id)); - CU_CALL_THROW(cuCtxGetCurrent(&cu_context)); - } - CU_CALL_THROW(cuCtxPushCurrent(cu_context)); - } - - /** \brief Push an existing context (e.g. CIG context); pop on destruction. */ - explicit ScopedContext(CUcontext ctx) : pushed_(ctx != nullptr) { - if (ctx != nullptr) { - CU_CALL_THROW(cuCtxPushCurrent(ctx)); - } - } - - ScopedContext(const ScopedContext&) = delete; - - ~ScopedContext() { - if (pushed_) { - cuCtxPopCurrent(nullptr); - } - } - - private: - bool pushed_ = true; -}; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc index 4d5c5b45f65dd..31ff17f241371 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_factory.cc @@ -16,7 +16,6 @@ #include "core/framework/plugin_ep_stream.h" #include "core/providers/nv_tensorrt_rtx/nv_provider_options.h" #include "core/providers/nv_tensorrt_rtx/nv_execution_provider_custom_ops.h" -#include "core/providers/nv_tensorrt_rtx/nv_execution_provider_utils.h" #include "core/providers/cuda/cuda_stream_handle.h" // D3D12 headers for graphics interop on Windows @@ -30,6 +29,7 @@ #include "nv_provider_factory_creator.h" #include "nv_data_transfer.h" #include "nv_allocator.h" +#include "nv_scoped_context.h" using namespace onnxruntime; diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_scoped_context.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_scoped_context.h new file mode 100644 index 0000000000000..8a16533b01c7b --- /dev/null +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_scoped_context.h @@ -0,0 +1,42 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Licensed under the MIT License. + +#include "nv_includes.h" +#include "core/providers/cuda/cuda_pch.h" +#include "core/providers/cuda/shared_inc/cuda_call.h" + +namespace onnxruntime { +struct ScopedContext { + explicit ScopedContext(int device_id) : pushed_(true) { + CUcontext cu_context = 0; + CU_CALL_THROW(cuCtxGetCurrent(&cu_context)); + if (!cu_context) { + // cuCtxGetCurrent succeeded but returned nullptr, which indicates that no CUDA context + // is currently set for this thread. This implicates that there is not user created context. + // We use runtime API to initialize a context for the specified device. + CUDA_CALL_THROW(cudaSetDevice(device_id)); + CU_CALL_THROW(cuCtxGetCurrent(&cu_context)); + } + CU_CALL_THROW(cuCtxPushCurrent(cu_context)); + } + + /** \brief Push an existing context (e.g. CIG context); pop on destruction. */ + explicit ScopedContext(CUcontext ctx) : pushed_(ctx != nullptr) { + if (ctx != nullptr) { + CU_CALL_THROW(cuCtxPushCurrent(ctx)); + } + } + + ScopedContext(const ScopedContext&) = delete; + + ~ScopedContext() { + if (pushed_) { + cuCtxPopCurrent(nullptr); + } + } + + private: + bool pushed_ = true; +}; +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index eba0a8c2615aa..52060489e7c54 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -445,6 +445,7 @@ void QnnLogging(const char* format, QnnLog_Level_t level, uint64_t timestamp, va_list argument_parameter) { + ORT_UNUSED_PARAMETER(level); ORT_UNUSED_PARAMETER(timestamp); if (!::onnxruntime::logging::LoggingManager::HasDefaultLogger()) { @@ -454,8 +455,7 @@ void QnnLogging(const char* format, } const auto& logger = ::onnxruntime::logging::LoggingManager::DefaultLogger(); - // Map QNN log level to ORT severity - logging::Severity severity = QnnBackendManager::MapQNNLogLevelToOrtSeverity(level); + const auto severity = ::onnxruntime::logging::Severity::kVERBOSE; const auto data_type = ::onnxruntime::logging::DataType::SYSTEM; if (logger.OutputIsEnabled(severity, data_type)) { @@ -529,22 +529,6 @@ QnnLog_Level_t QnnBackendManager::MapOrtSeverityToQNNLogLevel(logging::Severity } } -/* static */ logging::Severity QnnBackendManager::MapQNNLogLevelToOrtSeverity(QnnLog_Level_t qnn_log_level) { - // Map QNN log level to ORT log severity - switch (qnn_log_level) { - case QNN_LOG_LEVEL_VERBOSE: - case QNN_LOG_LEVEL_DEBUG: - return logging::Severity::kVERBOSE; - case QNN_LOG_LEVEL_INFO: - return logging::Severity::kINFO; - case QNN_LOG_LEVEL_WARN: - return logging::Severity::kWARNING; - case QNN_LOG_LEVEL_ERROR: - default: - return logging::Severity::kERROR; - } -} - Status QnnBackendManager::ResetQnnLogLevel(std::optional ort_log_level) { std::lock_guard lock(logger_recursive_mutex_); if (!backend_setup_completed_ || logger_ == nullptr) { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index dfa40a2c8aa0d..fe4ec0b7018a5 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -270,9 +270,6 @@ class QnnBackendManager : public std::enable_shared_from_this Qnn_ErrorHandle_t ReleaseDmaData(Qnn_ContextBinaryDmaDataMem_t data_mem, void* mapped_base_ptr); #endif - QnnLog_Level_t MapOrtSeverityToQNNLogLevel(logging::Severity ort_log_level); - static logging::Severity MapQNNLogLevelToOrtSeverity(QnnLog_Level_t qnn_log_level); - #ifdef QNN_FILE_MAPPED_WEIGHTS_AVAILABLE typedef struct FileMappingCallbackInfo { void* const mapped_file_ptr; @@ -379,6 +376,7 @@ class QnnBackendManager : public std::enable_shared_from_this const char* QnnProfileErrorToString(QnnProfile_Error_t error); std::string QnnErrorHandleToString(Qnn_ErrorHandle_t error); + QnnLog_Level_t MapOrtSeverityToQNNLogLevel(logging::Severity ort_log_level); // Adds a new QNN context. // Transfers ownership of `context_handle` (i.e., responsibility of freeing it) to this instance diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index ad22187a75cd9..a49376066009a 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -46,8 +46,8 @@ using namespace onnxruntime; /// @brief Gets the path of directory containing the dynamic library that contains the address. /// @param address An address of a function or variable in the dynamic library. /// @return The path of the directory containing the dynamic library, or an empty string if the path cannot be determined. -static onnxruntime::PathString GetDynamicLibraryLocationByAddress(const void* address) { #ifdef _WIN32 +static onnxruntime::PathString GetDynamicLibraryLocationByAddress(const void* address) { HMODULE moduleHandle; if (!::GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, reinterpret_cast(address), &moduleHandle)) { @@ -66,11 +66,9 @@ static onnxruntime::PathString GetDynamicLibraryLocationByAddress(const void* ad buffer.resize(requiredSize); return {std::move(buffer)}; } -#else - std::ignore = address; -#endif return {}; } +#endif vaip_core::OrtApiForVaip* create_org_api_hook(); struct OrtVitisAIEpAPI { diff --git a/onnxruntime/core/providers/vitisai/onnxruntime_providers_vitisai.rc b/onnxruntime/core/providers/vitisai/onnxruntime_providers_vitisai.rc new file mode 100644 index 0000000000000..968086ebd2613 --- /dev/null +++ b/onnxruntime/core/providers/vitisai/onnxruntime_providers_vitisai.rc @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// This file REQUIRES the following external definitions: +// FILE_NAME, VER_MAJOR, VER_MINOR, VER_BUILD, VER_PRIVATE, and VER_STRING + +#include + +#if defined(DEBUG) || defined(_DEBUG) +#define VER_DEBUG VS_FF_DEBUG +#else +#define VER_DEBUG 0 +#endif + +// ----------------------------------------------------------------------------- + +VS_VERSION_INFO VERSIONINFO +FILEVERSION VER_MAJOR, VER_MINOR, VER_BUILD, VER_PRIVATE +PRODUCTVERSION VER_MAJOR, VER_MINOR, VER_BUILD, VER_PRIVATE +FILEFLAGSMASK VS_FFI_FILEFLAGSMASK +FILEFLAGS VER_DEBUG +FILEOS VOS__WINDOWS32 +FILETYPE VFT_DLL +FILESUBTYPE VFT2_UNKNOWN + +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "040904E4" + BEGIN + VALUE "CompanyName", "Microsoft Corporation" + VALUE "FileDescription", "ONNX Runtime VitisAI Provider" + VALUE "FileVersion", VER_STRING + VALUE "InternalName", "ONNX Runtime VitisAI Provider" + VALUE "LegalCopyright", "\251 Microsoft Corporation. All rights reserved." + VALUE "OriginalFilename", FILE_NAME + VALUE "ProductName", "Microsoft\256 Windows\256 Operating System" + VALUE "ProductVersion", VER_STRING + END + END + + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x409, 1252 + END +END diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 7ea25ea115567..f64a40145f535 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -113,7 +113,6 @@ common::Status VitisAIExecutionProvider::Compile(const std::vector ep_context_node_ptrs; auto get_config_entry = [](const void* state, const char* entry_name) -> vaip_core::DllSafe { const onnxruntime::RunOptions& run_options = *static_cast(state); auto ret = run_options.GetConfigOptions().GetConfigEntry(std::string(entry_name)); diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 1a925a16a6a71..937a96a619822 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1545,6 +1545,55 @@ void addGlobalMethods(py::module& m) { }, R"pbdoc("Validate a compiled model's compatibility information for one or more EP devices.)pbdoc"); + m.def( + "get_compatibility_info_from_model", + [](const std::basic_string& model_path, const std::string& ep_type) -> py::object { + Ort::AllocatorWithDefaultOptions allocator; + Ort::AllocatedStringPtr compat_info = Ort::GetCompatibilityInfoFromModelAllocated( + model_path.c_str(), ep_type.c_str(), allocator); + if (compat_info.get() == nullptr) { + return py::none(); + } + return py::str(compat_info.get()); + }, + R"pbdoc(Extract EP compatibility info from a precompiled model file. + +Parses the model file to extract the compatibility info string for a specific execution provider +from the model's metadata properties. Returns None if no compatibility info exists for the EP. + +Args: + model_path: Path to the ONNX model file. + ep_type: The execution provider type string (e.g. "CPUExecutionProvider"). + +Returns: + The compatibility info string, or None if not found. +)pbdoc"); + + m.def( + "get_compatibility_info_from_model_bytes", + [](const py::buffer& model_data, const std::string& ep_type) -> py::object { + py::buffer_info info = model_data.request(); + Ort::AllocatorWithDefaultOptions allocator; + Ort::AllocatedStringPtr compat_info = Ort::GetCompatibilityInfoFromModelBytesAllocated( + info.ptr, static_cast(info.size * info.itemsize), ep_type.c_str(), allocator); + if (compat_info.get() == nullptr) { + return py::none(); + } + return py::str(compat_info.get()); + }, + R"pbdoc(Extract EP compatibility info from precompiled model bytes in memory. + +Same as get_compatibility_info_from_model but reads from a buffer instead of a file. +Accepts bytes, bytearray, memoryview, or any object supporting the buffer protocol. + +Args: + model_data: The model data as a buffer (bytes, bytearray, memoryview, etc.). + ep_type: The execution provider type string (e.g. "CPUExecutionProvider"). + +Returns: + The compatibility info string, or None if not found. +)pbdoc"); + m.def( "copy_tensors", [](const std::vector& src, const std::vector& dest, py::object& py_arg) { diff --git a/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py b/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py index ed067a1362663..743bf50f6c608 100644 --- a/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py +++ b/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py @@ -110,9 +110,8 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): ) return else: - # Shape inference failed. Use default skip_index=1 (no broadcasting) since both - # Add inputs have already been verified as non-initializer dynamic tensors above. - logger.debug("symbolic shape inference failed, using default skip_index for SkipLayerNormalization") + logger.debug("skip SkipLayerNormalization fusion since symbolic shape inference failed") + return gather_path = self.model.match_parent_path(add, ["Gather"], [None]) if gather_path is not None and self.model.find_graph_input(gather_path[0].input[1]) is None: diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc index 424d6cbac743c..8c5859823ac16 100644 --- a/onnxruntime/test/framework/tensorutils_test.cc +++ b/onnxruntime/test/framework/tensorutils_test.cc @@ -13,6 +13,7 @@ #include #include #include +#include #include "gtest/gtest.h" #include "gmock/gmock.h" @@ -519,43 +520,109 @@ class PathValidationTest : public ::testing::Test { // Clean up the temporary directory. std::filesystem::remove_all(base_dir_); std::filesystem::remove_all(outside_dir_); + + for (const auto& other_dir : other_dirs_) { + std::filesystem::remove_all(other_dir); + } + + for (const auto& other_file : other_files_) { + std::filesystem::remove(other_file); + } + } + + // Create directory that will be removed during test teardown. + void CreateDirectories(std::filesystem::path dir) { + std::filesystem::create_directories(dir); + other_dirs_.push_back(std::move(dir)); + } + + // Create empty file that will be removed during test teardown. + void CreateEmptyFile(std::filesystem::path file_path) { + std::ofstream{file_path}; + other_files_.push_back(std::move(file_path)); } std::filesystem::path base_dir_; std::filesystem::path outside_dir_; + std::vector other_dirs_; + std::vector other_files_; }; // Test cases for ValidateExternalDataPath. TEST_F(PathValidationTest, ValidateExternalDataPath) { + std::filesystem::path model_path = base_dir_ / "model.onnx"; + std::filesystem::path cwd = std::filesystem::current_path(); + const bool is_cwd_root = cwd == cwd.root_path(); + + // Create empty external data files that we'll need for testing. + CreateEmptyFile(base_dir_ / "data.bin"); + CreateDirectories(base_dir_ / "sub"); + CreateEmptyFile(base_dir_ / "sub" / "data.bin"); + CreateEmptyFile(cwd / "data.bin"); + CreateDirectories(cwd / "abc"); + CreateEmptyFile(cwd / "abc" / "data.bin"); + CreateEmptyFile(cwd / "data..bin"); + // Valid relative path. - ASSERT_STATUS_OK(utils::ValidateExternalDataPath(base_dir_, "data.bin")); + ASSERT_STATUS_OK(utils::ValidateExternalDataPath(model_path, "data.bin")); - // Empty location. - // Only validate it is not an absolute path. - ASSERT_TRUE(utils::ValidateExternalDataPath(base_dir_, "").IsOK()); + // Empty location not allowed. + { + Status status = utils::ValidateExternalDataPath(model_path, ""); + ASSERT_THAT(status.ErrorMessage(), ::testing::HasSubstr("Empty external data path")); + } // Path with ".." that escapes the base directory. - ASSERT_FALSE(utils::ValidateExternalDataPath(base_dir_, "../data.bin").IsOK()); + ASSERT_FALSE(utils::ValidateExternalDataPath(model_path, "../data.bin").IsOK()); // Absolute path. + { + Status status; #ifdef _WIN32 - ASSERT_FALSE(utils::ValidateExternalDataPath(base_dir_, "C:\\data.bin").IsOK()); - ASSERT_FALSE(utils::ValidateExternalDataPath("", "C:\\data.bin").IsOK()); -#else - ASSERT_FALSE(utils::ValidateExternalDataPath(base_dir_, "/data.bin").IsOK()); - ASSERT_FALSE(utils::ValidateExternalDataPath("", "/data.bin").IsOK()); + status = utils::ValidateExternalDataPath(model_path, "C:\\data.bin"); + ASSERT_THAT(status.ErrorMessage(), ::testing::HasSubstr("Absolute path not allowed")); + + status = utils::ValidateExternalDataPath("", "C:\\data.bin"); + ASSERT_THAT(status.ErrorMessage(), ::testing::HasSubstr("Absolute path not allowed")); #endif // Absolute path. + // Paths starting from / should be rejected even on Windows. + status = utils::ValidateExternalDataPath(model_path, "/data.bin"); + ASSERT_THAT(status.ErrorMessage(), ::testing::HasSubstr("Absolute path not allowed")); + + status = utils::ValidateExternalDataPath("", "/data.bin"); + ASSERT_THAT(status.ErrorMessage(), ::testing::HasSubstr("Absolute path not allowed")); + } + // Windows vs Unix path separators. - ASSERT_STATUS_OK(utils::ValidateExternalDataPath(base_dir_, "sub/data.bin")); - ASSERT_STATUS_OK(utils::ValidateExternalDataPath(base_dir_, "sub\\data.bin")); +#ifdef _WIN32 + ASSERT_STATUS_OK(utils::ValidateExternalDataPath(model_path, "sub\\data.bin")); +#endif + ASSERT_STATUS_OK(utils::ValidateExternalDataPath(model_path, "sub/data.bin")); + + // Model in a directory that does not exist. + ASSERT_FALSE(utils::ValidateExternalDataPath("non_existent_dir/model.onnx", "data.bin").IsOK()); + + // Model path is a bare filename (no directory component). + ASSERT_STATUS_OK(utils::ValidateExternalDataPath("model.onnx", "data.bin")); + ASSERT_EQ(utils::ValidateExternalDataPath("model.onnx", "../data.bin").IsOK(), is_cwd_root); + + // Model relative path checks. + ASSERT_STATUS_OK(utils::ValidateExternalDataPath("./model.onnx", "data.bin")); + ASSERT_EQ(utils::ValidateExternalDataPath("./model.onnx", "../data.bin").IsOK(), is_cwd_root); +#ifdef _WIN32 + ASSERT_STATUS_OK(utils::ValidateExternalDataPath(".\\model.onnx", "data.bin")); + ASSERT_EQ(utils::ValidateExternalDataPath(".\\model.onnx", "../data.bin").IsOK(), is_cwd_root); +#endif - // Base directory does not exist. - ASSERT_STATUS_OK(utils::ValidateExternalDataPath("non_existent_dir", "data.bin")); + ASSERT_STATUS_OK(utils::ValidateExternalDataPath("./abc/model.onnx", "data.bin")); +#ifdef _WIN32 + ASSERT_STATUS_OK(utils::ValidateExternalDataPath(".\\abc\\model.onnx", "data.bin")); +#endif // - // Tests for an empty base directory. - // The base directory would be empty when 1) the session loads a model from bytes and 2) the application does not + // Tests for an empty model path (model loaded from bytes). + // The model path would be empty when 1) the session loads a model from bytes and 2) the application does not // set an external file folder path via the session config option // kOrtSessionOptionsModelExternalInitializersFileFolderPath. // @@ -568,17 +635,18 @@ TEST_F(PathValidationTest, ValidateExternalDataPath) { ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "data..bin")); // A path that would escape the current working directory is invalid. - ASSERT_FALSE(utils::ValidateExternalDataPath("", "../data.bin").IsOK()); + ASSERT_EQ(utils::ValidateExternalDataPath("", "../data.bin").IsOK(), is_cwd_root); // A path that uses ".." but would not escape the current working directory should be fine. ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "a/../data.bin")); // A path with multiple internal ".." that would escape current working direction should fail. - ASSERT_FALSE(utils::ValidateExternalDataPath("", "a/../../data.bin").IsOK()); + ASSERT_EQ(utils::ValidateExternalDataPath("", "a/../../data.bin").IsOK(), is_cwd_root); } TEST_F(PathValidationTest, ValidateExternalDataPathWithSymlinkInside) { // Symbolic link that points inside the base directory. + auto model_path = base_dir_ / "model.onnx"; try { auto target = base_dir_ / "target.bin"; std::ofstream{target}; @@ -588,11 +656,12 @@ TEST_F(PathValidationTest, ValidateExternalDataPathWithSymlinkInside) { GTEST_SKIP() << "Skipping symlink tests since symlink creation is not supported in this environment. Exception: " << e.what(); } - ASSERT_STATUS_OK(utils::ValidateExternalDataPath(base_dir_, "link.bin")); + ASSERT_STATUS_OK(utils::ValidateExternalDataPath(model_path, "link.bin")); } TEST_F(PathValidationTest, ValidateExternalDataPathWithSymlinkOutside) { // Symbolic link that points outside the base directory. + auto model_path = base_dir_ / "model.onnx"; auto outside_target = outside_dir_ / "outside.bin"; try { { @@ -603,7 +672,60 @@ TEST_F(PathValidationTest, ValidateExternalDataPathWithSymlinkOutside) { } catch (const std::exception& e) { GTEST_SKIP() << "Skipping symlink tests since symlink creation is not supported in this environment. Exception: " << e.what(); } - ASSERT_FALSE(utils::ValidateExternalDataPath(base_dir_, "outside_link.bin").IsOK()); + ASSERT_FALSE(utils::ValidateExternalDataPath(model_path, "outside_link.bin").IsOK()); +} + +TEST_F(PathValidationTest, ValidateExternalDataPathEmptyModelPathWithSymlinkInside) { + // Test external data path validation when the model path is empty. + // Specifically tests that the following scenario is valid: + // - A symbolic link within the current working directory pointing to a file still within CWD. + try { + std::filesystem::path cwd = std::filesystem::current_path(); + std::filesystem::path sub_dir = cwd / "symlink_test_subdir"; + CreateDirectories(sub_dir); + + std::filesystem::path target = sub_dir / "target_inside.bin"; + std::filesystem::path symlink = sub_dir / "link_inside.bin"; + std::ofstream{target}; + std::filesystem::create_symlink(target, symlink); + } catch (const std::exception& e) { + GTEST_SKIP() << "Skipping test due to failure setting up directory and symlink files: " + << e.what(); + } + + EXPECT_STATUS_OK(utils::ValidateExternalDataPath("", "./symlink_test_subdir/link_inside.bin")); +} + +TEST_F(PathValidationTest, ValidateExternalDataPathEmptyModelPathWithSymlinkOutside) { + // Test external data path validation when the model path is empty. + // Specifically tests that the following scenario is NOT valid: + // - A symbolic link within the current working directory pointing to a file outside CWD. + try { + std::filesystem::path cwd = std::filesystem::current_path(); + std::filesystem::path sub_dir = cwd / "symlink_test_subdir2"; + CreateDirectories(sub_dir); + + // Check if we can actually make a file outside of the current working directory (i.e., in a temp dir). + // This is only possible if the current working directory is NOT the same as the temp directory. + // Otherwise, we need to skip this test. This happens in Android CI. + auto [cwd_end, outside_end] = std::mismatch(cwd.begin(), cwd.end(), outside_dir_.begin(), outside_dir_.end()); + if (cwd_end == cwd.end()) { + GTEST_SKIP() << "Skipping test that needs to create a symlink outside of the cwd because the cwd is the same as " + << "the temp dir. cwd: " << cwd << " outside_dir_: " << outside_dir_; + } + + std::filesystem::path outside_target = outside_dir_ / "outside_for_empty_basedir.bin"; + std::filesystem::path symlink = sub_dir / "outside_link.bin"; + std::ofstream{outside_target}; + std::filesystem::create_symlink(outside_target, symlink); + } catch (const std::exception& e) { + GTEST_SKIP() << "Skipping test due to failure setting up directory and symlink files: " + << e.what(); + } + + Status status = utils::ValidateExternalDataPath("", "./symlink_test_subdir2/outside_link.bin"); + ASSERT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("escapes working directory")); } // Tests for ValidateEmbeddedTensorProtoDataSizeAndShape and embedded initializer size limits diff --git a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc index c0cd40ad95ad4..5d7eda39be271 100644 --- a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc @@ -697,33 +697,29 @@ TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits_Cuda) { RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider()); } -// Cast-aware DQ->MatMul fusion tests -// Pattern: DQ(int4->fp16) -> Cast(fp16->fp32) -> MatMul(fp32) -// The Cast between DQ and MatMul on input B should be handled by the -// DQCastMatMulToMatMulNBits selector-action pair. -// MatMulNBits always operates in the DQ scale dtype (fp16). -// The action always inserts Cast on input A and Cast on output. -// ORT's redundant cast elimination optimizer cleans up unnecessary casts. +// DQ(fp16) -> MatMul fusion test +// Pattern: DQ(int4, fp16_scale) -> MatMul(fp16) +// For FP16 models on CPU EP, CPU EP doesn't claim FP16 MatMul during partitioning +// (no FP16 MatMul kernel on CPU), so the node's EP is empty "". +// The DQ->MatMul fusion should still match and fuse to MatMulNBits. // -// Input1(fp32) DQ(int4->fp16) -// | | -// \ Cast(fp16->fp32) -// \ / -// MatMul(fp32) +// Input1(fp16) DQ(int4->fp16) +// \ / +// MatMul(fp16) // | -// output(fp32) +// output(fp16) // // After optimization: -// Input1(fp32) -> Cast(fp32->fp16) -> MatMulNBits(fp16) -> Cast(fp16->fp32) -> output(fp32) +// Input1(fp16) -> MatMulNBits(fp16) -> output(fp16) template typename std::enable_if || std::is_same_v, void>::type -RunDQCastMatMulConverted(const std::vector& input1_shape, +RunDQMatMulFP16Converted(const std::vector& input1_shape, const std::vector& weight_shape, const int64_t axis, const int64_t block_size, int64_t accuracy_level) { auto build_test_case = [&](ModelTestBuilder& builder) { - auto* input_arg = builder.MakeInput(input1_shape, -1.0f, 1.0f); + auto* input_arg = builder.MakeInput(input1_shape, MLFloat16(-1.0f), MLFloat16(1.0f)); auto* output_arg = builder.MakeOutput(); // DQ with fp16 scales @@ -745,24 +741,14 @@ RunDQCastMatMulConverted(const std::vector& input1_shape, builder.AddNode("DequantizeLinear", {weight_arg, scale_arg}, {dq_output}, "", &dq_attrs); } - // Cast fp16 -> fp32 - auto* cast_output = builder.MakeIntermediate(); - NodeAttributes cast_attrs; - utils::SetNodeAttribute(utils::MakeAttribute("to", - static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)), - cast_attrs); - builder.AddNode("Cast", {dq_output}, {cast_output}, "", &cast_attrs); - - // MatMul - builder.AddNode("MatMul", {input_arg, cast_output}, {output_arg}); + // MatMul (fp16) + builder.AddNode("MatMul", {input_arg, dq_output}, {output_arg}); }; auto check_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); const QDQOpKeys qdq_keys = GetQDQOpKeys(false); EXPECT_EQ(op_to_count["MatMul"], 0); - // B-side Cast removed. New Cast(fp32->fp16) on A and Cast(fp16->fp32) on output. - EXPECT_EQ(op_to_count["Cast"], 2); EXPECT_EQ(op_to_count["com.microsoft.MatMulNBits"], 1); EXPECT_EQ(op_to_count[qdq_keys.dequantize_linear], 0); }; @@ -786,12 +772,12 @@ RunDQCastMatMulConverted(const std::vector& input1_shape, add_session_options_fn); } -TEST(QDQTransformerTests, DQCastMatMulConvertedToMatMulNBits) { - // DQ(int4->fp16) -> Cast(fp16->fp32) -> MatMul should be fused to MatMulNBits - RunDQCastMatMulConverted({12, 32}, {32, 16}, 0, 16, 0); - RunDQCastMatMulConverted({12, 32}, {32, 16}, 0, 16, 0); - RunDQCastMatMulConverted({12, 32}, {32, 16}, 0, 16, 0); - RunDQCastMatMulConverted({12, 32}, {32, 16}, 0, 16, 0); +TEST(QDQTransformerTests, DQMatMulFP16ConvertedToMatMulNBits) { + // DQ(int4, fp16_scale) -> MatMul(fp16) should be fused to MatMulNBits + RunDQMatMulFP16Converted({12, 32}, {32, 16}, 0, 16, 0); + RunDQMatMulFP16Converted({12, 32}, {32, 16}, 0, 16, 0); + RunDQMatMulFP16Converted({12, 32}, {32, 16}, 0, 16, 0); + RunDQMatMulFP16Converted({12, 32}, {32, 16}, 0, 16, 0); } #endif // !defined(DISABLE_CONTRIB_OPS) diff --git a/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc b/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc index 1eeb3683bc9aa..c0efb993e7611 100644 --- a/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc +++ b/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. #include "gtest/gtest.h" +#include "test/common/tensor_op_test_utils.h" #include "test/providers/provider_test_utils.h" #include "test/util/include/default_providers.h" #include "test/common/trt_op_test_utils.h" @@ -906,5 +907,88 @@ TEST(RoiAlignTest, BatchIndicesNegative_CUDA) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); #endif } + +TEST(RoiAlignTest, Float16_Opset16) { + auto cuda_ep = DefaultCudaExecutionProvider(); + if (cuda_ep.get() == nullptr) { + GTEST_SKIP() << "Skipping because there is no CUDA execution provider available."; + } + + OpTester test("RoiAlign", 16); + test.AddAttribute("output_height", 3); + test.AddAttribute("output_width", 4); + test.AddAttribute("sampling_ratio", 2); + test.AddAttribute("spatial_scale", 1.0f / 16.0f); + + constexpr int N = 1; + constexpr int C = 1; + constexpr int H = 5; + constexpr int W = 5; + + test.AddInput("X", {N, C, H, W}, ToFloat16({0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.})); + test.AddInput("rois", {1, 4}, ToFloat16({0., 0., 4., 4.})); + test.AddInput("batch_indices", {1}, {0}); + // Values calculated manually or from a known good run + test.AddOutput("Y", {1, 1, 3, 4}, ToFloat16({0.6665f, 1.333f, 2.0f, 2.666f, 4.0f, 4.668f, 5.332f, 6.0f, 7.332f, 8.0f, 8.664f, 9.336f})); + + std::vector> execution_providers; + execution_providers.push_back(std::move(cuda_ep)); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(RoiAlignTest, Float16_Opset22) { + auto cuda_ep = DefaultCudaExecutionProvider(); + if (cuda_ep.get() == nullptr) { + GTEST_SKIP() << "Skipping because there is no CUDA execution provider available."; + } + + OpTester test("RoiAlign", 22); + test.AddAttribute("output_height", 3); + test.AddAttribute("output_width", 4); + test.AddAttribute("sampling_ratio", 2); + test.AddAttribute("spatial_scale", 1.0f / 16.0f); + + constexpr int N = 1; + constexpr int C = 1; + constexpr int H = 5; + constexpr int W = 5; + + test.AddInput("X", {N, C, H, W}, ToFloat16({0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.})); + test.AddInput("rois", {1, 4}, ToFloat16({0., 0., 4., 4.})); + test.AddInput("batch_indices", {1}, {0}); + test.AddOutput("Y", {1, 1, 3, 4}, ToFloat16({0.6665f, 1.333f, 2.0f, 2.666f, 4.0f, 4.668f, 5.332f, 6.0f, 7.332f, 8.0f, 8.664f, 9.336f})); + + std::vector> execution_providers; + execution_providers.push_back(std::move(cuda_ep)); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(RoiAlignTest, BFloat16_Opset22) { + auto cuda_ep = DefaultCudaExecutionProvider(); + if (cuda_ep.get() == nullptr) { + GTEST_SKIP() << "Skipping because there is no CUDA execution provider available."; + } + + OpTester test("RoiAlign", 22); + test.AddAttribute("output_height", 3); + test.AddAttribute("output_width", 4); + test.AddAttribute("sampling_ratio", 2); + test.AddAttribute("spatial_scale", 1.0f / 16.0f); + + constexpr int N = 1; + constexpr int C = 1; + constexpr int H = 5; + constexpr int W = 5; + + test.AddInput("X", {N, C, H, W}, ToBFloat16({0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.})); + test.AddInput("rois", {1, 4}, ToBFloat16({0., 0., 4., 4.})); + test.AddInput("batch_indices", {1}, {0}); + test.AddOutput("Y", {1, 1, 3, 4}, ToBFloat16({0.6665f, 1.333f, 2.0f, 2.666f, 4.0f, 4.668f, 5.332f, 6.0f, 7.332f, 8.0f, 8.664f, 9.336f})); + + std::vector> execution_providers; + execution_providers.push_back(std::move(cuda_ep)); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test_custom.inc b/onnxruntime/test/providers/cpu/tensor/grid_sample_test_custom.inc index 7fd22cc59745f..2423d7f120b20 100644 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test_custom.inc +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test_custom.inc @@ -37,7 +37,7 @@ TYPED_TEST(GridSampleCustomTest, test_grid_sample_20_4D_linear_zeros_mixed_bound test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - RunTests(test, GetExecutionProviders(20)); + RunTests(test, GetExecutionProviders()); } TYPED_TEST(GridSampleCustomTest, test_grid_sample_20_4D_linear_zeros_mixed_bounds_left_top) { @@ -69,6 +69,5 @@ TYPED_TEST(GridSampleCustomTest, test_grid_sample_20_4D_linear_zeros_mixed_bound test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - RunTests(test, GetExecutionProviders(20)); + RunTests(test, GetExecutionProviders()); } - diff --git a/onnxruntime/test/providers/nv_tensorrt_rtx/nv_external_resource_importer_test.cc b/onnxruntime/test/providers/nv_tensorrt_rtx/nv_external_resource_importer_test.cc index c7fccb086ea19..232477c96b4f4 100644 --- a/onnxruntime/test/providers/nv_tensorrt_rtx/nv_external_resource_importer_test.cc +++ b/onnxruntime/test/providers/nv_tensorrt_rtx/nv_external_resource_importer_test.cc @@ -1082,7 +1082,7 @@ TEST_F(NvExecutionProviderExternalResourceImporterTest, FullInferenceWithExterna // Configure to use our CUDA stream char stream_address[32]; size_t stream_addr_val = reinterpret_cast(ort_api_->SyncStream_GetHandle(ort_stream)); - sprintf(stream_address, "%llu", static_cast(stream_addr_val)); + sprintf_s(stream_address, "%llu", static_cast(stream_addr_val)); const char* option_keys[] = { // TODO we should no longer require to set the compute stream at this point but there are too many cudaSetDevice calls from allocators and stream handling (NVBUG 5822116) onnxruntime::nv::provider_option_names::kUserComputeStream, @@ -1095,7 +1095,7 @@ TEST_F(NvExecutionProviderExternalResourceImporterTest, FullInferenceWithExterna }; char aux_stream_address[32]; size_t aux_streams[] = {stream_addr_val}; - sprintf(aux_stream_address, "%llu", reinterpret_cast(aux_streams)); + sprintf_s(aux_stream_address, "%llu", reinterpret_cast(aux_streams)); std::string max_shared_mem_size = std::to_string(1024 * 28); // 28 KiB const char* option_values[] = { stream_address, diff --git a/onnxruntime/test/python/onnxruntime_test_python_ep_compatibility.py b/onnxruntime/test/python/onnxruntime_test_python_ep_compatibility.py index 8e69fdf088103..8d430ca3f4952 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_ep_compatibility.py +++ b/onnxruntime/test/python/onnxruntime_test_python_ep_compatibility.py @@ -4,10 +4,15 @@ import os import platform import sys +import tempfile import unittest +import onnx + from onnxruntime.capi.onnxruntime_pybind11_state import ( OrtCompiledModelCompatibility, + get_compatibility_info_from_model, + get_compatibility_info_from_model_bytes, get_ep_devices, get_model_compatibility_for_ep_devices, ) @@ -17,6 +22,21 @@ os.add_dll_directory(os.getcwd()) +def _create_model_with_compatibility_metadata(ep_compatibility_info=None): + """Create a minimal valid ONNX model with optional compatibility metadata.""" + graph = onnx.helper.make_graph([], "test_graph", [], []) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + + if ep_compatibility_info: + for ep_type, compat_info in ep_compatibility_info.items(): + entry = onnx.StringStringEntryProto() + entry.key = f"ep_compatibility_info.{ep_type}" + entry.value = compat_info + model.metadata_props.append(entry) + + return model.SerializeToString() + + class TestEpCompatibility(unittest.TestCase): def test_invalid_args(self): # empty devices @@ -41,6 +61,63 @@ def test_basic_smoke(self): status = get_model_compatibility_for_ep_devices(selected, "arbitrary-compat-string") self.assertEqual(status, OrtCompiledModelCompatibility.EP_NOT_APPLICABLE) + def test_get_compatibility_info_from_model_bytes_with_metadata(self): + ep_type = "TestCompatEP" + expected_compat_info = "test_compat_v1.0_driver_123" + model_data = _create_model_with_compatibility_metadata({ep_type: expected_compat_info}) + + result = get_compatibility_info_from_model_bytes(model_data, ep_type) + self.assertIsNotNone(result) + self.assertEqual(result, expected_compat_info) + + def test_get_compatibility_info_from_model_bytes_not_found(self): + model_data = _create_model_with_compatibility_metadata({"DifferentEP": "some_value"}) + + result = get_compatibility_info_from_model_bytes(model_data, "NonExistentEP") + self.assertIsNone(result) + + def test_get_compatibility_info_from_model_bytes_no_metadata(self): + model_data = _create_model_with_compatibility_metadata() + + result = get_compatibility_info_from_model_bytes(model_data, "AnyEP") + self.assertIsNone(result) + + def test_get_compatibility_info_from_model_bytes_invalid_data(self): + with self.assertRaises(RuntimeError): + get_compatibility_info_from_model_bytes(b"this is not a valid ONNX model", "TestEP") + + def test_get_compatibility_info_from_model_bytes_invalid_args(self): + with self.assertRaises(RuntimeError): + get_compatibility_info_from_model_bytes(b"", "TestEP") + with self.assertRaises(RuntimeError): + get_compatibility_info_from_model_bytes(b"data", "") + + def test_get_compatibility_info_from_model_file_with_metadata(self): + ep_type = "TestCompatEP" + expected_compat_info = "file_compat_v2.0" + model_data = _create_model_with_compatibility_metadata({ep_type: expected_compat_info}) + + with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f: + f.write(model_data) + model_path = f.name + + try: + result = get_compatibility_info_from_model(model_path, ep_type) + self.assertIsNotNone(result) + self.assertEqual(result, expected_compat_info) + finally: + os.unlink(model_path) + + def test_get_compatibility_info_from_model_file_not_found(self): + with self.assertRaises(RuntimeError): + get_compatibility_info_from_model("nonexistent_model_path.onnx", "TestEP") + + def test_get_compatibility_info_from_model_invalid_args(self): + with self.assertRaises(RuntimeError): + get_compatibility_info_from_model("", "TestEP") + with self.assertRaises(RuntimeError): + get_compatibility_info_from_model("model.onnx", "") + if __name__ == "__main__": unittest.main() diff --git a/onnxruntime/test/python/transformers/test_attention_fusion.py b/onnxruntime/test/python/transformers/test_attention_fusion.py index d25432173a8f0..caaaa1aa628cf 100644 --- a/onnxruntime/test/python/transformers/test_attention_fusion.py +++ b/onnxruntime/test/python/transformers/test_attention_fusion.py @@ -395,17 +395,17 @@ def test_qwen3_normalization_fusion(self): ssln_count = sum(1 for n in nodes if n.op_type == "SkipSimplifiedLayerNormalization") # 4 RMSNorm patterns: pre-attn, Q-norm, K-norm, post-attn. - # Post-attn RMSNorm has an Add parent (residual) → fused as SkipSimplifiedLayerNormalization. - # Remaining 3 stay as SimplifiedLayerNormalization. + # Fallback for SkipLayerNormalization is disabled, so post-attn RMSNorm does not fuse. + # All 4 stay as SimplifiedLayerNormalization. self.assertEqual( sln_count, - 3, - f"Expected 3 SimplifiedLayerNormalization (pre-attn + Q-norm + K-norm), got {sln_count}", + 4, + f"Expected 4 SimplifiedLayerNormalization (pre-attn + Q-norm + K-norm + post-attn), got {sln_count}", ) self.assertEqual( ssln_count, - 1, - f"Expected 1 SkipSimplifiedLayerNormalization (residual + post-attn RMSNorm), got {ssln_count}", + 0, + f"Expected 0 SkipSimplifiedLayerNormalization (residual + post-attn RMSNorm failed to fuse), got {ssln_count}", ) diff --git a/onnxruntime/test/shared_lib/test_allocator.cc b/onnxruntime/test/shared_lib/test_allocator.cc index bf9e54e8b3c7b..c80712e272ad2 100644 --- a/onnxruntime/test/shared_lib/test_allocator.cc +++ b/onnxruntime/test/shared_lib/test_allocator.cc @@ -22,6 +22,27 @@ TEST(CApiTest, allocation_info) { ASSERT_EQ(OrtMemTypeDefault, cpu_mem_info_1.GetMemoryType()); } +// Verify that legacy (pre-1.25) memory info names "WebGPU_Buffer" and "WebNN_Tensor" are accepted +// and normalized to the current short names "WebGPU_Buf" and "WebNN_Ten". +// This ensures backward compatibility with released onnxruntime-genai that uses the old names. +TEST(CApiTest, LegacyWebGpuWebNNMemoryInfoNames) { + // Old (pre-1.25) names must be accepted + Ort::MemoryInfo legacy_webgpu("WebGPU_Buffer", OrtDeviceAllocator, 0, OrtMemTypeDefault); + Ort::MemoryInfo legacy_webnn("WebNN_Tensor", OrtDeviceAllocator, 0, OrtMemTypeDefault); + + // Current (short) names + Ort::MemoryInfo current_webgpu("WebGPU_Buf", OrtDeviceAllocator, 0, OrtMemTypeDefault); + Ort::MemoryInfo current_webnn("WebNN_Ten", OrtDeviceAllocator, 0, OrtMemTypeDefault); + + // Legacy names should be normalized to the current names + ASSERT_EQ(std::string("WebGPU_Buf"), legacy_webgpu.GetAllocatorName()); + ASSERT_EQ(std::string("WebNN_Ten"), legacy_webnn.GetAllocatorName()); + + // Memory infos created with legacy and current names should be equal + ASSERT_EQ(legacy_webgpu, current_webgpu); + ASSERT_EQ(legacy_webnn, current_webnn); +} + TEST(CApiTest, DefaultAllocator) { Ort::AllocatorWithDefaultOptions default_allocator; auto cpu_info = default_allocator.GetInfo(); diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index e472cbcee12d6..14cf38eeb5afd 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -5000,8 +5000,7 @@ TEST(CApiTest, InMemoryModel_SessionConfigExternalFileFolder_ExternalDataOutside // Verify that the exception message indicates security or external data issues EXPECT_TRUE(exception_message.find("External data path") != std::string::npos && - exception_message.find("escapes both model directory") != std::string::npos && - exception_message.find("and real model directory") != std::string::npos) + exception_message.find("escapes model directory") != std::string::npos) << "Exception message should indicate external data or security issue. Got: " << exception_message; }