diff --git a/cmake/onnxruntime_providers_vitisai.cmake b/cmake/onnxruntime_providers_vitisai.cmake
index d40ae17e40545..d59c944c8926f 100644
--- a/cmake/onnxruntime_providers_vitisai.cmake
+++ b/cmake/onnxruntime_providers_vitisai.cmake
@@ -19,7 +19,16 @@
"${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
)
source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_vitisai_cc_srcs})
- onnxruntime_add_shared_library(onnxruntime_providers_vitisai ${onnxruntime_providers_vitisai_cc_srcs})
+ set(onnxruntime_providers_vitisai_all_srcs ${onnxruntime_providers_vitisai_cc_srcs})
+ if(WIN32)
+ # Sets the DLL version info on Windows: https://learn.microsoft.com/en-us/windows/win32/menurc/versioninfo-resource
+ list(APPEND onnxruntime_providers_vitisai_all_srcs "${ONNXRUNTIME_ROOT}/core/providers/vitisai/onnxruntime_providers_vitisai.rc")
+ endif()
+ onnxruntime_add_shared_library(onnxruntime_providers_vitisai ${onnxruntime_providers_vitisai_all_srcs})
+ if(WIN32)
+ # FILE_NAME preprocessor definition is used in onnxruntime_providers_vitisai.rc
+ target_compile_definitions(onnxruntime_providers_vitisai PRIVATE FILE_NAME=\"onnxruntime_providers_vitisai.dll\")
+ endif()
onnxruntime_add_include_to_target(onnxruntime_providers_vitisai ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} safeint_interface flatbuffers::flatbuffers Boost::mp11)
target_link_libraries(onnxruntime_providers_vitisai PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED} ${ABSEIL_LIBS})
if(MSVC)
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index d4e023b0f86a0..9ae3e79d86443 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -1786,7 +1786,7 @@ endif()
endif()
endif()
-if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
+if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT onnxruntime_CUDA_MINIMAL)
set(custom_op_src_patterns
"${TEST_SRC_DIR}/testdata/custom_op_library/*.h"
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
index 81d4f2589151b..fa1914f2a927b 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
@@ -452,6 +452,36 @@ public struct OrtApi
public IntPtr Graph_GetModelMetadata;
public IntPtr GetModelCompatibilityForEpDevices;
public IntPtr CreateExternalInitializerInfo;
+
+ // v1.24 APIs
+ public IntPtr TensorTypeAndShape_HasShape;
+ public IntPtr KernelInfo_GetConfigEntries;
+ public IntPtr KernelInfo_GetOperatorDomain;
+ public IntPtr KernelInfo_GetOperatorType;
+ public IntPtr KernelInfo_GetOperatorSinceVersion;
+ public IntPtr GetInteropApi;
+ public IntPtr SessionGetEpDeviceForOutputs;
+ public IntPtr GetNumHardwareDevices;
+ public IntPtr GetHardwareDevices;
+ public IntPtr GetHardwareDeviceEpIncompatibilityDetails;
+ public IntPtr DeviceEpIncompatibilityDetails_GetReasonsBitmask;
+ public IntPtr DeviceEpIncompatibilityDetails_GetNotes;
+ public IntPtr DeviceEpIncompatibilityDetails_GetErrorCode;
+ public IntPtr ReleaseDeviceEpIncompatibilityDetails;
+ public IntPtr GetCompatibilityInfoFromModel;
+ public IntPtr GetCompatibilityInfoFromModelBytes;
+ public IntPtr CreateEnvWithOptions;
+ public IntPtr Session_GetEpGraphAssignmentInfo;
+ public IntPtr EpAssignedSubgraph_GetEpName;
+ public IntPtr EpAssignedSubgraph_GetNodes;
+ public IntPtr EpAssignedNode_GetName;
+ public IntPtr EpAssignedNode_GetDomain;
+ public IntPtr EpAssignedNode_GetOperatorType;
+ public IntPtr RunOptionsSetSyncStream;
+ public IntPtr GetTensorElementTypeAndShapeDataReference;
+ // v1.25 APIs
+ public IntPtr RunOptionsEnableProfiling;
+ public IntPtr RunOptionsDisableProfiling;
}
internal static class NativeMethods
@@ -884,6 +914,16 @@ static NativeMethods()
(DOrtCopyTensors)Marshal.GetDelegateForFunctionPointer(
api_.CopyTensors,
typeof(DOrtCopyTensors));
+
+ OrtGetCompatibilityInfoFromModel =
+ (DOrtGetCompatibilityInfoFromModel)Marshal.GetDelegateForFunctionPointer(
+ api_.GetCompatibilityInfoFromModel,
+ typeof(DOrtGetCompatibilityInfoFromModel));
+
+ OrtGetCompatibilityInfoFromModelBytes =
+ (DOrtGetCompatibilityInfoFromModelBytes)Marshal.GetDelegateForFunctionPointer(
+ api_.GetCompatibilityInfoFromModelBytes,
+ typeof(DOrtGetCompatibilityInfoFromModelBytes));
}
internal class NativeLib
@@ -3092,6 +3132,31 @@ public delegate IntPtr DOrtEpSelectionDelegate(
public static DOrtReleasePrepackedWeightsContainer OrtReleasePrepackedWeightsContainer;
+ /// <summary>
+ /// Extract EP compatibility info from a precompiled model file.
+ /// </summary>
+ [UnmanagedFunctionPointer(CallingConvention.Winapi)]
+ public delegate IntPtr /* OrtStatus* */ DOrtGetCompatibilityInfoFromModel(
+ byte[] /* const ORTCHAR_T* */ model_path,
+ byte[] /* const char* */ ep_type,
+ IntPtr /* OrtAllocator* */ allocator,
+ out IntPtr /* char** */ compatibility_info);
+
+ public static DOrtGetCompatibilityInfoFromModel OrtGetCompatibilityInfoFromModel;
+
+ /// <summary>
+ /// Extract EP compatibility info from precompiled model bytes in memory.
+ /// </summary>
+ [UnmanagedFunctionPointer(CallingConvention.Winapi)]
+ public delegate IntPtr /* OrtStatus* */ DOrtGetCompatibilityInfoFromModelBytes(
+ byte[] /* const void* */ model_data,
+ UIntPtr /* size_t */ model_data_length,
+ byte[] /* const char* */ ep_type,
+ IntPtr /* OrtAllocator* */ allocator,
+ out IntPtr /* char** */ compatibility_info);
+
+ public static DOrtGetCompatibilityInfoFromModelBytes OrtGetCompatibilityInfoFromModelBytes;
+
#endregion
} // class NativeMethods
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs
index 22f541e2207fa..0876db3f21209 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs
@@ -524,6 +524,75 @@ public OrtCompiledModelCompatibility GetModelCompatibilityForEpDevices(
return (OrtCompiledModelCompatibility)status;
}
+ /// <summary>
+ /// Extract EP compatibility info from a precompiled model file.
+ /// </summary>
+ /// <remarks>
+ /// Parses the model file to extract the compatibility info string for a specific execution provider
+ /// from the model's metadata properties. This is only applicable to models that have been precompiled
+ /// for an EP. Standard ONNX models do not contain this information.
+ /// The compatibility info can then be passed to <see cref="GetModelCompatibilityForEpDevices"/> to
+ /// check if a precompiled model is compatible with the current system.
+ /// </remarks>
+ /// <param name="modelPath">Path to the ONNX model file.</param>
+ /// <param name="epType">The execution provider type string. Use <see cref="OrtEpDevice.EpName"/> to get this value.</param>
+ /// <returns>The compatibility info string, or null if no compatibility info exists for the specified EP.</returns>
+ /// <exception cref="ArgumentException">If modelPath or epType is null or empty.</exception>
+ /// <exception cref="OnnxRuntimeException">If the model file cannot be read or parsed.</exception>
+ public string GetCompatibilityInfoFromModel(string modelPath, string epType)
+ {
+ if (string.IsNullOrEmpty(modelPath))
+ throw new ArgumentException("modelPath must be non-empty", nameof(modelPath));
+ if (string.IsNullOrEmpty(epType))
+ throw new ArgumentException("epType must be non-empty", nameof(epType));
+
+ var allocator = OrtAllocator.DefaultInstance;
+ var pathBytes = NativeOnnxValueHelper.GetPlatformSerializedString(modelPath);
+ var epTypeUtf8 = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(epType);
+
+ NativeApiStatus.VerifySuccess(
+ NativeMethods.OrtGetCompatibilityInfoFromModel(
+ pathBytes, epTypeUtf8, allocator.Pointer, out IntPtr compatInfoPtr));
+
+ if (compatInfoPtr == IntPtr.Zero)
+ return null;
+
+ return NativeOnnxValueHelper.StringFromNativeUtf8(compatInfoPtr, allocator);
+ }
+
+ /// <summary>
+ /// Extract EP compatibility info from precompiled model bytes in memory.
+ /// </summary>
+ /// <remarks>
+ /// Same as <see cref="GetCompatibilityInfoFromModel"/> but reads from a memory buffer instead of a file.
+ /// Useful when precompiled models are loaded from encrypted storage, network, or other non-file sources.
+ /// </remarks>
+ /// <param name="modelData">The model data bytes.</param>
+ /// <param name="epType">The execution provider type string. Use <see cref="OrtEpDevice.EpName"/> to get this value.</param>
+ /// <returns>The compatibility info string, or null if no compatibility info exists for the specified EP.</returns>
+ /// <exception cref="ArgumentException">If modelData is null/empty or epType is null or empty.</exception>
+ /// <exception cref="OnnxRuntimeException">If the model data cannot be parsed.</exception>
+ public string GetCompatibilityInfoFromModelBytes(byte[] modelData, string epType)
+ {
+ if (modelData == null || modelData.Length == 0)
+ throw new ArgumentException("modelData must be non-empty", nameof(modelData));
+ if (string.IsNullOrEmpty(epType))
+ throw new ArgumentException("epType must be non-empty", nameof(epType));
+
+ var allocator = OrtAllocator.DefaultInstance;
+ var epTypeUtf8 = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(epType);
+
+ NativeApiStatus.VerifySuccess(
+ NativeMethods.OrtGetCompatibilityInfoFromModelBytes(
+ modelData, (UIntPtr)modelData.Length, epTypeUtf8,
+ allocator.Pointer, out IntPtr compatInfoPtr));
+
+ if (compatInfoPtr == IntPtr.Zero)
+ return null;
+
+ return NativeOnnxValueHelper.StringFromNativeUtf8(compatInfoPtr, allocator);
+ }
+
///
/// Get/Set log level property of OrtEnv instance
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/EpCompatibilityTests.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/EpCompatibilityTests.cs
index 103fe5bc10106..f1b792454f205 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/EpCompatibilityTests.cs
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/EpCompatibilityTests.cs
@@ -10,6 +10,8 @@ namespace Microsoft.ML.OnnxRuntime.Tests;
using System.Linq;
using Xunit;
using System.Collections.Generic;
+using Google.Protobuf;
+using Onnx;
public class EpCompatibilityTests
{
@@ -23,6 +25,35 @@ private IReadOnlyList GetDevices()
return epDevices;
}
+ /// <summary>
+ /// Creates a minimal valid ONNX ModelProto with optional compatibility metadata.
+ /// </summary>
+ private static byte[] CreateModelWithCompatibilityMetadata(
+ Dictionary<string, string> epCompatibilityInfo = null)
+ {
+ var modelProto = new ModelProto();
+ modelProto.IrVersion = (long)Onnx.Version.IrVersion;
+ modelProto.Graph = new GraphProto { Name = "test_graph" };
+
+ var opset = new OperatorSetIdProto();
+ opset.Domain = "";
+ opset.Version = 13;
+ modelProto.OpsetImport.Add(opset);
+
+ if (epCompatibilityInfo != null)
+ {
+ foreach (var kvp in epCompatibilityInfo)
+ {
+ var prop = new StringStringEntryProto();
+ prop.Key = "ep_compatibility_info." + kvp.Key;
+ prop.Value = kvp.Value;
+ modelProto.MetadataProps.Add(prop);
+ }
+ }
+
+ return modelProto.ToByteArray();
+ }
+
[Fact]
public void GetEpCompatibility_InvalidArgs()
{
@@ -45,5 +76,103 @@ public void GetEpCompatibility_SingleDeviceCpuProvider()
// CPU defaults to not applicable in this scenario
Assert.Equal(OrtCompiledModelCompatibility.EP_NOT_APPLICABLE, status);
}
+
+ [Fact]
+ public void GetCompatibilityInfoFromModel_InvalidArgs()
+ {
+ Assert.Throws<ArgumentException>(() => ortEnvInstance.GetCompatibilityInfoFromModel(null, "TestEP"));
+ Assert.Throws<ArgumentException>(() => ortEnvInstance.GetCompatibilityInfoFromModel("", "TestEP"));
+ Assert.Throws<ArgumentException>(() => ortEnvInstance.GetCompatibilityInfoFromModel("model.onnx", null));
+ Assert.Throws<ArgumentException>(() => ortEnvInstance.GetCompatibilityInfoFromModel("model.onnx", ""));
+ }
+
+ [Fact]
+ public void GetCompatibilityInfoFromModel_FileNotFound()
+ {
+ Assert.Throws<OnnxRuntimeException>(
+ () => ortEnvInstance.GetCompatibilityInfoFromModel("nonexistent_model_path.onnx", "TestEP"));
+ }
+
+ [Fact]
+ public void GetCompatibilityInfoFromModelBytes_InvalidArgs()
+ {
+ Assert.Throws<ArgumentException>(() => ortEnvInstance.GetCompatibilityInfoFromModelBytes(null, "TestEP"));
+ Assert.Throws<ArgumentException>(() => ortEnvInstance.GetCompatibilityInfoFromModelBytes(new byte[0], "TestEP"));
+ Assert.Throws<ArgumentException>(() => ortEnvInstance.GetCompatibilityInfoFromModelBytes(new byte[] { 1, 2, 3 }, null));
+ Assert.Throws<ArgumentException>(() => ortEnvInstance.GetCompatibilityInfoFromModelBytes(new byte[] { 1, 2, 3 }, ""));
+ }
+
+ [Fact]
+ public void GetCompatibilityInfoFromModel_WithMetadata()
+ {
+ const string epType = "TestCompatEP";
+ const string expectedCompatInfo = "test_compat_v1.0_driver_123";
+
+ byte[] modelData = CreateModelWithCompatibilityMetadata(
+ new Dictionary<string, string> { { epType, expectedCompatInfo } });
+
+ string tempModelPath = System.IO.Path.Combine(
+ System.IO.Path.GetTempPath(),
+ System.IO.Path.GetRandomFileName() + ".onnx");
+
+ System.IO.File.WriteAllBytes(tempModelPath, modelData);
+
+ try
+ {
+ string result = ortEnvInstance.GetCompatibilityInfoFromModel(tempModelPath, epType);
+ Assert.NotNull(result);
+ Assert.Equal(expectedCompatInfo, result);
+ }
+ finally
+ {
+ if (System.IO.File.Exists(tempModelPath))
+ {
+ System.IO.File.Delete(tempModelPath);
+ }
+ }
+ }
+
+ [Fact]
+ public void GetCompatibilityInfoFromModelBytes_InvalidModelData()
+ {
+ byte[] invalidData = System.Text.Encoding.UTF8.GetBytes("this is not a valid ONNX model");
+ Assert.Throws<OnnxRuntimeException>(
+ () => ortEnvInstance.GetCompatibilityInfoFromModelBytes(invalidData, "TestEP"));
+ }
+
+ [Fact]
+ public void GetCompatibilityInfoFromModelBytes_WithMetadata()
+ {
+ const string epType = "TestCompatEP";
+ const string expectedCompatInfo = "test_compat_v1.0_driver_123";
+
+ byte[] modelData = CreateModelWithCompatibilityMetadata(
+ new Dictionary<string, string> { { epType, expectedCompatInfo } });
+
+ string result = ortEnvInstance.GetCompatibilityInfoFromModelBytes(modelData, epType);
+ Assert.NotNull(result);
+ Assert.Equal(expectedCompatInfo, result);
+ }
+
+ [Fact]
+ public void GetCompatibilityInfoFromModelBytes_NotFound()
+ {
+ // Create model with metadata for a different EP
+ byte[] modelData = CreateModelWithCompatibilityMetadata(
+ new Dictionary<string, string> { { "DifferentEP", "some_value" } });
+
+ string result = ortEnvInstance.GetCompatibilityInfoFromModelBytes(modelData, "NonExistentEP");
+ Assert.Null(result);
+ }
+
+ [Fact]
+ public void GetCompatibilityInfoFromModelBytes_NoMetadata()
+ {
+ // Create model without any compatibility metadata
+ byte[] modelData = CreateModelWithCompatibilityMetadata();
+
+ string result = ortEnvInstance.GetCompatibilityInfoFromModelBytes(modelData, "AnyEP");
+ Assert.Null(result);
+ }
}
#endif
diff --git a/js/react_native/android/CMakeLists.txt b/js/react_native/android/CMakeLists.txt
index 0bcf552ff9e41..a23f5ba7cd8ab 100644
--- a/js/react_native/android/CMakeLists.txt
+++ b/js/react_native/android/CMakeLists.txt
@@ -1,5 +1,5 @@
+cmake_minimum_required(VERSION 3.13)
project(OnnxruntimeJSI)
-cmake_minimum_required(VERSION 3.9.0)
set(PACKAGE_NAME "onnxruntime-react-native")
set(BUILD_DIR ${CMAKE_SOURCE_DIR}/build)
@@ -97,3 +97,6 @@ target_link_libraries(
${log-lib} # <-- Logcat logger
android # <-- Android JNI core
)
+
+# 16KB page size support (Android 15+ requirement)
+target_link_options(onnxruntimejsi PRIVATE "-Wl,-z,max-page-size=16384")
diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_base.cc b/onnxruntime/contrib_ops/cpu/bert/attention_base.cc
deleted file mode 100644
index 651f270230a75..0000000000000
--- a/onnxruntime/contrib_ops/cpu/bert/attention_base.cc
+++ /dev/null
@@ -1,341 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "contrib_ops/cpu/bert/attention_base.h"
-#include "contrib_ops/cpu/bert/multihead_attention_helper.h"
-#include "core/providers/common.h"
-
-namespace onnxruntime {
-namespace contrib {
-
-Status AttentionBase::CheckInputs(const TensorShape& input_shape,
- const TensorShape& weights_shape,
- const TensorShape& bias_shape,
- const Tensor*& mask_index,
- const Tensor* past,
- const Tensor* attention_bias,
- void* parameters,
- const Tensor* past_seq_len) const {
- // Abbreviation and Meanings:
- // B: batch_size
- // S: sequence_length (input sequence length of query)
- // P: past_sequence_length (past sequence length of key or value)
- // L: kv_sequence_length (input sequence length of key or value)
- // M: max_sequence_length
- // T: total_sequence_length = past_sequence_length + kv_sequence_length
- // N: num_heads
- // H: head size for Q and K, aka q_head_size or k_head_size or qk_head_size
- // H_v: v_head_size
- // D_i: input hidden size
- // D: hidden size for Q and K (D = N * H), aka q_hidden_size or k_hidden_size or qk_hidden_size
- // D_v: v_hidden_size = num_heads * v_head_size
-
- // When past state is used, Q, K and V should have same hidden size (unless we split it into past_key and past_value).
-
- // Input shapes:
- // input (Q/K/V) : (B, S, D_i)
- // weights (Q/K/V) : (D_i, D + D + D_v)
- // bias (Q/K/V) : (D + D + D_v)
- // mask_index : see below
- // past (K/V) : (2, B, N, P, H) or NULL
- // attention_bias : (B or 1, N or 1, S, T) or NULL
-
- // For mask_index, the following shapes are supported:
- // NULL, (B, 1), (1, 1)
- // (B), (2 * B), (3 * B + 2)
- // (B, T)
- // (B, S, T)
- // (B, 1, M, M)
- //
- // When a model is pruned (like some attention heads are removed in Q/K/V), input_hidden_size could be larger
- // than hidden dimension of Q, K and V.
-
- if (past != nullptr && attention_bias != nullptr) {
- // past is used on GPT-2 model with past state, we don't have a case for attention bias yet
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Attention cannot have both past and attention_bias");
- }
-
- const auto& dims = input_shape.GetDims();
- if (dims.size() != 3) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'input' is expected to have 3 dimensions, got ",
- dims.size());
- }
-
- auto& batch_size = dims[0];
- auto& sequence_length = dims[1];
- int64_t input_hidden_size = dims[2];
-
- const auto& bias_dims = bias_shape.GetDims();
- if (bias_dims.size() != 1) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'bias' is expected to have 1 dimension, got ",
- bias_dims.size());
- }
-
- const auto& weights_dims = weights_shape.GetDims();
- if (weights_dims.size() != 2) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'weights' is expected to have 2 dimensions, got ",
- weights_dims.size());
- }
- if (weights_dims[0] != input_hidden_size) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 1 dimension 0 should have same length as dimension 2 of input 0");
- }
-
- if (bias_dims[0] != weights_dims[1]) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'bias' dimension 0 should have same length as dimension 1 of input 'weights'");
- }
-
- int64_t q_hidden_size = bias_dims[0] / static_cast(3);
- int64_t k_hidden_size = q_hidden_size;
- int64_t v_hidden_size = k_hidden_size;
- if (qkv_hidden_sizes_.size() != 0) {
- if (qkv_hidden_sizes_.size() != 3) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "qkv_hidden_sizes attribute should have 3 elements");
- }
-
- for (size_t i = 0; i < qkv_hidden_sizes_.size(); i++) {
- if (qkv_hidden_sizes_[i] % num_heads_ != 0) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "hidden_size should be divisible by num_heads:", qkv_hidden_sizes_[i]);
- }
- }
-
- q_hidden_size = qkv_hidden_sizes_[0];
- k_hidden_size = qkv_hidden_sizes_[1];
- v_hidden_size = qkv_hidden_sizes_[2];
- }
-
- int64_t kv_sequence_length = sequence_length;
-
- if (q_hidden_size != k_hidden_size) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "qkv_hidden_sizes first element should be same as the second");
- }
-
- if (this->require_same_hidden_size_ && k_hidden_size != v_hidden_size) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Hidden size of Q, K and V shall be same");
- }
-
- if (bias_dims[0] != q_hidden_size + k_hidden_size + v_hidden_size) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'bias' dimension 0 should have same length as sum of Q/K/V hidden sizes:",
- " q_hidden_size=", q_hidden_size, " k_hidden_size=", k_hidden_size, " v_hidden_size=",
- v_hidden_size, "bias_dims[0]=", bias_dims[0]);
- }
-
- int64_t past_sequence_length = 0;
- if (past != nullptr) { // past is optional
- if (k_hidden_size != v_hidden_size) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'past' expect k_hidden_size == v_hidden_size");
- }
-
- const auto& past_dims = past->Shape().GetDims();
- if (past_dims.size() != 5) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'past' is expected to have 5 dimension, got ",
- past_dims.size());
- }
-
- if (past_dims[0] != 2) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'past' dimension 0 shall have length of 2");
- }
-
- if (past_dims[1] != batch_size) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Inputs 'past' dimension 1 shall have same length as dimension 0 of input 0");
- }
-
- if (static_cast(past_dims[2]) != num_heads_) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Inputs 'past' dimension 2 shall have length of num_heads", num_heads_);
- }
-
- if (static_cast(past_dims[4]) != k_hidden_size / num_heads_) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Inputs 'past' dimension 2 shall have length of ", k_hidden_size / num_heads_);
- }
-
- if (!past_present_share_buffer_) {
- past_sequence_length = past_dims[3];
- } else {
- if (past_seq_len == nullptr || !onnxruntime::IsScalarOr1ElementVector(past_seq_len)) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "past_sequence_length tensor must be of one element when past_present_share_buffer is set");
- }
- past_sequence_length = *past_seq_len->Data();
- }
- }
-
- int64_t total_sequence_length = kv_sequence_length + past_sequence_length;
- if (past != nullptr && past_present_share_buffer_) {
- const auto& past_dims = past->Shape().GetDims();
- if (past_dims[3] < total_sequence_length) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "when past_present_share_buffer, past tensor sequence must not smaller than total_sequqnce_length ");
- }
- }
-
- int64_t max_sequence_length = -1;
- AttentionMaskType mask_type = AttentionMaskType::MASK_NONE;
- if (mask_index != nullptr) { // mask_index is optional
- mask_type = AttentionMaskType::MASK_UNKNOWN;
- auto status = this->CheckMask(mask_index, mask_type,
- max_sequence_length, batch_size, sequence_length, total_sequence_length);
- if (status != Status::OK()) {
- return status;
- }
-
- if (mask_type == AttentionMaskType::MASK_2D_DUMMY) {
- mask_index = nullptr;
- mask_type = AttentionMaskType::MASK_NONE;
- }
- }
-
- gsl::span attention_bias_dims;
- if (attention_bias != nullptr) {
- attention_bias_dims = attention_bias->Shape().GetDims();
-
- ORT_RETURN_IF_ERROR(multihead_attention_helper::CheckAttentionBias(
- attention_bias_dims, batch_size, num_heads_, sequence_length, total_sequence_length));
- }
-
- if (past != nullptr && past_present_share_buffer_) {
- if (max_sequence_length <= 0) {
- max_sequence_length = past->Shape().GetDims()[3];
- }
- if (max_sequence_length != past->Shape().GetDims()[3]) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "max_sequence_length not matching from mask and past when past_present_share_buffer_ is set");
- }
- }
-
- if (parameters != nullptr) {
- AttentionParameters* output_parameters = reinterpret_cast(parameters);
- output_parameters->batch_size = static_cast(batch_size);
- output_parameters->sequence_length = static_cast(sequence_length);
- output_parameters->past_sequence_length = static_cast(past_sequence_length);
- output_parameters->kv_sequence_length = static_cast(kv_sequence_length);
- output_parameters->total_sequence_length = static_cast(total_sequence_length);
- output_parameters->max_sequence_length = static_cast(max_sequence_length);
- output_parameters->input_hidden_size = static_cast(input_hidden_size);
- output_parameters->hidden_size = static_cast(q_hidden_size);
- output_parameters->v_hidden_size = static_cast(v_hidden_size);
- output_parameters->head_size = static_cast(q_hidden_size) / num_heads_;
- output_parameters->v_head_size = static_cast(v_hidden_size) / num_heads_;
- output_parameters->num_heads = num_heads_;
- output_parameters->is_unidirectional = is_unidirectional_;
- output_parameters->past_present_share_buffer = (past_present_share_buffer_ != 0 && past != nullptr);
- output_parameters->do_rotary = do_rotary_;
- output_parameters->rotary_dim = rotary_embedding_ == 0 ? (int)(output_parameters->head_size) : rotary_embedding_;
- output_parameters->mask_filter_value = mask_filter_value_;
- output_parameters->scale = scale_;
- output_parameters->mask_type = mask_type;
- output_parameters->broadcast_attn_bias_dim_0 = attention_bias_dims.size() > 0 && attention_bias_dims[0] == 1;
- output_parameters->broadcast_attn_bias_dim_1 = attention_bias_dims.size() > 1 && attention_bias_dims[1] == 1;
- output_parameters->qkv_format = Q_K_V_BNSH;
- }
-
- return Status::OK();
-}
-
-Status AttentionBase::CheckMask(const Tensor* mask_index,
- AttentionMaskType& mask_type,
- int64_t& max_sequence_length,
- int64_t batch_size,
- int64_t sequence_length,
- int64_t total_sequence_length) const {
- const auto& mask_dims = mask_index->Shape().GetDims();
- if (mask_dims.size() == 1) {
- if (mask_dims[0] != batch_size && mask_dims[0] != 2 * batch_size && mask_dims[0] != 3 * batch_size + 2) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Inputs 'mask_index' with 1D data shall have length of batch_size or 2 * batch_size or 3 * batch_size + 2");
- }
- mask_type = (mask_dims[0] == batch_size ? AttentionMaskType::MASK_1D_KEY_SEQ_LEN : mask_dims[0] == 2 * batch_size ? AttentionMaskType::MASK_1D_END_START
- : AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START);
- } else if (mask_dims.size() == 2) {
- if (mask_dims[0] == batch_size && mask_dims[1] == total_sequence_length) {
- mask_type = AttentionMaskType::MASK_2D_KEY_PADDING;
- } else {
- // Add operator supports broadcasting. Here we handle a case with only one element in the 2nd dimension.
- if ((mask_dims[0] == batch_size || mask_dims[0] == 1) && mask_dims[1] == 1) {
- // Mask will have same value after propagation, which has same effect as no mask.
- mask_type = AttentionMaskType::MASK_2D_DUMMY;
- } else {
- return ORT_MAKE_STATUS(
- ONNXRUNTIME, INVALID_ARGUMENT,
- "Inputs 'mask_index' with 2D data shall have shape "
- "batch_size x total_sequence_length");
- }
- }
- } else if (mask_dims.size() == 3) {
- if (mask_dims[0] != batch_size || mask_dims[1] != sequence_length || mask_dims[2] != total_sequence_length) {
- return ORT_MAKE_STATUS(
- ONNXRUNTIME, INVALID_ARGUMENT,
- "Inputs 'mask_index' with 3D data shall have shape "
- "batch_size x sequence_length x total_sequence_length");
- }
- mask_type = AttentionMaskType::MASK_3D_ATTENTION;
- } else if (mask_dims.size() == 4) {
- if (mask_dims[0] != batch_size || mask_dims[1] != 1 || mask_dims[2] != mask_dims[3] ||
- mask_dims[2] < total_sequence_length) {
- return ORT_MAKE_STATUS(
- ONNXRUNTIME, INVALID_ARGUMENT,
- "Inputs 'mask_index' with 4D data shall have shape "
- "batch_size x 1 x max_sequence_length x max_sequence_length)");
- }
- max_sequence_length = mask_dims[3];
- mask_type = AttentionMaskType::MASK_4D_MEGATRON;
- if (this->is_unidirectional_) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Inputs 'mask_index' with 4D data shall have is_unidirectional set to false");
- }
- } else {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'mask_index' is expected to have 1, 2, 3 or 4 dimensions, got ",
- mask_dims.size());
- }
-
- return Status::OK();
-}
-
-Status AttentionBase::CheckInputs(const TensorShape& input_shape,
- const TensorShape& weights_shape,
- const TensorShape& bias_shape,
- const Tensor*& mask_index,
- const Tensor* past,
- const Tensor* attention_bias,
- void* parameters,
- const int max_threads_per_block,
- const Tensor* past_seq_len) const {
- if (num_heads_ > max_threads_per_block) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "num_heads should be no larger than ", max_threads_per_block);
- }
-
- return CheckInputs(input_shape, weights_shape, bias_shape, mask_index, past, attention_bias, parameters, past_seq_len);
-}
-
-Tensor* AttentionBase::GetPresent(OpKernelContext* context,
- const Tensor* past,
- int batch_size,
- int head_size,
- int kv_sequence_length,
- int& past_sequence_length) const {
- // Input and output shapes:
- // past : (2, batch_size, num_heads, past_sequence_length, head_size)
- // present : (2, batch_size, num_heads, past_sequence_length + kv_sequence_length, head_size)
-
- past_sequence_length = (nullptr != past) ? static_cast(past->Shape().GetDims()[3]) : 0;
- std::array present_dims{2, batch_size, num_heads_, static_cast(kv_sequence_length) + past_sequence_length, head_size};
-
- TensorShape present_shape(present_dims);
- Tensor* present = context->Output(1, present_shape);
- if (nullptr != past && nullptr == present) {
- ORT_THROW("Expect to have present state output when past state input is given");
- }
-
- return present;
-}
-
-} // namespace contrib
-} // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_base.h
index bd7f03379b2f0..2872fcfda5bbf 100644
--- a/onnxruntime/contrib_ops/cpu/bert/attention_base.h
+++ b/onnxruntime/contrib_ops/cpu/bert/attention_base.h
@@ -3,12 +3,19 @@
#pragma once
+#include
#include
#include "core/common/common.h"
-#include "core/framework/op_kernel.h"
#include "core/providers/cpu/mlas_backend_kernel_selector_config_utils.h"
+#ifndef SHARED_PROVIDER
+#include "core/framework/op_kernel.h"
+#include "core/providers/common.h"
+#endif
#include "contrib_ops/cpu/bert/attention_common.h"
#include "contrib_ops/cpu/bert/attention_parameters.h"
+#ifndef SHARED_PROVIDER
+#include "contrib_ops/cpu/bert/multihead_attention_helper.h"
+#endif
namespace onnxruntime {
namespace contrib {
@@ -25,14 +32,25 @@ class AttentionBase {
const int max_threads_per_block, // for CUDA
const Tensor* past_seq_len = nullptr) const;
+#ifdef SHARED_PROVIDER
Tensor* GetPresent(OpKernelContext* context,
const Tensor* past,
int batch_size,
int head_size,
int kv_sequence_length,
int& past_sequence_length) const;
+#else
+ template <typename TOpKernelContext>
+ Tensor* GetPresent(TOpKernelContext* context,
+ const Tensor* past,
+ int batch_size,
+ int head_size,
+ int kv_sequence_length,
+ int& past_sequence_length) const;
+#endif
protected:
+ // Keep the class layout identical in SHARED_PROVIDER and non-SHARED_PROVIDER builds.
MLAS_BACKEND_KERNEL_SELECTOR_CONFIG mlas_backend_kernel_selector_config_;
template
@@ -54,7 +72,9 @@ class AttentionBase {
require_same_hidden_size_ = require_same_hidden_size;
+#ifndef SHARED_PROVIDER
SetupMlasBackendKernelSelectorFromConfigOptions(mlas_backend_kernel_selector_config_, info.GetConfigOptions());
+#endif
}
Status CheckMask(const Tensor* mask_index,
@@ -84,5 +104,299 @@ class AttentionBase {
float scale_; // the scale to be used for softmax
};
+#ifndef SHARED_PROVIDER
+// Inline implementations of out-of-line methods for non-SHARED_PROVIDER builds
+// (attention_base.cc definitions are used only in the SHARED_PROVIDER bridge path).
+inline Status AttentionBase::CheckMask(const Tensor* mask_index,
+ AttentionMaskType& mask_type,
+ int64_t& max_sequence_length,
+ int64_t batch_size,
+ int64_t sequence_length,
+ int64_t total_sequence_length) const {
+ const auto& mask_dims = mask_index->Shape().GetDims();
+ if (mask_dims.size() == 1) {
+ if (mask_dims[0] != batch_size && mask_dims[0] != 2 * batch_size && mask_dims[0] != 3 * batch_size + 2) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Inputs 'mask_index' with 1D data shall have length of batch_size or 2 * batch_size or 3 * batch_size + 2");
+ }
+ mask_type = (mask_dims[0] == batch_size ? AttentionMaskType::MASK_1D_KEY_SEQ_LEN : mask_dims[0] == 2 * batch_size ? AttentionMaskType::MASK_1D_END_START
+ : AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START);
+ } else if (mask_dims.size() == 2) {
+ if (mask_dims[0] == batch_size && mask_dims[1] == total_sequence_length) {
+ mask_type = AttentionMaskType::MASK_2D_KEY_PADDING;
+ } else {
+ if ((mask_dims[0] == batch_size || mask_dims[0] == 1) && mask_dims[1] == 1) {
+ mask_type = AttentionMaskType::MASK_2D_DUMMY;
+ } else {
+ return ORT_MAKE_STATUS(
+ ONNXRUNTIME, INVALID_ARGUMENT,
+ "Inputs 'mask_index' with 2D data shall have shape "
+ "batch_size x total_sequence_length");
+ }
+ }
+ } else if (mask_dims.size() == 3) {
+ if (mask_dims[0] != batch_size || mask_dims[1] != sequence_length || mask_dims[2] != total_sequence_length) {
+ return ORT_MAKE_STATUS(
+ ONNXRUNTIME, INVALID_ARGUMENT,
+ "Inputs 'mask_index' with 3D data shall have shape "
+ "batch_size x sequence_length x total_sequence_length");
+ }
+ mask_type = AttentionMaskType::MASK_3D_ATTENTION;
+ } else if (mask_dims.size() == 4) {
+ if (mask_dims[0] != batch_size || mask_dims[1] != 1 || mask_dims[2] != mask_dims[3] ||
+ mask_dims[2] < total_sequence_length) {
+ return ORT_MAKE_STATUS(
+ ONNXRUNTIME, INVALID_ARGUMENT,
+ "Inputs 'mask_index' with 4D data shall have shape "
+ "batch_size x 1 x max_sequence_length x max_sequence_length)");
+ }
+ max_sequence_length = mask_dims[3];
+ mask_type = AttentionMaskType::MASK_4D_MEGATRON;
+ if (this->is_unidirectional_) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Inputs 'mask_index' with 4D data shall have is_unidirectional set to false");
+ }
+ } else {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'mask_index' is expected to have 1, 2, 3 or 4 dimensions, got ",
+ mask_dims.size());
+ }
+
+ return Status::OK();
+}
+
+inline Status AttentionBase::CheckInputs(const TensorShape& input_shape,
+ const TensorShape& weights_shape,
+ const TensorShape& bias_shape,
+ const Tensor*& mask_index,
+ const Tensor* past,
+ const Tensor* attention_bias,
+ void* parameters,
+ const Tensor* past_seq_len) const {
+ if (past != nullptr && attention_bias != nullptr) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Attention cannot have both past and attention_bias");
+ }
+
+ const auto& dims = input_shape.GetDims();
+ if (dims.size() != 3) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'input' is expected to have 3 dimensions, got ",
+ dims.size());
+ }
+
+ auto& batch_size = dims[0];
+ auto& sequence_length = dims[1];
+ int64_t input_hidden_size = dims[2];
+
+ const auto& bias_dims = bias_shape.GetDims();
+ if (bias_dims.size() != 1) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'bias' is expected to have 1 dimension, got ",
+ bias_dims.size());
+ }
+
+ const auto& weights_dims = weights_shape.GetDims();
+ if (weights_dims.size() != 2) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'weights' is expected to have 2 dimensions, got ",
+ weights_dims.size());
+ }
+ if (weights_dims[0] != input_hidden_size) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 1 dimension 0 should have same length as dimension 2 of input 0");
+ }
+
+ if (bias_dims[0] != weights_dims[1]) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'bias' dimension 0 should have same length as dimension 1 of input 'weights'");
+ }
+
+ int64_t q_hidden_size = bias_dims[0] / static_cast(3);
+ int64_t k_hidden_size = q_hidden_size;
+ int64_t v_hidden_size = k_hidden_size;
+ if (qkv_hidden_sizes_.size() != 0) {
+ if (qkv_hidden_sizes_.size() != 3) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "qkv_hidden_sizes attribute should have 3 elements");
+ }
+
+ for (size_t i = 0; i < qkv_hidden_sizes_.size(); i++) {
+ if (qkv_hidden_sizes_[i] % num_heads_ != 0) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "hidden_size should be divisible by num_heads:", qkv_hidden_sizes_[i]);
+ }
+ }
+
+ q_hidden_size = qkv_hidden_sizes_[0];
+ k_hidden_size = qkv_hidden_sizes_[1];
+ v_hidden_size = qkv_hidden_sizes_[2];
+ }
+
+ int64_t kv_sequence_length = sequence_length;
+
+ if (q_hidden_size != k_hidden_size) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "qkv_hidden_sizes first element should be same as the second");
+ }
+
+ if (this->require_same_hidden_size_ && k_hidden_size != v_hidden_size) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Hidden size of Q, K and V shall be same");
+ }
+
+ if (bias_dims[0] != q_hidden_size + k_hidden_size + v_hidden_size) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'bias' dimension 0 should have same length as sum of Q/K/V hidden sizes:",
+ " q_hidden_size=", q_hidden_size, " k_hidden_size=", k_hidden_size, " v_hidden_size=",
+ v_hidden_size, "bias_dims[0]=", bias_dims[0]);
+ }
+
+ int64_t past_sequence_length = 0;
+ if (past != nullptr) {
+ if (k_hidden_size != v_hidden_size) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'past' expect k_hidden_size == v_hidden_size");
+ }
+
+ const auto& past_dims = past->Shape().GetDims();
+ if (past_dims.size() != 5) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'past' is expected to have 5 dimension, got ",
+ past_dims.size());
+ }
+
+ if (past_dims[0] != 2) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'past' dimension 0 shall have length of 2");
+ }
+
+ if (past_dims[1] != batch_size) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Inputs 'past' dimension 1 shall have same length as dimension 0 of input 0");
+ }
+
+ if (static_cast(past_dims[2]) != num_heads_) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Inputs 'past' dimension 2 shall have length of num_heads", num_heads_);
+ }
+
+ if (static_cast(past_dims[4]) != k_hidden_size / num_heads_) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Inputs 'past' dimension 2 shall have length of ", k_hidden_size / num_heads_);
+ }
+
+ if (!past_present_share_buffer_) {
+ past_sequence_length = past_dims[3];
+ } else {
+ if (past_seq_len == nullptr || !::onnxruntime::IsScalarOr1ElementVector(past_seq_len)) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "past_sequence_length tensor must be of one element when past_present_share_buffer is set");
+ }
+ past_sequence_length = *past_seq_len->Data();
+ }
+ }
+
+ int64_t total_sequence_length = kv_sequence_length + past_sequence_length;
+ if (past != nullptr && past_present_share_buffer_) {
+ const auto& past_dims = past->Shape().GetDims();
+ if (past_dims[3] < total_sequence_length) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "when past_present_share_buffer, past tensor sequence must not smaller than total_sequence_length ");
+ }
+ }
+
+ int64_t max_sequence_length = -1;
+ AttentionMaskType mask_type = AttentionMaskType::MASK_NONE;
+ if (mask_index != nullptr) {
+ mask_type = AttentionMaskType::MASK_UNKNOWN;
+ auto status = this->CheckMask(mask_index, mask_type,
+ max_sequence_length, batch_size, sequence_length, total_sequence_length);
+ if (status != Status::OK()) {
+ return status;
+ }
+
+ if (mask_type == AttentionMaskType::MASK_2D_DUMMY) {
+ mask_index = nullptr;
+ mask_type = AttentionMaskType::MASK_NONE;
+ }
+ }
+
+ gsl::span attention_bias_dims;
+ if (attention_bias != nullptr) {
+ attention_bias_dims = attention_bias->Shape().GetDims();
+
+ ORT_RETURN_IF_ERROR(::onnxruntime::contrib::multihead_attention_helper::CheckAttentionBias(
+ attention_bias_dims, batch_size, num_heads_, sequence_length, total_sequence_length));
+ }
+
+ if (past != nullptr && past_present_share_buffer_) {
+ if (max_sequence_length <= 0) {
+ max_sequence_length = past->Shape().GetDims()[3];
+ }
+ if (max_sequence_length != past->Shape().GetDims()[3]) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "max_sequence_length not matching from mask and past when past_present_share_buffer_ is set");
+ }
+ }
+
+ if (parameters != nullptr) {
+ AttentionParameters* output_parameters = reinterpret_cast(parameters);
+ output_parameters->batch_size = static_cast(batch_size);
+ output_parameters->sequence_length = static_cast(sequence_length);
+ output_parameters->past_sequence_length = static_cast(past_sequence_length);
+ output_parameters->kv_sequence_length = static_cast(kv_sequence_length);
+ output_parameters->total_sequence_length = static_cast(total_sequence_length);
+ output_parameters->max_sequence_length = static_cast(max_sequence_length);
+ output_parameters->input_hidden_size = static_cast(input_hidden_size);
+ output_parameters->hidden_size = static_cast(q_hidden_size);
+ output_parameters->v_hidden_size = static_cast(v_hidden_size);
+ output_parameters->head_size = static_cast(q_hidden_size) / num_heads_;
+ output_parameters->v_head_size = static_cast(v_hidden_size) / num_heads_;
+ output_parameters->num_heads = num_heads_;
+ output_parameters->is_unidirectional = is_unidirectional_;
+ output_parameters->past_present_share_buffer = (past_present_share_buffer_ != 0 && past != nullptr);
+ output_parameters->do_rotary = do_rotary_;
+ output_parameters->rotary_dim = rotary_embedding_ == 0 ? (int)(output_parameters->head_size) : rotary_embedding_;
+ output_parameters->mask_filter_value = mask_filter_value_;
+ output_parameters->scale = scale_;
+ output_parameters->mask_type = mask_type;
+ output_parameters->broadcast_attn_bias_dim_0 = attention_bias_dims.size() > 0 && attention_bias_dims[0] == 1;
+ output_parameters->broadcast_attn_bias_dim_1 = attention_bias_dims.size() > 1 && attention_bias_dims[1] == 1;
+ output_parameters->qkv_format = Q_K_V_BNSH;
+ }
+
+ return Status::OK();
+}
+
+inline Status AttentionBase::CheckInputs(const TensorShape& input_shape,
+ const TensorShape& weights_shape,
+ const TensorShape& bias_shape,
+ const Tensor*& mask_index,
+ const Tensor* past,
+ const Tensor* attention_bias,
+ void* parameters,
+ const int max_threads_per_block,
+ const Tensor* past_seq_len) const {
+ if (num_heads_ > max_threads_per_block) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "num_heads should be no larger than ", max_threads_per_block);
+ }
+
+ return CheckInputs(input_shape, weights_shape, bias_shape, mask_index, past, attention_bias, parameters, past_seq_len);
+}
+
+template
+inline Tensor* AttentionBase::GetPresent(TOpKernelContext* context,
+ const Tensor* past,
+ int batch_size,
+ int head_size,
+ int kv_sequence_length,
+ int& past_sequence_length) const {
+ past_sequence_length = (nullptr != past) ? static_cast(past->Shape().GetDims()[3]) : 0;
+ std::array present_dims{2, batch_size, num_heads_,
+ static_cast(kv_sequence_length) + past_sequence_length, head_size};
+
+ TensorShape present_shape(present_dims);
+ Tensor* present = context->Output(1, present_shape);
+ if (nullptr != past && nullptr == present) {
+ ORT_THROW("Expect to have present state output when past state input is given");
+ }
+
+ return present;
+}
+#endif // SHARED_PROVIDER
+
} // namespace contrib
} // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_parameters.h b/onnxruntime/contrib_ops/cpu/bert/attention_parameters.h
index 9a123e80adc18..f316a0dfdf91c 100644
--- a/onnxruntime/contrib_ops/cpu/bert/attention_parameters.h
+++ b/onnxruntime/contrib_ops/cpu/bert/attention_parameters.h
@@ -10,33 +10,33 @@ namespace contrib {
// Parameters deduced from node attributes and inputs/outputs.
struct AttentionParameters {
- int batch_size;
- int sequence_length;
- int kv_sequence_length; // input sequence length of K or V
- int past_sequence_length; // sequence length in past state of K or V
- int total_sequence_length; // total sequence length of K or V
- int max_sequence_length; // max sequence length from 4D mask
- int input_hidden_size; // first dimension of weights for input projection
- int hidden_size; // hidden size of Q or K
- int head_size; // hidden size per head of Q or K
- int v_hidden_size; // hidden size of V
- int v_head_size; // hidden size per head of V
- int num_heads;
- int num_splits; // number of splits for splitkv
+ int batch_size = 0;
+ int sequence_length = 0;
+ int kv_sequence_length = 0; // input sequence length of K or V
+ int past_sequence_length = 0; // sequence length in past state of K or V
+ int total_sequence_length = 0; // total sequence length of K or V
+ int max_sequence_length = 0; // max sequence length from 4D mask
+ int input_hidden_size = 0; // first dimension of weights for input projection
+ int hidden_size = 0; // hidden size of Q or K
+ int head_size = 0; // hidden size per head of Q or K
+ int v_hidden_size = 0; // hidden size of V
+ int v_head_size = 0; // hidden size per head of V
+ int num_heads = 0;
+ int num_splits = 0; // number of splits for splitkv
int rotary_dim = 0; // rotary embedding dimension
- int beam_width;
+ int beam_width = 0;
bool is_unidirectional = false;
bool past_present_share_buffer = false;
bool is_packed_qkv = false; // whether qkv is packed
bool do_rotary = false;
bool broadcast_attn_bias_dim_0 = false;
bool broadcast_attn_bias_dim_1 = false;
- float mask_filter_value;
- float scale;
+ float mask_filter_value = 0.0f;
+ float scale = 0.0f;
bool use_tf32 = false;
bool is_output_bnsh = false; // whether the output format is BNSH
- AttentionMaskType mask_type;
- AttentionQkvFormat qkv_format;
+ AttentionMaskType mask_type = AttentionMaskType::MASK_NONE;
+ AttentionQkvFormat qkv_format = AttentionQkvFormat::Q_K_V_BNSH;
};
// Parameters deduced from node attributes and inputs/outputs.
diff --git a/onnxruntime/contrib_ops/cpu/bert/longformer_attention_base.cc b/onnxruntime/contrib_ops/cpu/bert/longformer_attention_base.cc
deleted file mode 100644
index 97f75d297d789..0000000000000
--- a/onnxruntime/contrib_ops/cpu/bert/longformer_attention_base.cc
+++ /dev/null
@@ -1,141 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "longformer_attention_base.h"
-
-namespace onnxruntime {
-namespace contrib {
-
-Status LongformerAttentionBase::CheckInputs(const TensorShape& input_shape,
- const TensorShape& weights_shape,
- const TensorShape& bias_shape,
- const TensorShape& attention_mask_shape,
- const TensorShape& global_weights_shape,
- const TensorShape& global_bias_shape,
- const TensorShape& global_mask_shape) const {
- // Input shapes:
- // input : (batch_size, sequence_length, hidden_size)
- // weights : (hidden_size, 3 * hidden_size) -- format 1
- // (3, hidden_size, hidden_size) -- format 0
- // bias : (3 * hidden_size) -- format 1 (bias for Q, K, V)
- // (5 * hidden_size) -- format 0 (bias for Q, K, V, Global_K, Global_V)
- // attention_mask : (batch_size, sequence_length)
- // global_weights : (hidden_size, 3 * hidden_size) -- format 1
- // (3, hidden_size, hidden_size) -- format 0
- // global_bias : (3 * hidden_size) -- format 1 (bias for Global_Q, Global_K, Global_V)
- // (1 * hidden_size) -- format 0 (bias for Global_Q)
- // global_attention_mask : (batch_size, sequence_length)
-
- const auto& dims = input_shape.GetDims();
- if (dims.size() != 3) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'input' is expected to have 3 dimensions, got ",
- dims.size());
- }
-
- int batch_size = static_cast(dims[0]);
- int sequence_length = static_cast(dims[1]);
- auto hidden_size = dims[2];
- if (sequence_length % (2 * window_) != 0) {
- return ORT_MAKE_STATUS(
- ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'input' dimension 1 should be divisible by 2W, where W is value of the window attribute.");
- }
- if (hidden_size % num_heads_ != 0) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'input' dimension 2 should be divisible by value of the num_heads attribute.");
- }
-
- const auto& weights_dims = weights_shape.GetDims();
- bool use_merged_qkv_weights = (weights_shape.NumDimensions() == 2);
- if (use_merged_qkv_weights) {
- if (weights_dims[0] != hidden_size || weights_dims[1] != 3 * hidden_size) {
- return ORT_MAKE_STATUS(
- ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'weights' shape should be (hidden_size, 3 * hidden_size) for format 1");
- }
- } else {
- if (weights_dims.size() != 3 ||
- weights_dims[0] != 3 || weights_dims[1] != hidden_size || weights_dims[2] != hidden_size) {
- return ORT_MAKE_STATUS(
- ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'weights' shape should be (3, hidden_size, hidden_size) for format 0");
- }
- }
-
- const auto& bias_dims = bias_shape.GetDims();
- if (bias_dims.size() != 1) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'bias' is expected to have 1 dimension, got ",
- bias_dims.size());
- }
-
- if (use_merged_qkv_weights) {
- if (bias_dims[0] != 3 * hidden_size) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'bias' shape should be (3 * hidden_size) for format 1");
- }
- } else {
- if (bias_dims[0] != 5 * hidden_size) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'bias' shape should be (5 * hidden_size) for format 0");
- }
- }
-
- const auto& mask_dims = attention_mask_shape.GetDims();
- if (mask_dims.size() == 2) {
- if (static_cast(mask_dims[0]) != batch_size || static_cast(mask_dims[1]) != sequence_length) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Inputs 'attention_mask' shape shall be (batch_size, sequence_length)");
- }
- } else {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'attention_mask' is expected to have 2 dimensions, got ", mask_dims.size());
- }
-
- const auto& global_weights_dims = global_weights_shape.GetDims();
- if (use_merged_qkv_weights) {
- if (global_weights_dims.size() != 2 ||
- global_weights_dims[0] != hidden_size || global_weights_dims[1] != 3 * hidden_size) {
- return ORT_MAKE_STATUS(
- ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'global_weights' shape should be (hidden_size, 3 * hidden_size) for format 1");
- }
- } else {
- if (global_weights_dims.size() != 3 || global_weights_dims[0] != 3 ||
- global_weights_dims[1] != hidden_size || global_weights_dims[2] != hidden_size) {
- return ORT_MAKE_STATUS(
- ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'global_weights' shape should be (3, hidden_size, hidden_size) for format 0");
- }
- }
-
- const auto& global_bias_dims = global_bias_shape.GetDims();
- if (global_bias_dims.size() != 1) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'global_bias' is expected to have 1 dimension, got ",
- global_bias_dims.size());
- }
-
- if (use_merged_qkv_weights) {
- if (global_bias_dims[0] != 3 * hidden_size) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'global_bias' shape should be (3 * hidden_size) for format 1");
- }
- } else {
- if (global_bias_dims[0] != hidden_size) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'global_bias' shape should be (hidden_size) for format 0");
- }
- }
-
- const auto& global_mask_dims = global_mask_shape.GetDims();
- if (global_mask_dims.size() != 2 ||
- static_cast(global_mask_dims[0]) != batch_size ||
- static_cast(global_mask_dims[1]) != sequence_length) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
- "Input 'global_attention_mask' shape shall be (batch_size, sequence_length)");
- }
-
- return Status::OK();
-}
-
-} // namespace contrib
-} // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/bert/longformer_attention_base.h b/onnxruntime/contrib_ops/cpu/bert/longformer_attention_base.h
index ac1cccaa83cf9..bb1dfea38ae80 100644
--- a/onnxruntime/contrib_ops/cpu/bert/longformer_attention_base.h
+++ b/onnxruntime/contrib_ops/cpu/bert/longformer_attention_base.h
@@ -4,7 +4,9 @@
#pragma once
#include "core/common/common.h"
+#ifndef SHARED_PROVIDER
#include "core/framework/op_kernel.h"
+#endif
namespace onnxruntime {
namespace contrib {
@@ -20,7 +22,8 @@ class LongformerAttentionBase {
const TensorShape& global_attention_mask_shape) const;
protected:
- LongformerAttentionBase(const OpKernelInfo& info) {
+ template
+ LongformerAttentionBase(const KernelInfoType& info) {
int64_t num_heads = 0;
ORT_ENFORCE(info.GetAttr("num_heads", &num_heads).IsOK() && num_heads > 0);
num_heads_ = static_cast(num_heads);
@@ -43,5 +46,126 @@ constexpr const char* kUseHalf4 = "ORT_LONGFORMER_USE_HALF4";
} // namespace longformer
+#ifndef SHARED_PROVIDER
+// Inline implementation of CheckInputs for non-SHARED_PROVIDER builds.
+inline Status LongformerAttentionBase::CheckInputs(const TensorShape& input_shape,
+ const TensorShape& weights_shape,
+ const TensorShape& bias_shape,
+ const TensorShape& attention_mask_shape,
+ const TensorShape& global_weights_shape,
+ const TensorShape& global_bias_shape,
+ const TensorShape& global_mask_shape) const {
+ const auto& dims = input_shape.GetDims();
+ if (dims.size() != 3) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'input' is expected to have 3 dimensions, got ",
+ dims.size());
+ }
+
+ int batch_size = static_cast(dims[0]);
+ int sequence_length = static_cast(dims[1]);
+ auto hidden_size = dims[2];
+ if (sequence_length % (2 * window_) != 0) {
+ return ORT_MAKE_STATUS(
+ ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'input' dimension 1 should be divisible by 2W, where W is value of the window attribute.");
+ }
+ if (hidden_size % num_heads_ != 0) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'input' dimension 2 should be divisible by value of the num_heads attribute.");
+ }
+
+ const auto& weights_dims = weights_shape.GetDims();
+ bool use_merged_qkv_weights = (weights_shape.NumDimensions() == 2);
+ if (use_merged_qkv_weights) {
+ if (weights_dims[0] != hidden_size || weights_dims[1] != 3 * hidden_size) {
+ return ORT_MAKE_STATUS(
+ ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'weights' shape should be (hidden_size, 3 * hidden_size) for format 1");
+ }
+ } else {
+ if (weights_dims.size() != 3 ||
+ weights_dims[0] != 3 || weights_dims[1] != hidden_size || weights_dims[2] != hidden_size) {
+ return ORT_MAKE_STATUS(
+ ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'weights' shape should be (3, hidden_size, hidden_size) for format 0");
+ }
+ }
+
+ const auto& bias_dims = bias_shape.GetDims();
+ if (bias_dims.size() != 1) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'bias' is expected to have 1 dimension, got ",
+ bias_dims.size());
+ }
+
+ if (use_merged_qkv_weights) {
+ if (bias_dims[0] != 3 * hidden_size) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'bias' shape should be (3 * hidden_size) for format 1");
+ }
+ } else {
+ if (bias_dims[0] != 5 * hidden_size) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'bias' shape should be (5 * hidden_size) for format 0");
+ }
+ }
+
+ const auto& mask_dims = attention_mask_shape.GetDims();
+ if (mask_dims.size() == 2) {
+ if (static_cast(mask_dims[0]) != batch_size || static_cast(mask_dims[1]) != sequence_length) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Inputs 'attention_mask' shape shall be (batch_size, sequence_length)");
+ }
+ } else {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'attention_mask' is expected to have 2 dimensions, got ", mask_dims.size());
+ }
+
+ const auto& global_weights_dims = global_weights_shape.GetDims();
+ if (use_merged_qkv_weights) {
+ if (global_weights_dims.size() != 2 ||
+ global_weights_dims[0] != hidden_size || global_weights_dims[1] != 3 * hidden_size) {
+ return ORT_MAKE_STATUS(
+ ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'global_weights' shape should be (hidden_size, 3 * hidden_size) for format 1");
+ }
+ } else {
+ if (global_weights_dims.size() != 3 || global_weights_dims[0] != 3 ||
+ global_weights_dims[1] != hidden_size || global_weights_dims[2] != hidden_size) {
+ return ORT_MAKE_STATUS(
+ ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'global_weights' shape should be (3, hidden_size, hidden_size) for format 0");
+ }
+ }
+
+ const auto& global_bias_dims = global_bias_shape.GetDims();
+ if (global_bias_dims.size() != 1) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'global_bias' is expected to have 1 dimension, got ",
+ global_bias_dims.size());
+ }
+
+ if (use_merged_qkv_weights) {
+ if (global_bias_dims[0] != 3 * hidden_size) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'global_bias' shape should be (3 * hidden_size) for format 1");
+ }
+ } else {
+ if (global_bias_dims[0] != hidden_size) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'global_bias' shape should be (hidden_size) for format 0");
+ }
+ }
+
+ const auto& global_mask_dims = global_mask_shape.GetDims();
+ if (global_mask_dims.size() != 2 ||
+ static_cast(global_mask_dims[0]) != batch_size ||
+ static_cast(global_mask_dims[1]) != sequence_length) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+ "Input 'global_attention_mask' shape shall be (batch_size, sequence_length)");
+ }
+
+ return Status::OK();
+}
+#endif // SHARED_PROVIDER
+
} // namespace contrib
} // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/crop.h b/onnxruntime/contrib_ops/cpu/crop.h
index 3b72ef429c1f7..97577304e948e 100644
--- a/onnxruntime/contrib_ops/cpu/crop.h
+++ b/onnxruntime/contrib_ops/cpu/crop.h
@@ -4,7 +4,9 @@
#pragma once
#include "core/common/common.h"
+#ifndef SHARED_PROVIDER
#include "core/framework/op_kernel.h"
+#endif
#include
@@ -13,9 +15,10 @@ namespace contrib {
class CropBase {
protected:
- CropBase(const OpKernelInfo& info)
- : border_(info.GetAttrsOrDefault("border")),
- scale_(info.GetAttrsOrDefault("scale")) {
+ template
+ CropBase(const KernelInfoType& info)
+ : border_(info.template GetAttrsOrDefault("border")),
+ scale_(info.template GetAttrsOrDefault("scale")) {
}
Status ValidateInput(const Tensor* X) const {
diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc
index a656abb098911..56bff8aa30f68 100644
--- a/onnxruntime/core/framework/allocator.cc
+++ b/onnxruntime/core/framework/allocator.cc
@@ -237,9 +237,22 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::AMD, device_id),
mem_type1);
} else if (strcmp(name1, onnxruntime::WEBGPU_BUFFER) == 0 ||
- strcmp(name1, onnxruntime::WEBNN_TENSOR) == 0) {
+ strcmp(name1, onnxruntime::WEBNN_TENSOR) == 0 ||
+ // Accept pre-1.25 names "WebGPU_Buffer"/"WebNN_Tensor" for backward compatibility
+ // with released onnxruntime-genai that still uses the old names.
+ // Normalize to the current (short) constant so downstream name comparisons work.
+ // See: https://github.com/microsoft/onnxruntime/pull/27207
+ strcmp(name1, "WebGPU_Buffer") == 0 ||
+ strcmp(name1, "WebNN_Tensor") == 0) {
+ // Map old long names to current short constants to keep downstream name comparisons consistent.
+ const char* normalized_name = name1;
+ if (strcmp(name1, "WebGPU_Buffer") == 0) {
+ normalized_name = onnxruntime::WEBGPU_BUFFER;
+ } else if (strcmp(name1, "WebNN_Tensor") == 0) {
+ normalized_name = onnxruntime::WEBNN_TENSOR;
+ }
*out = new OrtMemoryInfo(
- name1, type,
+ normalized_name, type,
OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NONE, device_id),
mem_type1);
diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc
index 6bcbdc401619a..bee7f048b7c6e 100644
--- a/onnxruntime/core/framework/tensorprotoutils.cc
+++ b/onnxruntime/core/framework/tensorprotoutils.cc
@@ -349,66 +349,112 @@ Status TensorProtoWithExternalDataToTensorProto(
return Status::OK();
}
-Status ValidateExternalDataPath(const std::filesystem::path& base_dir,
- const std::filesystem::path& location,
- const std::filesystem::path& model_path) {
- // Reject absolute paths
- ORT_RETURN_IF(location.is_absolute(),
- "Absolute paths not allowed for external data location");
- if (!base_dir.empty()) {
- // Resolve and verify the path stays within model directory
- auto base_canonical = std::filesystem::weakly_canonical(base_dir);
- // If the symlink exists, it resolves to the target path;
- // so if the symlink is outside the directory it would be caught here.
- auto resolved = std::filesystem::weakly_canonical(base_dir / location);
-
- // Check that resolved path starts with base directory
- auto [base_end, resolved_it] = std::mismatch(
- base_canonical.begin(), base_canonical.end(),
- resolved.begin(), resolved.end());
-
- if (base_end != base_canonical.end()) {
- // If validation against logical base_dir fails, we check against the
- // real (canonical) path of the model file to support symlinked models
- // (e.g. models in Hugging Face Hub local cache).
- if (!model_path.empty()) {
- auto real_model_dir = std::filesystem::weakly_canonical(model_path).parent_path();
-
- auto [real_base_end, real_resolved_it] = std::mismatch(
- real_model_dir.begin(), real_model_dir.end(),
- resolved.begin(), resolved.end());
-
- if (real_base_end == real_model_dir.end()) {
- return Status::OK();
- }
+// Wraps std::filesystem::weakly_canonical with error_code handling.
+static Status WeaklyCanonicalPath(const std::filesystem::path& path, std::filesystem::path& result) {
+ std::error_code ec;
+ result = std::filesystem::weakly_canonical(path, ec);
+ ORT_RETURN_IF(ec, "Failed to get the weakly canonical path: ", path, " - ", ec.message());
+ return Status::OK();
+}
- return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
- "External data path: ", location, " (resolved path: ", resolved,
- ") escapes both model directory: ", base_dir,
- " and real model directory: ", real_model_dir);
- }
+// Wraps std::filesystem::exists with error_code handling.
+static Status PathExists(const std::filesystem::path& path, bool& exists) {
+ std::error_code ec;
+ exists = std::filesystem::exists(path, ec);
+ ORT_RETURN_IF(ec, "Failed to check existence of path: ", path, " - ", ec.message());
+ return Status::OK();
+}
- return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
- "External data path: ", location, " (resolved path: ", resolved,
- ") escapes model directory: ", base_dir);
- }
- } else {
- // The basedir is empty, which occurs when 1) the session loads a model from bytes and 2) the application does not
- // set an external file folder path via the session config option
- // `kOrtSessionOptionsModelExternalInitializersFileFolderPath`.
-
- // We conservatively check that the normalized relative path does not contain ".." path components that would allow
- // access to arbitrary files outside of the current working directory. Based on ONNX checker validation.
- auto norm_location = location.lexically_normal();
-
- for (const auto& path_component : norm_location) {
- if (path_component == ORT_TSTR("..")) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "External data path: ", location,
- " (model loaded from bytes) escapes working directory");
- }
+// Checks whether `path` has the given path prefix.
+static bool HasPathComponentPrefix(const std::filesystem::path& prefix, const std::filesystem::path& path) {
+ auto [prefix_end, path_it] = std::mismatch(prefix.begin(), prefix.end(), path.begin(), path.end());
+ return prefix_end == prefix.end();
+}
+
+Status ValidateExternalDataPath(const std::filesystem::path& model_path,
+ const std::filesystem::path& external_data_path) {
+ ORT_RETURN_IF(external_data_path.empty(), "Empty external data path not allowed");
+
+ // Note: Use !root_path().empty() to reject paths like '/some/path` even on Windows.
+ ORT_RETURN_IF(!external_data_path.root_path().empty(), "Absolute path not allowed for external data location");
+
+#if defined(__wasm__)
+ std::error_code error_code;
+ std::filesystem::current_path(error_code);
+ if (error_code) {
+ // If we can't access the current working directory in a WASM build, we assume that the WASM
+ // environment does not have a virtual filesystem and defer validation to an ExternalDataLoader for
+ // a WASM EP.
+ return Status::OK();
+ }
+#endif
+
+ // Determine the model directory: use model file's parent directory if provided,
+ // otherwise use the current working directory.
+ std::filesystem::path model_dir = model_path.empty() || model_path.parent_path().empty()
+ ? std::filesystem::path{"."}
+ : model_path.parent_path();
+
+ // Resolve the model directory and the external data path to their weakly canonical forms, which
+ // resolves symlinks but does not require that the paths actually exist yet.
+ std::filesystem::path model_dir_canonical;
+ std::filesystem::path external_data_path_canonical;
+ ORT_RETURN_IF_ERROR(WeaklyCanonicalPath(model_dir, model_dir_canonical));
+ ORT_RETURN_IF_ERROR(WeaklyCanonicalPath(model_dir_canonical / external_data_path, external_data_path_canonical));
+
+ // Check that the external data path is contained by the model directory.
+ // If it is, check if the external data file actually exists.
+ if (HasPathComponentPrefix(model_dir_canonical, external_data_path_canonical)) {
+ bool path_exists = false;
+ ORT_RETURN_IF_ERROR(PathExists(external_data_path_canonical, path_exists));
+ ORT_RETURN_IF(!path_exists, "External data path does not exist: ", external_data_path_canonical);
+ return Status::OK();
+ }
+
+ if (model_path.empty()) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+ "External data path for model loaded from bytes escapes working directory. ",
+ "External data path: ", external_data_path, " resolved path: ",
+ external_data_path_canonical, " ", "working directory: ", model_dir);
+ }
+
+ // The model file itself may be a symlink. Therefore, check against the real/canonical directory of the model
+ // after resolving all symlinks.
+ //
+ // This supports symlinked models (e.g., Hugging Face Hub local cache) where the canonical
+ // parent of the model file differs from the parent directory of the symlinked model file.
+ std::error_code ec;
+ if (!std::filesystem::is_symlink(model_path, ec)) {
+ // Note: is_symlink returns false if file is not a symlink, file does not exist, or an error
+ // occurred (e.g., permissions). In any of these cases, we just return an error.
+ std::string fs_error_msg;
+ if (ec) {
+ fs_error_msg = " filesystem::is_symlink error: " + ec.message();
}
+
+ return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+ "External data path for model escapes model directory. ",
+ "External data path: ", external_data_path, " resolved path: ",
+ external_data_path_canonical, " ", "model directory: ", model_dir, fs_error_msg);
}
- return Status::OK();
+
+ std::filesystem::path real_model_path;
+ ORT_RETURN_IF_ERROR(WeaklyCanonicalPath(model_path, real_model_path));
+ auto real_model_dir = real_model_path.parent_path();
+
+ // Check that the external data path is contained by the real model directory.
+ // If it is, check if the external data file actually exists.
+ if (HasPathComponentPrefix(real_model_dir, external_data_path_canonical)) {
+ bool path_exists = false;
+ ORT_RETURN_IF_ERROR(PathExists(external_data_path_canonical, path_exists));
+ ORT_RETURN_IF(!path_exists, "External data path does not exist: ", external_data_path_canonical);
+ return Status::OK();
+ }
+
+ return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+ "External data path: ", external_data_path, " (resolved path: ",
+ external_data_path_canonical, ") escapes both model directory: ", model_dir,
+ " and real model directory: ", real_model_dir);
}
Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto,
diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h
index f3f33a32b8076..e7649c072416c 100644
--- a/onnxruntime/core/framework/tensorprotoutils.h
+++ b/onnxruntime/core/framework/tensorprotoutils.h
@@ -539,19 +539,33 @@ Status TensorProtoWithExternalDataToTensorProto(
ONNX_NAMESPACE::TensorProto& new_tensor_proto);
///
-/// Validates if the external data path is under the model directory.
-/// If the model is a symlink, it checks against both the logical model directory (base_dir)
-/// and the real/canonical directory of the model.
-/// If the `base_dir` is empty, the function only ensures that `location` is not an absolute path.
+/// Validates that the given external data path is not an absolute path, is under the model directory
+/// (after resolving symlinks), and exists.
+///
+/// The model path can be empty if the model is loaded from bytes and the application did not specify a directory
+/// for external data files. In this case, the external data path must be contained under the current working
+/// directory.
+///
+/// The model path can point to a non-existing model file if the model is loaded from bytes and the application
+/// specified a directory for external data files via the session config entry
+/// `kOrtSessionOptionsModelExternalInitializersFileFolderPath`. In this case, the model_path is set to
+/// " / virtual_model.onnx" and the external data path
+/// must be contained under `kOrtSessionOptionsModelExternalInitializersFileFolderPath`.
+///
+/// If the model itself is a symlink, this function checks against both the directory containing the symlink
+/// and the real/canonical directory of the model after resolving all symlinks.
+///
+/// On WASM builds, this function skips most validation (except checks for non-empty/non-absolute path) if we are
+/// unable to query the current working directory, as this indicates that the WASM environment does not have
+/// a valid filesystem. If skipped, an ExternalDataLoader will validate the location and contents of the
+/// external data file at the time of access.
///
-/// Logical model location directory
-/// Location string retrieved from TensorProto external data
-/// Optional path to the model file, used for canonical path validation if base_dir check fails
-/// The function will fail if the resolved full path is not under the logical model directory
-/// nor the real directory of the model path
-Status ValidateExternalDataPath(const std::filesystem::path& base_dir,
- const std::filesystem::path& location,
- const std::filesystem::path& model_path = {});
+/// Path to the model file. Can be empty or point to a virtual file.
+/// External data file path to be validated.
+/// Retrieved from TensorProto external data info
+/// The function will fail if the resolved `external_data_path` path is not under the model directory
+Status ValidateExternalDataPath(const std::filesystem::path& model_path,
+ const std::filesystem::path& external_data_path);
#endif // !defined(SHARED_PROVIDER)
diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc
index 5aa466ecb5bc7..3599edbfcd357 100644
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@@ -3742,10 +3742,7 @@ Status Graph::ConvertInitializersIntoOrtValues() {
FindAllSubgraphs(all_subgraphs);
const auto& model_path = GetModel().ModelPath();
- PathString model_dir;
- if (!model_path.empty()) {
- ORT_RETURN_IF_ERROR(GetDirNameFromFilePath(model_path, model_dir));
- }
+ std::unordered_set<PathString> validated_external_data_paths;
auto put_weights_maybe_in_memory_func = [&](Graph& graph) -> Status {
// if we have any initializers that are not in memory, put them there.
@@ -3771,11 +3768,17 @@ Status Graph::ConvertInitializersIntoOrtValues() {
std::unique_ptr<ExternalDataInfo> external_data_info;
ORT_RETURN_IF_ERROR(onnxruntime::ExternalDataInfo::Create(tensor_proto.external_data(), external_data_info));
const auto& location = external_data_info->GetRelPath();
- auto st = utils::ValidateExternalDataPath(model_dir, location, model_path);
- if (!st.IsOK()) {
- return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
- "External data path validation failed for initializer: ", tensor_proto.name(),
- ". Error: ", st.ErrorMessage());
+
+ if (validated_external_data_paths.count(location) == 0) {
+ auto st = utils::ValidateExternalDataPath(model_path, location);
+
+ if (!st.IsOK()) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+ "External data path validation failed for initializer: ", tensor_proto.name(),
+ ". Error: ", st.ErrorMessage());
+ }
+
+ validated_external_data_paths.insert(location);
}
}
continue;
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc
index da2e8fc37382a..fdc0818e8437b 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc
@@ -43,7 +43,7 @@ bool IsDQWeightSigned(int32_t dt_weight) {
}
// Holds transposed weight/scale/zp tensors and their TensorProtos for MatMulNBits.
-// Used by both DQMatMulToMatMulNBitsAction and DQCastMatMulToMatMulNBitsAction.
+// Used by DQMatMulToMatMulNBitsAction.
struct TransposedQuantizedTensors {
Tensor weight;
Tensor scale;
@@ -486,146 +486,6 @@ Status DQMatMulToMatMulNBitsAction::ProcessNewNode(Graph& graph,
return Status::OK();
}
-DQCastMatMulToMatMulNBitsAction::DQCastMatMulToMatMulNBitsAction(
- int64_t accuracy_level,
- concurrency::ThreadPool* intra_op_thread_pool)
- : accuracy_level_{accuracy_level},
- intra_op_thread_pool_{intra_op_thread_pool} {
- ORT_ENFORCE(accuracy_level_ >= 0 && accuracy_level_ <= 4, "MatMulNBits accuracy level must be between 0 and 4");
-}
-
-Status DQCastMatMulToMatMulNBitsAction::Run(Graph& graph, const NodesToOptimize& selected_nodes) const {
- // Selected nodes layout (from DQCastMatMulToMatMulNBitsSelector):
- // Input(0) = DQ node
- // Input(1) = Cast on input B (between DQ and MatMul)
- // Target() = MatMul node
- auto* dq_node = selected_nodes.Input(0);
- auto* cast_b_node = selected_nodes.Input(1);
- auto& matmul_node = selected_nodes.Target();
-
- // --- Transpose DQ weights/scales/zp via shared helper ---
- TransposedQuantizedTensors transposed;
- ORT_RETURN_IF_ERROR(TransposeDQWeightsForMatMulNBits(
- graph, *dq_node, "fused_DQ_Cast_MatMul", intra_op_thread_pool_, transposed));
-
- // MatMulNBits operates in the DQ scale dtype.
- // Always insert Cast on input A (to DQ dtype) and Cast on output (DQ dtype to MatMul output dtype).
- // ORT's redundant cast elimination optimizer will clean up unnecessary casts later.
-
- // Determine DQ output element type (e.g., fp16)
- int32_t dq_output_dtype = cast_b_node->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
- // Determine MatMul output element type (e.g., fp32)
- int32_t matmul_output_dtype = matmul_node.OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
-
- const auto& dq_attrs = dq_node->GetAttributes();
- const auto* weight_arg = dq_node->InputDefs()[0];
- auto K = weight_arg->Shape()->dim(0).dim_value();
- auto N = weight_arg->Shape()->dim(1).dim_value();
- auto block_size = dq_attrs.at("block_size").i();
- int32_t dt_weight = weight_arg->TypeAsProto()->tensor_type().elem_type();
- auto bits = DQWeightBits(dt_weight);
-
- // --- Create fp16 NodeArg for MatMulNBits input A ---
- NodeArg* matmul_input_a = matmul_node.MutableInputDefs()[0];
- ONNX_NAMESPACE::TypeProto input_a_fp16_type;
- input_a_fp16_type.mutable_tensor_type()->set_elem_type(dq_output_dtype);
- if (matmul_input_a->Shape()) {
- *input_a_fp16_type.mutable_tensor_type()->mutable_shape() =
- matmul_input_a->TypeAsProto()->tensor_type().shape();
- }
- auto cast_a_out_name = graph.GenerateNodeArgName(matmul_node.Name() + "_input_a_cast");
- NodeArg* input_a_arg = &graph.GetOrCreateNodeArg(cast_a_out_name, &input_a_fp16_type);
-
- // --- Create fp16 NodeArg for MatMulNBits output ---
- ONNX_NAMESPACE::TypeProto output_fp16_type;
- output_fp16_type.mutable_tensor_type()->set_elem_type(dq_output_dtype);
- if (matmul_node.OutputDefs()[0]->Shape()) {
- *output_fp16_type.mutable_tensor_type()->mutable_shape() =
- matmul_node.OutputDefs()[0]->TypeAsProto()->tensor_type().shape();
- }
- auto mnb_out_name = graph.GenerateNodeArgName(matmul_node.Name() + "_matmulnbits_out");
- NodeArg* mnb_output_arg = &graph.GetOrCreateNodeArg(mnb_out_name, &output_fp16_type);
-
- // --- Create MatMulNBits node ---
- NodeAttributes attrs;
- utils::SetNodeAttribute(utils::MakeAttribute("K", K), attrs);
- utils::SetNodeAttribute(utils::MakeAttribute("N", N), attrs);
- utils::SetNodeAttribute(utils::MakeAttribute("bits", bits), attrs);
- utils::SetNodeAttribute(utils::MakeAttribute("block_size", block_size), attrs);
- utils::SetNodeAttribute(utils::MakeAttribute("accuracy_level", accuracy_level_), attrs);
-
- auto& new_node = graph.AddNode(
- graph.GenerateNodeName(matmul_node.Name() + "_MatMulNBits"),
- "MatMulNBits",
- "Fused DQ+Cast+MatMul to MatMulNBits",
- {input_a_arg},
- {mnb_output_arg},
- &attrs,
- kMSDomain);
-
- const auto& target_provider = matmul_node.GetExecutionProviderType();
- new_node.SetExecutionProviderType(target_provider.empty() ? kCpuExecutionProvider : target_provider);
-
- // Add transposed weight, scale, zp to inputs
- auto& input_defs = new_node.MutableInputDefs();
- input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, transposed.weight_proto, std::move(transposed.weight)));
- new_node.MutableInputArgsCount().push_back(1);
-
- input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, transposed.scale_proto, std::move(transposed.scale)));
- new_node.MutableInputArgsCount().push_back(1);
-
- if (transposed.zero_point_proto) {
- input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, *transposed.zero_point_proto, std::move(*transposed.zero_point)));
- new_node.MutableInputArgsCount().push_back(1);
- }
-
- // --- Insert Cast on input A: matmul_input_dtype -> dq_output_dtype ---
- {
- NodeAttributes cast_attrs;
- utils::SetNodeAttribute(
- utils::MakeAttribute("to", static_cast<int64_t>(dq_output_dtype)),
- cast_attrs);
- auto& cast_node = graph.AddNode(
- graph.GenerateNodeName(matmul_node.Name() + "_Cast_input_a"),
- "Cast", "",
- {matmul_input_a},
- {input_a_arg},
- &cast_attrs,
- kOnnxDomain);
- cast_node.SetExecutionProviderType(new_node.GetExecutionProviderType());
- }
-
- // --- Insert Cast on output: dq_output_dtype -> matmul_output_dtype ---
- {
- NodeAttributes cast_attrs;
- utils::SetNodeAttribute(
- utils::MakeAttribute("to", static_cast<int64_t>(matmul_output_dtype)),
- cast_attrs);
- auto& cast_node = graph.AddNode(
- graph.GenerateNodeName(matmul_node.Name() + "_Cast_output"),
- "Cast", "",
- {mnb_output_arg},
- {const_cast<NodeArg*>(matmul_node.OutputDefs()[0])},
- &cast_attrs,
- kOnnxDomain);
- cast_node.SetExecutionProviderType(new_node.GetExecutionProviderType());
- }
-
- // --- Remove original nodes ---
- auto remove_node = [&graph](Node* node) {
- if (node) {
- graph_utils::RemoveNodeOutputEdges(graph, *node);
- graph.RemoveNode(node->Index());
- }
- };
-
- remove_node(&matmul_node);
- remove_node(cast_b_node);
- remove_node(dq_node);
-
- return Status::OK();
-}
-
static std::vector<NodeAndMoveInfo> GetGemmMoveInfo(bool does_q_node_exist) {
NTO::NodeLocation dq_A{NTO::NodeType::kInput, 0};
NTO::NodeLocation dq_B{NTO::NodeType::kInput, 1};
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h
index e112959cc58da..02a8353707599 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h
@@ -107,20 +107,6 @@ struct DQMatMulToMatMulNBitsAction : public ReplaceWithNew {
concurrency::ThreadPool* intra_op_thread_pool_;
};
-// Used together with DQCastMatMulToMatMulNBitsSelector.
-// Handles DQ -> Cast(fp16->fp32) -> MatMul fusion to MatMulNBits,
-// including optional Cast on input A and output type alignment.
-struct DQCastMatMulToMatMulNBitsAction : public Action {
- DQCastMatMulToMatMulNBitsAction(int64_t accuracy_level,
- concurrency::ThreadPool* intra_op_thread_pool);
-
- Status Run(Graph&, const NodesToOptimize& selected_nodes) const override;
-
- private:
- int64_t accuracy_level_;
- concurrency::ThreadPool* intra_op_thread_pool_;
-};
-
struct GemmReplaceWithQuant : public Action {
GemmReplaceWithQuant();
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
index 0b04445692c9b..8cab6911646f2 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
@@ -7,6 +7,7 @@
#include
#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h"
+
#include "core/mlas/inc/mlas.h"
#include "core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h"
@@ -306,7 +307,12 @@ void DQMatMulToMatMulNBitsRules(SelectorActionRegistry& qdq_selector_action_regi
intra_op_thread_pool);
#if !defined(ORT_MINIMAL_BUILD)
- std::vector<const char*> providers = {kCpuExecutionProvider, kCudaExecutionProvider, kDmlExecutionProvider};
+ // Include "" (empty string) to match nodes not yet assigned to an EP.
+ // For FP16 models on CPU EP, FP16 MatMul nodes are not claimed during partitioning
+ // (no FP16 MatMul kernel on CPU), leaving their EP unassigned. The DQ->MatMul fusion
+ // should still apply; the action assigns kCpuExecutionProvider to the resulting
+ // MatMulNBits node (which has both float and float16 CPU kernels).
+ std::vector<const char*> providers = {kCpuExecutionProvider, kCudaExecutionProvider, kDmlExecutionProvider, ""};
std::unique_ptr<NodeSelector> selector = std::make_unique<DQMatMulToMatMulNBitsSelector>(providers);
qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
{{"MatMul", {}}},
@@ -316,25 +322,6 @@ void DQMatMulToMatMulNBitsRules(SelectorActionRegistry& qdq_selector_action_regi
#else
qdq_selector_action_registry.RegisterAction(action_name, std::move(action));
#endif
-
- // DQ -> Cast(fp16->fp32) -> MatMul pattern.
- // Handles FP16 models where Cast nodes are inserted between DQ and MatMul.
- const std::string cast_action_name{"DQCastMatMulToMatMulNBits"};
-
- std::unique_ptr<Action> cast_action =
- std::make_unique<DQCastMatMulToMatMulNBitsAction>(qdq_matmulnbits_accuracy_level,
- intra_op_thread_pool);
-
-#if !defined(ORT_MINIMAL_BUILD)
- std::unique_ptr<NodeSelector> cast_selector =
- std::make_unique<DQCastMatMulToMatMulNBitsSelector>(providers);
- qdq_selector_action_registry.RegisterSelectorAndAction(cast_action_name,
- {{"MatMul", {}}},
- std::move(cast_selector),
- std::move(cast_action));
-#else
- qdq_selector_action_registry.RegisterAction(cast_action_name, std::move(cast_action));
-#endif
}
void GemmQDQRules(SelectorActionRegistry& qdq_selector_action_registry) {
@@ -416,7 +403,9 @@ QDQSelectorActionTransformer::QDQSelectorActionTransformer(
apply_context,
// this transformer is compatible with CPU, DML, ACL and CUDA EP.
// There is further EP control on the rule level.
- {kCpuExecutionProvider, kDmlExecutionProvider, kAclExecutionProvider, kCudaExecutionProvider}} {
+ // Also accept nodes with empty EP (unassigned) so that individual selectors
+ // that include "" in their compatible providers can match unassigned nodes.
+ {kCpuExecutionProvider, kDmlExecutionProvider, kAclExecutionProvider, kCudaExecutionProvider, ""}} {
}
} // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc
index c39dfeb082e35..8a00fe11ff3fd 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc
@@ -651,75 +651,6 @@ bool DQMatMulNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Nod
return ValidateBlockwiseDQForMatMulNBits(graph, *dq_nodes[0]);
}
-std::optional<NodesToOptimizeIndices>
-DQCastMatMulToMatMulNBitsSelector::Select(const GraphViewer& graph_viewer, const Node& node) const {
- // Check EP compatibility
- const std::string_view node_ep = node.GetExecutionProviderType();
- if (!compatible_providers_.empty() &&
- std::find(compatible_providers_.begin(), compatible_providers_.end(), node_ep) == compatible_providers_.end()) {
- return std::nullopt;
- }
-
- const auto& graph = graph_viewer.GetGraph();
-
- // node must be MatMul
- if (node.OpType() != "MatMul") {
- return std::nullopt;
- }
-
- if (node.InputDefs().size() < 2) {
- return std::nullopt;
- }
-
- // Check input B: must be Cast(fp16->fp32)
- const Node* cast_b = graph_viewer.GetProducerNode(node.InputDefs()[1]->Name());
- if (!cast_b || cast_b->OpType() != "Cast") {
- return std::nullopt;
- }
-
- const auto& cast_b_attrs = cast_b->GetAttributes();
- auto to_iter = cast_b_attrs.find("to");
- if (to_iter == cast_b_attrs.end() ||
- to_iter->second.i() != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) {
- return std::nullopt;
- }
-
- // Cast B input must be fp16
- if (!cast_b->InputDefs()[0]->TypeAsProto() ||
- cast_b->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type() !=
- ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16) {
- return std::nullopt;
- }
-
- // Cast B must have exactly 1 output edge (to MatMul) and not be a graph output
- if (!optimizer_utils::CheckOutputEdges(graph, *cast_b, 1)) {
- return std::nullopt;
- }
-
- // Cast B's input must come from a DQ node
- const Node* dq_node = graph_viewer.GetProducerNode(cast_b->InputDefs()[0]->Name());
- if (!dq_node || dq_node->OpType() != QDQ::DQOpName) {
- return std::nullopt;
- }
-
- // DQ must have exactly 1 output edge (to Cast B) and not be a graph output
- if (!optimizer_utils::CheckOutputEdges(graph, *dq_node, 1)) {
- return std::nullopt;
- }
-
- if (!ValidateBlockwiseDQForMatMulNBits(graph, *dq_node)) {
- return std::nullopt;
- }
-
- // Build selection
- NodesToOptimizeIndicesBuilder builder;
- builder.input_nodes.push_back(dq_node->Index());
- builder.input_nodes.push_back(cast_b->Index());
- builder.target_node = node.Index();
-
- return builder.Build();
-}
-
bool GemmNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& node, const Node* redundant_clip_node,
const std::vector<const Node*>& dq_nodes,
const std::vector<const Node*>& q_nodes) const {
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h
index 5c10668733785..79c374b301442 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h
@@ -461,27 +461,6 @@ class DQMatMulToMatMulNBitsSelector : public BaseSelector {
: BaseSelector(std::make_unique<DQMatMulNodeGroupSelector>(), compatible_providers) {}
};
-// Convert "DQ -> Cast(fp16->fp32) -> MatMul" to "MatMulNBits".
-// Handles Cast(fp16->fp32) between DQ and MatMul on input B, and optionally on input A.
-// Selection layout:
-// input_nodes[0] = DQ node
-// input_nodes[1] = Cast on input B (between DQ and MatMul)
-// target_node = MatMul
-// output_nodes = {}
-class DQCastMatMulToMatMulNBitsSelector : public NodeSelector {
- public:
- explicit DQCastMatMulToMatMulNBitsSelector(gsl::span<const char* const> compatible_providers = {})
- : compatible_providers_(compatible_providers.begin(), compatible_providers.end()) {}
-
- DQCastMatMulToMatMulNBitsSelector(DQCastMatMulToMatMulNBitsSelector&& rhs) noexcept
- : compatible_providers_(std::move(rhs.compatible_providers_)) {}
-
- std::optional<NodesToOptimizeIndices> Select(const GraphViewer& graph_viewer, const Node& node) const override;
-
- private:
- std::vector compatible_providers_;
-};
-
// Input: DQ nodes for A, B and optional C
// Output: optional Q node for Y
class GemmSelector : public BaseSelector {
diff --git a/onnxruntime/core/providers/cpu/math/cumsum.cc b/onnxruntime/core/providers/cpu/math/cumsum.cc
index 8321b81021d19..14ea6712f7f46 100644
--- a/onnxruntime/core/providers/cpu/math/cumsum.cc
+++ b/onnxruntime/core/providers/cpu/math/cumsum.cc
@@ -13,29 +13,6 @@ using namespace onnxruntime;
namespace onnxruntime {
-namespace cumsum_op {
-Status GetAxis(const Tensor* axis_tensor, int64_t input_rank, int64_t& axis_out) {
- if (!axis_tensor)
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor must be provided to the CumSum op");
-
- if (axis_tensor->Shape().NumDimensions() > 1)
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor should be 0D or 1D");
-
- if (axis_tensor->IsDataType<int32_t>()) {
- axis_out = static_cast<int64_t>(axis_tensor->Data<int32_t>()[0]);
- } else if (axis_tensor->IsDataType<int64_t>()) {
- axis_out = axis_tensor->Data<int64_t>()[0];
- } else {
- return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor should be of type `int32_t` or `int64_t`");
- }
-
- axis_out = HandleNegativeAxis(axis_out, input_rank);
-
- return Status::OK();
-}
-
-} // namespace cumsum_op
-
ONNX_CPU_OPERATOR_VERSIONED_TYPED_KERNEL(
CumSum,
11,
diff --git a/onnxruntime/core/providers/cpu/math/cumsum.h b/onnxruntime/core/providers/cpu/math/cumsum.h
index fa1c1ceb0df10..b7443ada40861 100644
--- a/onnxruntime/core/providers/cpu/math/cumsum.h
+++ b/onnxruntime/core/providers/cpu/math/cumsum.h
@@ -1,11 +1,17 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
+#pragma once
#include "core/common/common.h"
+#include "core/providers/common.h"
+
+#ifndef SHARED_PROVIDER
#include "core/framework/op_kernel.h"
+#endif
namespace onnxruntime {
+#ifndef SHARED_PROVIDER
template <typename T>
class CumSum final : public OpKernel {
public:
@@ -17,10 +23,33 @@ class CumSum final : public OpKernel {
int64_t exclusive_;
int64_t reverse_;
};
+#endif
namespace cumsum_op {
+#ifdef SHARED_PROVIDER
Status GetAxis(const Tensor* axis_tensor, int64_t input_rank, int64_t& axis_out);
+#else
+inline Status GetAxis(const Tensor* axis_tensor, int64_t input_rank, int64_t& axis_out) {
+ if (!axis_tensor)
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor must be provided to the CumSum op");
+
+ if (axis_tensor->Shape().NumDimensions() > 1)
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor should be 0D or 1D");
+
+ if (axis_tensor->IsDataType<int32_t>()) {
+ axis_out = static_cast<int64_t>(axis_tensor->Data<int32_t>()[0]);
+ } else if (axis_tensor->IsDataType<int64_t>()) {
+ axis_out = axis_tensor->Data<int64_t>()[0];
+ } else {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Axis tensor should be of type `int32_t` or `int64_t`");
+ }
+
+ axis_out = HandleNegativeAxis(axis_out, input_rank);
+
+ return Status::OK();
+}
+#endif // SHARED_PROVIDER
} // namespace cumsum_op
} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.cc b/onnxruntime/core/providers/cpu/object_detection/roialign.cc
index 87958a9f7e2dd..0680be3aea49c 100644
--- a/onnxruntime/core/providers/cpu/object_detection/roialign.cc
+++ b/onnxruntime/core/providers/cpu/object_detection/roialign.cc
@@ -258,76 +258,6 @@ void RoiAlignForward(const TensorShape& output_shape, const T* bottom_data, floa
}
} // namespace
-Status CheckROIAlignValidInput(const Tensor* X_ptr, const Tensor* rois_ptr, const Tensor* batch_indices_ptr) {
- constexpr int64_t EXPECTED_NUM_ROI_DIMS = 2;
- constexpr int64_t EXPECTED_SECOND_ROI_DIM = 4;
- if (!X_ptr) {
- return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Null input X ptr");
- }
- if (!rois_ptr) {
- return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Null rois_ptr");
- }
- if (!batch_indices_ptr) {
- return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Null batch_indices_ptr");
- }
-
- const auto& rois_dims = rois_ptr->Shape();
- const auto& batch_indices_dims = batch_indices_ptr->Shape();
-
- if (batch_indices_dims.NumDimensions() != 1) {
- return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
- "Number of dimensions for batch indices should be exactly 1");
- }
-
- // validate rois_dims
- if (rois_dims.NumDimensions() != EXPECTED_NUM_ROI_DIMS) {
- return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
- "Number of dimensions for rois should be exactly " + std::to_string(EXPECTED_NUM_ROI_DIMS));
- }
- if (rois_dims[1] != EXPECTED_SECOND_ROI_DIM) {
- return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
- "Second dimension for rois should be exactly " + std::to_string(EXPECTED_SECOND_ROI_DIM));
- }
-
- // first dimension of batch_indices and rois should match
- if (batch_indices_dims[0] != rois_dims[0]) {
- return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
- "First dimension (num_rois) of batch_indices and rois don't match");
- }
-
- if (batch_indices_ptr->Location().device.Type() == OrtDevice::CPU) {
- // Validate batch_indices values are within [0, batch_size) when the tensor
- // data is accessible from the host (CPU).
- const int64_t batch_size = X_ptr->Shape()[0];
- const int64_t num_rois = batch_indices_dims[0];
-
- auto check_bounds = [batch_size, num_rois](const auto* batch_indices_data) -> Status {
- for (int64_t i = 0; i < num_rois; ++i) {
- if (batch_indices_data[i] < 0 || batch_indices_data[i] >= batch_size) {
- return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
- "batch_indices value " + std::to_string(batch_indices_data[i]) +
- " at index " + std::to_string(i) +
- " is out of range [0, " + std::to_string(batch_size) + ")");
- }
- }
- return Status::OK();
- };
-
- if (batch_indices_ptr->IsDataType()) {
- auto status = check_bounds(batch_indices_ptr->Data());
- if (!status.IsOK()) return status;
- } else if (batch_indices_ptr->IsDataType()) {
- auto status = check_bounds(batch_indices_ptr->Data());
- if (!status.IsOK()) return status;
- } else {
- return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
- "batch_indices must be of type int64_t or int32_t");
- }
- }
-
- return Status::OK();
-}
-
template
Status RoiAlign::Compute(OpKernelContext* context) const {
const auto* X_ptr = context->Input(0);
diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.h b/onnxruntime/core/providers/cpu/object_detection/roialign.h
index 1bb8bd34c5cb2..bb97de158369b 100644
--- a/onnxruntime/core/providers/cpu/object_detection/roialign.h
+++ b/onnxruntime/core/providers/cpu/object_detection/roialign.h
@@ -3,12 +3,86 @@
#pragma once
-#include "core/framework/op_kernel.h"
+#include
#include
+#include
+
+#include "core/common/common.h"
+#ifndef SHARED_PROVIDER
+#include "core/framework/op_kernel.h"
+#endif
namespace onnxruntime {
+#ifdef SHARED_PROVIDER
Status CheckROIAlignValidInput(const Tensor* X_ptr, const Tensor* rois_ptr, const Tensor* batch_indices_ptr);
+#else
+inline Status CheckROIAlignValidInput(const Tensor* X_ptr, const Tensor* rois_ptr, const Tensor* batch_indices_ptr) {
+ constexpr int64_t EXPECTED_NUM_ROI_DIMS = 2;
+ constexpr int64_t EXPECTED_SECOND_ROI_DIM = 4;
+ if (!X_ptr) {
+ return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Null input X ptr");
+ }
+ if (!rois_ptr) {
+ return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Null rois_ptr");
+ }
+ if (!batch_indices_ptr) {
+ return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Null batch_indices_ptr");
+ }
+
+ const auto& rois_dims = rois_ptr->Shape();
+ const auto& batch_indices_dims = batch_indices_ptr->Shape();
+
+ if (batch_indices_dims.NumDimensions() != 1) {
+ return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
+ "Number of dimensions for batch indices should be exactly 1");
+ }
+
+ if (rois_dims.NumDimensions() != EXPECTED_NUM_ROI_DIMS) {
+ return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
+ "Number of dimensions for rois should be exactly " + std::to_string(EXPECTED_NUM_ROI_DIMS));
+ }
+ if (rois_dims[1] != EXPECTED_SECOND_ROI_DIM) {
+ return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
+ "Second dimension for rois should be exactly " + std::to_string(EXPECTED_SECOND_ROI_DIM));
+ }
+
+ if (batch_indices_dims[0] != rois_dims[0]) {
+ return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
+ "First dimension (num_rois) of batch_indices and rois don't match");
+ }
+
+ if (batch_indices_ptr->Location().device.Type() == OrtDevice::CPU) {
+ const int64_t batch_size = X_ptr->Shape()[0];
+ const int64_t num_rois = batch_indices_dims[0];
+
+ auto check_bounds = [batch_size, num_rois](const auto* batch_indices_data) -> Status {
+ for (int64_t i = 0; i < num_rois; ++i) {
+ if (batch_indices_data[i] < 0 || batch_indices_data[i] >= batch_size) {
+ return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
+ "batch_indices value " + std::to_string(batch_indices_data[i]) +
+ " at index " + std::to_string(i) +
+ " is out of range [0, " + std::to_string(batch_size) + ")");
+ }
+ }
+ return Status::OK();
+ };
+
+ if (batch_indices_ptr->IsDataType()) {
+ auto status = check_bounds(batch_indices_ptr->Data());
+ if (!status.IsOK()) return status;
+ } else if (batch_indices_ptr->IsDataType()) {
+ auto status = check_bounds(batch_indices_ptr->Data());
+ if (!status.IsOK()) return status;
+ } else {
+ return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
+ "batch_indices must be of type int64_t or int32_t");
+ }
+ }
+
+ return Status::OK();
+}
+#endif
enum struct RoiAlignMode {
avg = 0,
@@ -17,10 +91,10 @@ enum struct RoiAlignMode {
class RoiAlignBase {
public:
- explicit RoiAlignBase(const OpKernelInfo& info) {
- // mode
+ template
+ explicit RoiAlignBase(const TKernelInfo& info) {
std::string mode;
- if (info.GetAttr("mode", &mode).IsOK()) {
+ if (info.template GetAttr("mode", &mode).IsOK()) {
std::transform(mode.begin(), mode.end(), mode.begin(), [](char i) { return static_cast(::tolower(i)); });
if (mode == "avg") {
mode_ = RoiAlignMode::avg;
@@ -31,41 +105,33 @@ class RoiAlignBase {
}
}
- // output_height
int64_t output_height_tmp;
- if (info.GetAttr("output_height", &output_height_tmp).IsOK()) {
+ if (info.template GetAttr("output_height", &output_height_tmp).IsOK()) {
output_height_ = output_height_tmp;
}
- // output_width
int64_t output_width_tmp;
- if (info.GetAttr("output_width", &output_width_tmp).IsOK()) {
+ if (info.template GetAttr("output_width", &output_width_tmp).IsOK()) {
output_width_ = output_width_tmp;
}
- // sampling_ratio
int64_t sampling_ratio_tmp;
- if (info.GetAttr("sampling_ratio", &sampling_ratio_tmp).IsOK()) {
+ if (info.template GetAttr("sampling_ratio", &sampling_ratio_tmp).IsOK()) {
sampling_ratio_ = sampling_ratio_tmp;
ORT_ENFORCE(sampling_ratio_ >= 0, "Sampling ratio should be >=0, but it was ", sampling_ratio_);
}
- // spatial_scale
float spatial_scale_tmp;
- if (info.GetAttr("spatial_scale", &spatial_scale_tmp).IsOK()) {
+ if (info.template GetAttr("spatial_scale", &spatial_scale_tmp).IsOK()) {
spatial_scale_ = spatial_scale_tmp;
}
std::string coordinate_transformation_mode;
- if (info.GetAttr("coordinate_transformation_mode", &coordinate_transformation_mode).IsOK()) {
- if (coordinate_transformation_mode == "half_pixel")
- half_pixel_ = true;
- else
- half_pixel_ = false;
+ if (info.template GetAttr("coordinate_transformation_mode", &coordinate_transformation_mode).IsOK()) {
+ half_pixel_ = coordinate_transformation_mode == "half_pixel";
}
if (mode_ == RoiAlignMode::max && sampling_ratio_ != 1) {
- // TODO(fdwr): Issue #6146. ORT 1.13 will correct the incorrect summation of max mode with PR #7354.
LOGS_DEFAULT(WARNING) << "The existing summation for max mode and sampling ratios besides 1 is incorrect "
<< "and will be fixed in the next ORT 1.13 release. Thus the results of RoiAlign "
<< "will be different.";
diff --git a/onnxruntime/core/providers/cpu/tensor/concat.cc b/onnxruntime/core/providers/cpu/tensor/concat.cc
index e3d5c0600420f..98d61ed1d3127 100644
--- a/onnxruntime/core/providers/cpu/tensor/concat.cc
+++ b/onnxruntime/core/providers/cpu/tensor/concat.cc
@@ -49,14 +49,6 @@ using EnabledDataTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(kCpuExec
Concat, Input, 0);
} // namespace
-// this method will be shared between 'Concat' (CPU and GPU) and
-// 'ConcatFromSequence' ('concat' and 'stack' modes) to validate inputs
-Status ConcatBase::PrepareForCompute(OpKernelContext* ctx,
- const InlinedTensorsVector& input_tensors,
- Prepare& p) const {
- return PrepareForComputeImpl(ctx, input_tensors, p);
-}
-
namespace {
TensorShapeVector StridesForStack(const TensorShapeVector& full_strides, uint64_t axis) {
// if we are stacking, skip the dimension that will be stacked along in the output strides
diff --git a/onnxruntime/core/providers/cpu/tensor/concatbase.h b/onnxruntime/core/providers/cpu/tensor/concatbase.h
index b9085b2a9318b..df2eb78c61180 100644
--- a/onnxruntime/core/providers/cpu/tensor/concatbase.h
+++ b/onnxruntime/core/providers/cpu/tensor/concatbase.h
@@ -209,8 +209,16 @@ class ConcatBase {
return Status::OK();
}
+#ifdef SHARED_PROVIDER
Status PrepareForCompute(OpKernelContext* ctx, const InlinedTensorsVector& input_tensors,
Prepare& p) const;
+#else
+ template
+ inline Status PrepareForCompute(KernelContextType* ctx, const InlinedTensorsVector& input_tensors,
+ Prepare& p) const {
+ return PrepareForComputeImpl(ctx, input_tensors, p);
+ }
+#endif
protected:
template
diff --git a/onnxruntime/core/providers/cpu/tensor/gather.cc b/onnxruntime/core/providers/cpu/tensor/gather.cc
index f171b33ee5f4f..3b3c67e7d818b 100644
--- a/onnxruntime/core/providers/cpu/tensor/gather.cc
+++ b/onnxruntime/core/providers/cpu/tensor/gather.cc
@@ -56,10 +56,6 @@ ONNX_CPU_OPERATOR_KERNEL(
.TypeConstraint("Tind", BuildKernelDefConstraintsFromTypeList()),
Gather);
-Status GatherBase::PrepareForCompute(OpKernelContext* context, Prepare& p) const {
- return PrepareForComputeImpl(context, p);
-}
-
template
Status GatherCopyData(const Tensor* indices_tensor, const uint8_t* src_base, uint8_t* dst_base, bool is_string_type,
const size_t element_bytes, const int64_t block_size, const int64_t M,
diff --git a/onnxruntime/core/providers/cpu/tensor/gatherbase.h b/onnxruntime/core/providers/cpu/tensor/gatherbase.h
index 1f5e85c554a78..fc29c04290883 100644
--- a/onnxruntime/core/providers/cpu/tensor/gatherbase.h
+++ b/onnxruntime/core/providers/cpu/tensor/gatherbase.h
@@ -46,7 +46,14 @@ class GatherBase {
return Status::OK();
}
+#ifdef SHARED_PROVIDER
Status PrepareForCompute(OpKernelContext* context, Prepare& p) const;
+#else
+ template
+ inline Status PrepareForCompute(KernelContextType* context, Prepare& p) const {
+ return PrepareForComputeImpl(context, p);
+ }
+#endif
protected:
template
diff --git a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h
index 3218c8952d6ec..14a22fa7be0af 100644
--- a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h
+++ b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h
@@ -9,8 +9,9 @@ namespace onnxruntime {
class SpaceDepthBase {
protected:
- explicit SpaceDepthBase(const OpKernelInfo& info) {
- ORT_ENFORCE(info.GetAttr("blocksize", &blocksize_).IsOK(),
+ template
+ explicit SpaceDepthBase(const KernelInfoType& info) {
+ ORT_ENFORCE(info.template GetAttr("blocksize", &blocksize_).IsOK(),
"Attribute blocksize is not set.");
}
diff --git a/onnxruntime/core/providers/cpu/tensor/unsqueeze.cc b/onnxruntime/core/providers/cpu/tensor/unsqueeze.cc
index 1b6ee02061d34..5fdb57b1a5e35 100644
--- a/onnxruntime/core/providers/cpu/tensor/unsqueeze.cc
+++ b/onnxruntime/core/providers/cpu/tensor/unsqueeze.cc
@@ -77,57 +77,6 @@ ONNX_CPU_OPERATOR_KERNEL(
.TypeConstraint("T", DataTypeImpl::AllTensorTypes()),
Unsqueeze);
-Status UnsqueezeBase::PrepareCompute(OpKernelContext* ctx, Prepare& p) const {
- const auto* X = ctx->Input(0);
- ORT_ENFORCE(X != nullptr);
- auto& input_tensor = *X;
-
- TensorShapeVector axes;
- size_t num_inputs = ctx->InputCount();
- if (num_inputs == 2) { // axes is an input
- const Tensor* axes_tensor = ctx->Input(1);
- ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
- ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 0 ||
- axes_tensor->Shape().NumDimensions() == 1,
- "An axes tensor must be a scalar or a 1-D tensor.");
- auto data_span = axes_tensor->template DataAsSpan();
- axes.assign(data_span.begin(), data_span.end());
- } else {
- axes.assign(axes_.begin(), axes_.end());
- }
-
- // New dimension count is the current dimensions + the number of entries in axes
- // Initialize output_dims to 0 in each axis initially
- TensorShapeVector output_dims(axes.size() + input_tensor.Shape().NumDimensions(), 0);
-
- // Set all axes indices to 1 in output_dims and check for duplicates
- for (int64_t axis : axes) {
- // Valid axis range is [0, output_rank - 1]
- axis = HandleNegativeAxis(axis, onnxruntime::narrow(output_dims.size()));
- if (axis < 0 || axis >= static_cast(output_dims.size()))
- return Status(ONNXRUNTIME, INVALID_ARGUMENT, "'axes' has an out of range axis");
- if (output_dims[onnxruntime::narrow(axis)] != 0)
- return Status(ONNXRUNTIME, INVALID_ARGUMENT, "'axes' has a duplicate axis");
- output_dims[onnxruntime::narrow(axis)] = 1;
- }
-
- // Now fill in the zero entries with the existing shape
- {
- auto begin = input_tensor.Shape().GetDims().begin();
- for (auto& axisSize : output_dims) {
- if (axisSize == 0)
- axisSize = *begin++;
- }
- assert(begin == input_tensor.Shape().GetDims().end());
- }
-
- TensorShape output_shape(output_dims);
- p.output_tensor = ctx->Output(0, output_shape);
- ORT_ENFORCE(nullptr != p.output_tensor);
- p.input_tensor = &input_tensor;
- return Status::OK();
-}
-
Status Unsqueeze::Compute(OpKernelContext* ctx) const {
Prepare p;
ORT_RETURN_IF_ERROR(PrepareCompute(ctx, p));
diff --git a/onnxruntime/core/providers/cpu/tensor/unsqueeze.h b/onnxruntime/core/providers/cpu/tensor/unsqueeze.h
index 5a8a318923da5..09a77c113e022 100644
--- a/onnxruntime/core/providers/cpu/tensor/unsqueeze.h
+++ b/onnxruntime/core/providers/cpu/tensor/unsqueeze.h
@@ -19,7 +19,57 @@ class UnsqueezeBase {
Tensor* output_tensor = nullptr;
};
+#ifdef SHARED_PROVIDER
Status PrepareCompute(OpKernelContext* context, Prepare& p) const;
+#else
+ template
+ inline Status PrepareCompute(KernelContextType* ctx, Prepare& p) const {
+ const auto* X = ctx->template Input(0);
+ ORT_ENFORCE(X != nullptr);
+ auto& input_tensor = *X;
+
+ TensorShapeVector axes;
+ size_t num_inputs = ctx->InputCount();
+ if (num_inputs == 2) {
+ const Tensor* axes_tensor = ctx->template Input(1);
+ ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
+ ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 0 ||
+ axes_tensor->Shape().NumDimensions() == 1,
+ "An axes tensor must be a scalar or a 1-D tensor.");
+ auto data_span = axes_tensor->template DataAsSpan();
+ axes.assign(data_span.begin(), data_span.end());
+ } else {
+ axes.assign(axes_.begin(), axes_.end());
+ }
+
+ TensorShapeVector output_dims(axes.size() + input_tensor.Shape().NumDimensions(), 0);
+
+ for (int64_t axis : axes) {
+ axis = HandleNegativeAxis(axis, onnxruntime::narrow(output_dims.size()));
+ if (axis < 0 || axis >= static_cast(output_dims.size()))
+ return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "'axes' has an out of range axis");
+ if (output_dims[onnxruntime::narrow(axis)] != 0)
+ return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "'axes' has a duplicate axis");
+ output_dims[onnxruntime::narrow(axis)] = 1;
+ }
+
+ {
+ auto begin = input_tensor.Shape().GetDims().begin();
+ for (auto& axis_size : output_dims) {
+ if (axis_size == 0)
+ axis_size = *begin++;
+ }
+ assert(begin == input_tensor.Shape().GetDims().end());
+ }
+
+ TensorShape output_shape(output_dims);
+ p.output_tensor = ctx->Output(0, output_shape);
+ ORT_ENFORCE(nullptr != p.output_tensor);
+ p.input_tensor = &input_tensor;
+ return Status::OK();
+ }
+#endif
+
static TensorShapeVector ComputeOutputShape(
const TensorShape& input_shape,
const TensorShapeVector& axes) {
diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.cc b/onnxruntime/core/providers/cpu/tensor/upsample.cc
index b533f1b7dc80b..87ba56bc45dad 100644
--- a/onnxruntime/core/providers/cpu/tensor/upsample.cc
+++ b/onnxruntime/core/providers/cpu/tensor/upsample.cc
@@ -2,10 +2,6 @@
// Licensed under the MIT License.
#include "core/providers/cpu/tensor/upsample.h"
-
-#include
-
-#include "core/common/inlined_containers.h"
#include "core/common/safeint.h"
#include "core/platform/threadpool.h"
#include "core/providers/cpu/tensor/upsample_antialias.h"
@@ -35,46 +31,6 @@ REGISTER_VERSIONED_TYPED_KERNEL(int32_t, 9, 9);
REGISTER_VERSIONED_TYPED_KERNEL(int8_t, 9, 9);
REGISTER_VERSIONED_TYPED_KERNEL(uint8_t, 9, 9);
-void UpsampleBase::AdjustOutputSizeAsPolicy(TensorShapeVector& output_dims, gsl::span input_dims,
- InlinedVector& scales) const {
- // AspectRatioPolicy::STRETCH is default policy when opset < 18
- if (keep_aspect_ratio_policy_ == AspectRatioPolicy::STRETCH) {
- return;
- }
-
- InlinedHashSet axes_set(axes_.begin(), axes_.end());
-
- float scale_in_policy = 0.0f;
- if (keep_aspect_ratio_policy_ == AspectRatioPolicy ::NOT_LARGER) {
- scale_in_policy = std::numeric_limits::max();
-
- for (size_t i = 0; i < scales.size(); i++) {
- if (axes_set.empty() || axes_set.count(i) > 0) {
- scale_in_policy = std::min(scale_in_policy, scales[i]);
- }
- }
- } else if (keep_aspect_ratio_policy_ == AspectRatioPolicy ::NOT_SMALLER) {
- scale_in_policy = std::numeric_limits::min();
-
- for (size_t i = 0; i < scales.size(); i++) {
- if (axes_set.empty() || axes_set.count(i) > 0) {
- scale_in_policy = std::max(scale_in_policy, scales[i]);
- }
- }
- }
-
- for (size_t i = 0; i < scales.size(); i++) {
- // if axes is not specified (AKA axes_set.empty()), we apply the policy to all axes
- if (axes_set.empty() || axes_set.count(i) > 0) {
- scales[i] = scale_in_policy;
- output_dims[i] = static_cast(std::round(scales[i] * input_dims[i]));
- } else {
- scales[i] = 1.0f;
- output_dims[i] = input_dims[i];
- }
- }
-}
-
template
void UpsampleNearest2x(int64_t batch_size,
int64_t num_channels,
diff --git a/onnxruntime/core/providers/cpu/tensor/upsamplebase.h b/onnxruntime/core/providers/cpu/tensor/upsamplebase.h
index b0e309a70444f..7dcf88133e967 100644
--- a/onnxruntime/core/providers/cpu/tensor/upsamplebase.h
+++ b/onnxruntime/core/providers/cpu/tensor/upsamplebase.h
@@ -4,9 +4,12 @@
#pragma once
#include
+#include
+#include
#include
#include
#include
+#include
#include
#include
@@ -120,6 +123,49 @@ void PrintAntiAliasBuffers(std::ostream& os, gsl::span bounds, gsl::spa
os << std::endl;
}
+namespace upsamplebase_helper {
+
+inline void AdjustOutputSizeAsPolicy(TensorShapeVector& output_dims, gsl::span input_dims,
+ InlinedVector& scales, AspectRatioPolicy keep_aspect_ratio_policy,
+ const TensorShapeVector& axes) {
+ if (keep_aspect_ratio_policy == AspectRatioPolicy::STRETCH) {
+ return;
+ }
+
+ std::unordered_set axes_set(axes.begin(), axes.end());
+
+ float scale_in_policy = 0.0f;
+ if (keep_aspect_ratio_policy == AspectRatioPolicy::NOT_LARGER) {
+ scale_in_policy = std::numeric_limits::max();
+
+ for (size_t i = 0; i < scales.size(); ++i) {
+ if (axes_set.empty() || axes_set.count(static_cast(i)) > 0) {
+ scale_in_policy = std::min(scale_in_policy, scales[i]);
+ }
+ }
+ } else if (keep_aspect_ratio_policy == AspectRatioPolicy::NOT_SMALLER) {
+ scale_in_policy = std::numeric_limits::min();
+
+ for (size_t i = 0; i < scales.size(); ++i) {
+ if (axes_set.empty() || axes_set.count(static_cast(i)) > 0) {
+ scale_in_policy = std::max(scale_in_policy, scales[i]);
+ }
+ }
+ }
+
+ for (size_t i = 0; i < scales.size(); ++i) {
+ if (axes_set.empty() || axes_set.count(static_cast(i)) > 0) {
+ scales[i] = scale_in_policy;
+ output_dims[i] = static_cast(std::round(scales[i] * input_dims[i]));
+ } else {
+ scales[i] = 1.0f;
+ output_dims[i] = input_dims[i];
+ }
+ }
+}
+
+} // namespace upsamplebase_helper
+
class UpsampleBase {
public:
// Make this available in other EP via provider bridge
@@ -597,6 +643,13 @@ class UpsampleBase {
}
}; // UpsampleBase
+#ifndef SHARED_PROVIDER
+inline void UpsampleBase::AdjustOutputSizeAsPolicy(TensorShapeVector& output_dims, gsl::span input_dims,
+ InlinedVector& scales) const {
+ upsamplebase_helper::AdjustOutputSizeAsPolicy(output_dims, input_dims, scales, keep_aspect_ratio_policy_, axes_);
+}
+#endif
+
} // namespace onnxruntime
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 60ac16018f539..9cfc38c8f292f 100755
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -944,8 +944,11 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, int32_t, Resize);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, uint8_t, Resize);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, ReverseSequence);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, RoiAlign);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, RoiAlign);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 15, float, RoiAlign);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 15, double, RoiAlign);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 21, float, RoiAlign);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 21, double, RoiAlign);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 21, MLFloat16, RoiAlign);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, int32_t, Slice);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, int64_t, Slice);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, ThresholdedRelu);
@@ -1424,6 +1427,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 17, ScatterElements);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 17, ScatterND);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 19, float, GridSample);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 21, MLFloat16, RoiAlign);
// Opset 17
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, float, LayerNormalization);
@@ -1592,6 +1596,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, MLFloat16, HardSwish);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, BFloat16, HardSwish);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, float, GridSample);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, float, RoiAlign);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, double, RoiAlign);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, MLFloat16, RoiAlign);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, BFloat16, RoiAlign);
// Opset 23.
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, 23, float, Attention);
@@ -2027,8 +2035,11 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
- BuildKernelCreateInfo,
- BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
@@ -2676,6 +2687,10 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
// Opset 23
BuildKernelCreateInfo,
diff --git a/onnxruntime/core/providers/cuda/object_detection/roialign.cc b/onnxruntime/core/providers/cuda/object_detection/roialign.cc
index 71fb066c2898f..b4f63b3aa04f2 100644
--- a/onnxruntime/core/providers/cuda/object_detection/roialign.cc
+++ b/onnxruntime/core/providers/cuda/object_detection/roialign.cc
@@ -7,11 +7,37 @@
namespace onnxruntime {
namespace cuda {
-#define REGISTER_KERNEL_TYPED(T) \
- ONNX_OPERATOR_TYPED_KERNEL_EX( \
+#define ADD_VERSIONED_TYPED_ROIALIGN_OP_10(T) \
+ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
RoiAlign, \
kOnnxDomain, \
10, \
+ 15, \
+ T, \
+ kCudaExecutionProvider, \
+ (*KernelDefBuilder::Create()) \
+ .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \
+ .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \
+ RoiAlign);
+
+#define ADD_VERSIONED_TYPED_ROIALIGN_OP_16(T) \
+ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
+ RoiAlign, \
+ kOnnxDomain, \
+ 16, \
+ 21, \
+ T, \
+ kCudaExecutionProvider, \
+ (*KernelDefBuilder::Create()) \
+ .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \
+ .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \
+ RoiAlign);
+
+#define ADD_TYPED_ROIALIGN_OP_22(T) \
+ ONNX_OPERATOR_TYPED_KERNEL_EX( \
+ RoiAlign, \
+ kOnnxDomain, \
+ 22, \
T, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()) \
@@ -67,13 +93,19 @@ Status RoiAlign::ComputeInternal(OpKernelContext* context) const {
return Status::OK();
}
-#define SPECIALIZED_COMPUTE(T) \
- REGISTER_KERNEL_TYPED(T) \
+#define SPECIALIZED_COMPUTE(T) \
+ ADD_VERSIONED_TYPED_ROIALIGN_OP_10(T) \
+ ADD_VERSIONED_TYPED_ROIALIGN_OP_16(T) \
+ ADD_TYPED_ROIALIGN_OP_22(T) \
template Status RoiAlign::ComputeInternal(OpKernelContext* ctx) const;
SPECIALIZED_COMPUTE(float)
SPECIALIZED_COMPUTE(double)
-// SPECIALIZED_COMPUTE(MLFloat16)
+SPECIALIZED_COMPUTE(MLFloat16)
+
+// BFloat16 is available for RoiAlign op from version 22:
+ADD_TYPED_ROIALIGN_OP_22(BFloat16)
+template Status RoiAlign::ComputeInternal(OpKernelContext* ctx) const;
} // namespace cuda
}; // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu
index 7acfd9d075461..76f6f26fd8a02 100644
--- a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu
+++ b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu
@@ -17,12 +17,13 @@
#include "roialign_impl.h"
#include "core/providers/cuda/cu_inc/common.cuh"
+#include "core/providers/cuda/shared_inc/accumulation_type.h"
namespace onnxruntime {
namespace cuda {
template
-__device__ T bilinear_interpolate(
+__device__ AccumulationType_t bilinear_interpolate(
const T* bottom_data,
const int height,
const int width,
@@ -30,51 +31,61 @@ __device__ T bilinear_interpolate(
T x,
const bool is_mode_avg,
const int index /* index for debug only*/) {
+ using TAcc = AccumulationType_t;
+
+ TAcc y_acc = static_cast(y);
+ TAcc x_acc = static_cast(x);
+
// deal with cases that inverse elements are out of feature map boundary
- if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ if (y_acc < static_cast(-1.0f) || y_acc > static_cast(height) ||
+ x_acc < static_cast(-1.0f) || x_acc > static_cast(width)) {
// empty
- return 0;
+ return static_cast(0.0f);
}
- if (y <= 0) {
- y = 0;
+ if (y_acc <= static_cast(0.0f)) {
+ y_acc = static_cast(0.0f);
}
- if (x <= 0) {
- x = 0;
+ if (x_acc <= static_cast(0.0f)) {
+ x_acc = static_cast(0.0f);
}
- int y_low = (int)y;
- int x_low = (int)x;
+ int y_low = static_cast(y_acc);
+ int x_low = static_cast(x_acc);
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
- y = (T)y_low;
+ y_acc = static_cast(y_low);
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
- x = (T)x_low;
+ x_acc = static_cast(x_low);
} else {
x_high = x_low + 1;
}
- T ly = y - y_low;
- T lx = x - x_low;
- T hy = 1. - ly, hx = 1. - lx;
+ TAcc ly = y_acc - static_cast(y_low);
+ TAcc lx = x_acc - static_cast(x_low);
+ TAcc hy = static_cast(1.0f) - ly;
+ TAcc hx = static_cast(1.0f) - lx;
// do bilinear interpolation
- T v1 = bottom_data[y_low * width + x_low];
- T v2 = bottom_data[y_low * width + x_high];
- T v3 = bottom_data[y_high * width + x_low];
- T v4 = bottom_data[y_high * width + x_high];
- T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+ TAcc v1 = static_cast(bottom_data[y_low * width + x_low]);
+ TAcc v2 = static_cast(bottom_data[y_low * width + x_high]);
+ TAcc v3 = static_cast(bottom_data[y_high * width + x_low]);
+ TAcc v4 = static_cast(bottom_data[y_high * width + x_high]);
+ TAcc w1 = hy * hx;
+ TAcc w2 = hy * lx;
+ TAcc w3 = ly * hx;
+ TAcc w4 = ly * lx;
- T val = is_mode_avg
- ? (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4) // mode Avg
- : max(max(max(w1 * v1, w2 * v2), w3 * v3), w4 * v4); // mode Max
+ TAcc val = is_mode_avg
+ ? (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4) // mode Avg
+ : max(max(max(w1 * v1, w2 * v2), w3 * v3), w4 * v4); // mode Max
return val;
}
@@ -97,6 +108,8 @@ __global__ void RoIAlignForward(
const bool half_pixel,
const int64_t* batch_indices_ptr,
const int64_t batch_size) {
+ using TAcc = AccumulationType_t;
+
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
@@ -111,26 +124,27 @@ __global__ void RoIAlignForward(
// If the index is out of range, we set the output to 0 for this RoI element.
if (roi_batch_ind < 0 || roi_batch_ind >= batch_size) {
CUDA_KERNEL_ASSERT(false && "batch_indices values are out of range");
- top_data[index] = 0;
+ top_data[index] = static_cast(0.0f);
continue;
}
// Do not using rounding; this implementation detail is critical
- T roi_offset = half_pixel ? T(0.5) : T(0);
- T roi_start_w = offset_bottom_rois[0] * spatial_scale - roi_offset;
- T roi_start_h = offset_bottom_rois[1] * spatial_scale - roi_offset;
- T roi_end_w = offset_bottom_rois[2] * spatial_scale - roi_offset;
- T roi_end_h = offset_bottom_rois[3] * spatial_scale - roi_offset;
-
- T roi_width = roi_end_w - roi_start_w;
- T roi_height = roi_end_h - roi_start_h;
+ const TAcc spatial_scale_acc = static_cast(spatial_scale);
+ const TAcc roi_offset = half_pixel ? static_cast(0.5f) : static_cast