// Copyright (c) Microsoft Corporation. All rights reserved.
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include <ctime>
#ifndef USE_CUDA_MINIMAL
#include <cudnn.h>
#else
typedef void* cudnnHandle_t;
typedef void* cublasHandle_t;
typedef void* cudnnStatus_t;
#endif
#include "core/providers/nv_tensorrt_rtx/nv_includes.h"
#include "core/session/onnxruntime_run_options_config_keys.h"
#include <mutex>
#include "core/providers/cuda/cuda_graph.h"
#include "nv_execution_provider_info.h"
#include "core/providers/nv_tensorrt_rtx/nv_file_utils.h"

namespace onnxruntime {

/**
 * Adapter that forwards TensorRT log callbacks (nvinfer1::ILogger) to the ONNX Runtime default logger.
 *
 * A message is emitted only when its severity compares less than or equal to the configured verbosity.
 * Every emitted line is prefixed with a UTC timestamp and a fixed-width (7 character) severity tag.
 */
class TensorrtLogger : public nvinfer1::ILogger {
  nvinfer1::ILogger::Severity verbosity_;

 public:
  /** @param verbosity least severe level that is still forwarded (defaults to kWARNING). */
  TensorrtLogger(Severity verbosity = Severity::kWARNING)
      : verbosity_(verbosity) {}

  /**
   * Logging callback invoked by TensorRT.
   * @param severity severity reported by TensorRT for this message.
   * @param msg      message text; streamed to the logger as-is.
   */
  void log(Severity severity, const char* msg) noexcept override {
    if (severity > verbosity_) {
      return;  // filtered out by the configured verbosity
    }

    const time_t rawtime = std::time(nullptr);
    // Value-initialize so strftime never reads indeterminate fields if the gmtime call fails.
    struct tm stm = {};
#ifdef _MSC_VER
    gmtime_s(&stm, &rawtime);
#else
    gmtime_r(&rawtime, &stm);
#endif
    char buf[256];
    // strftime returns 0 when nothing usable was written; fall back to an empty timestamp in that case.
    if (strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &stm) == 0) {
      buf[0] = '\0';
    }

    const char* sevstr = "UNKNOWN";
    switch (severity) {
      case Severity::kINTERNAL_ERROR:
        sevstr = "    BUG";
        break;
      case Severity::kERROR:
        sevstr = "  ERROR";
        break;
      case Severity::kWARNING:
        sevstr = "WARNING";
        break;
      case Severity::kINFO:
        sevstr = "   INFO";
        break;
      case Severity::kVERBOSE:
        // Previously verbose messages were tagged "UNKNOWN" although kVERBOSE is a regular severity.
        sevstr = "VERBOSE";
        break;
      default:
        break;
    }

    // Errors (and anything more severe) go to the ERROR channel; every other forwarded message goes to WARNING.
    if (severity <= Severity::kERROR) {
      LOGS_DEFAULT(ERROR) << "[" << buf << " " << sevstr << "] " << msg;
    } else {
      LOGS_DEFAULT(WARNING) << "[" << buf << " " << sevstr << "] " << msg;
    }
  }

  /** Replaces the verbosity threshold used by log(). */
  void set_level(Severity verbosity) {
    verbosity_ = verbosity;
  }

  /** Returns the current verbosity threshold. */
  Severity get_level() const {
    return verbosity_;
  }
};

namespace tensorrt_ptr {
/*
 * custom deleter that will dump the optimized runtime cache when the execution context is destructed
 */
struct IExecutionContextDeleter {
  IExecutionContextDeleter() = default;
  IExecutionContextDeleter(const std::string& runtime_cache_path, std::unique_ptr<nvinfer1::IRuntimeCache>&& runtime_cache) : runtime_cache_path_(runtime_cache_path), runtime_cache_(std::move(runtime_cache)) {};
  void operator()(nvinfer1::IExecutionContext* context) {
    if (context != nullptr) {
      if (!runtime_cache_path_.empty()) {
        auto serialized_cache_data = std::unique_ptr<nvinfer1::IHostMemory>(runtime_cache_->serialize());
        file_utils::WriteFile(runtime_cache_path_, serialized_cache_data->data(), serialized_cache_data->size());
      }
      delete context;
    }
  }

 private:
  std::string runtime_cache_path_;
  std::unique_ptr<nvinfer1::IRuntimeCache> runtime_cache_;
};

/**
 * Stateless deleter that releases the pointed-to object with a plain `delete`.
 * Works for any object type; a null pointer is accepted and ignored.
 */
struct TensorrtInferDeleter {
  template <typename T>
  void operator()(T* obj) const {
    // `delete` applied to a null pointer does nothing, so no separate null check is required.
    delete obj;
  }
};

// Owning pointer for objects that are released with a plain `delete` (uses TensorrtInferDeleter).
template <typename T>
using unique_pointer = std::unique_ptr<T, TensorrtInferDeleter>;
// Owning pointer for an execution context; uses IExecutionContextDeleter as its deleter.
using unique_pointer_exec_ctx = std::unique_ptr<nvinfer1::IExecutionContext, IExecutionContextDeleter>;
};  // namespace tensorrt_ptr

//
// Class to allocate memory for outputs with data-dependent shapes. The sizes of those are unknown so pre-allocation is
// not possible.
//
class OutputAllocator : public nvinfer1::IOutputAllocator {
 public:
  OutputAllocator() = delete;
  OutputAllocator(OrtAllocator* allocator) : alloc_(allocator) {};

  void* reallocateOutputAsync(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment, cudaStream_t stream) noexcept override;

  void notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept override;

  void* getBuffer() {
    return outputPtr;
  }

  std::vector<int64_t>& getOutputShape() {
    return output_shapes;
  }

  uint64_t getSize() {
    return allocated_size;
  }

  ~OutputAllocator() override {
    alloc_->Free(alloc_, outputPtr);
  }

 private:
  OrtAllocator* alloc_;
  void* outputPtr{nullptr};
  uint64_t allocated_size = 0;
  std::vector<int64_t> output_shapes;
};

/*
 * This map saves the dimension range of the shape of the shape tensor or execution tensor:
 * tensor name -> ( dimension -> [min, max, opt] )
 */
using ShapeRangesMap = std::unordered_map<std::string, std::unordered_map<size_t, std::vector<std::vector<int64_t>>>>;

/**
 * @brief Container for tensor data and their shape.
 *
 */
struct TensorParams {
  const void* data{nullptr};
  nvinfer1::Dims dims;

  TensorParams() = default;

  TensorParams(const void* data_ptr, const std::vector<int64_t>& shape) {
    // Initialize data and dims from the Ort::ConstValue
    data = data_ptr;

    dims.nbDims = static_cast<int32_t>(shape.size());
    for (int i = 0; i < dims.nbDims; ++i) {
      dims.d[i] = static_cast<int32_t>(shape[i]);
    }
  }

  TensorParams(const void* data_ptr, nvinfer1::Dims& shape) {
    // Initialize data and dims from the Ort::ConstValue
    data = data_ptr;

    dims = shape;
  }

  bool operator!=(const TensorParams& other) const {
    if (data != other.data || dims.nbDims != other.dims.nbDims)
      return true;

    for (int i = 0; i < dims.nbDims; ++i) {
      if (dims.d[i] != other.dims.d[i])
        return true;
    }
    return false;
  }
};

// Data structure to hold user weights when ModelProtos are serialized with external data.
//
// An instance either owns a private copy of the bytes (constructed from a std::string) or merely refers to
// caller-owned memory (constructed from pointer + size). Data()/Size() prefer the owned copy when it is
// non-empty and otherwise report the pointer/size pair.
class TensorrtUserWeights {
 public:
  // Copies `data`; the instance owns the copy.
  TensorrtUserWeights(const std::string& name, const std::string& data) : name_(name), data_cpy_(data) {}

  // Refers to caller-owned memory; `data` must outlive this object.
  TensorrtUserWeights(const std::string& name, const void* data, size_t size) : name_(name), data_(data), size_(size) {}

  // Returns the weight name as a NUL-terminated string valid for the lifetime of this object.
  const char* Name() const {
    return name_.c_str();
  }

  // Returns the start of the weight bytes, or nullptr when the instance holds no data.
  const void* Data() const {
    if (!data_cpy_.empty()) {
      return data_cpy_.data();
    }
    return data_;
  }

  // Returns the number of weight bytes (0 when the instance holds no data).
  int64_t Size() const {
    if (!data_cpy_.empty()) {
      return static_cast<int64_t>(data_cpy_.size());
    }
    return static_cast<int64_t>(size_);
  }

 private:
  std::string name_{};
  std::string data_cpy_{};
  // Initialized in-class: the copying constructor does not set these, and Data()/Size() fall back to them
  // when the copied string is empty.
  void const* data_{nullptr};
  size_t size_{0};
};

// Information to construct kernel function state.
struct TensorrtFuncState {
  AllocateFunc test_allocate_func = nullptr;
  DestroyFunc test_release_func = nullptr;
  AllocatorHandle allocator = nullptr;
  std::string fused_node_name;
  nvinfer1::IBuilder* builder;
  std::unique_ptr<nvinfer1::ICudaEngine>* engine = nullptr;
  tensorrt_ptr::unique_pointer_exec_ctx* context = nullptr;
  std::unique_ptr<nvinfer1::INetworkDefinition>* network = nullptr;
  std::vector<std::unordered_map<std::string, size_t>> input_info;
  std::vector<std::unordered_map<std::string, size_t>> output_info;
  std::unordered_map<std::string, std::unordered_map<size_t, std::vector<std::vector<int64_t>>>> input_shape_ranges;
  std::mutex* tensorrt_mu_ptr = nullptr;
  bool engine_cache_enable = false;
  std::string engine_cache_path;
  nvinfer1::IRuntime* runtime = nullptr;
  std::vector<nvinfer1::IOptimizationProfile*> profiles;
  bool engine_decryption_enable = false;
  int (*engine_decryption)(const char*, char*, size_t*) = nullptr;
  int (*engine_encryption)(const char*, char*, size_t) = nullptr;
  bool detailed_build_log = false;
  bool sparsity_enable = false;
  int auxiliary_streams = -1;
  bool cuda_graph_enable = 0;
  bool is_dynamic_shape = false;
  std::string cache_prefix;
  std::string cache_suffix;
  // runtime parameters
  std::vector<IAllocatorUniquePtr<void>> scratch_buffers;
  std::vector<TensorParams> input_tensors;
  std::vector<TensorParams> output_tensors;
  bool is_first_run = true;              // Indicates if this is the first run of the engine
  bool skip_io_binding_allowed = false;  // Indicates if input/output binding can be skipped
  IAllocatorUniquePtr<void> context_memory = nullptr;
  size_t context_memory_size = 0;
};

// Reduced kernel function state used by the direct engine load code path.
// Carries the members needed to run an already built engine; every member has an in-class default.
struct TensorrtShortFuncState {
  // Allocator callbacks and handle.
  AllocateFunc test_allocate_func{nullptr};
  DestroyFunc test_release_func{nullptr};
  AllocatorHandle allocator{nullptr};

  // Name of the fused node this state belongs to.
  std::string fused_node_name;

  // Non-owning pointers to the engine and execution context smart pointers.
  std::unique_ptr<nvinfer1::ICudaEngine>* engine{nullptr};
  tensorrt_ptr::unique_pointer_exec_ctx* context{nullptr};

  // Per-input / per-output name -> index lookup tables.
  std::vector<std::unordered_map<std::string, size_t>> input_info;
  std::vector<std::unordered_map<std::string, size_t>> output_info;

  std::mutex* tensorrt_mu_ptr{nullptr};
  bool is_dynamic_shape{false};

  // Runtime parameters.
  std::vector<IAllocatorUniquePtr<void>> scratch_buffers;
  std::vector<TensorParams> input_tensors;
  std::vector<TensorParams> output_tensors;
  // True until the engine has been run for the first time.
  bool is_first_run{true};
  // True when input/output binding can be skipped.
  bool skip_io_binding_allowed{false};
  IAllocatorUniquePtr<void> context_memory{nullptr};
  size_t context_memory_size{0};
};

// Holds important information for building a valid ORT graph.
// NvExecutionProvider::BuildSubGraphContext() is documented as the place that records a graph's inputs,
// initializers and outputs in this structure.
struct SubGraphContext {
  // Names of output args (presumably the outputs produced by the graph's nodes — confirm in the definition
  // of BuildSubGraphContext).
  std::unordered_set<std::string> output_args;
  // Name -> NodeArg lookup for inputs and initializers. The NodeArg pointers are non-owning.
  std::unordered_map<std::string, const NodeArg*> inputs_and_initializers;
  // Name -> NodeArg lookup for graph inputs that were added manually (see
  // NvExecutionProvider::SetGraphOuterScopeValuesAndInputs / SetAllGraphInputs). Non-owning pointers.
  std::unordered_map<std::string, const NodeArg*> manually_added_graph_inputs;
};

// Owns SubGraphContext instances, keyed by a string (presumably the graph name — confirm at the use sites).
using SubGraphContextMap = std::unordered_map<std::string, std::unique_ptr<SubGraphContext>>;
// Owns OutputAllocator instances, keyed by a string (presumably the output tensor name — confirm at the use sites).
using DDSOutputAllocatorMap = std::unordered_map<std::string, std::unique_ptr<OutputAllocator>>;
// Declared here, defined elsewhere.
// NOTE(review): judging by the name, this derives the path of the weight-refitted engine from
// `engine_cache_path` — confirm against the definition.
std::string GetWeightRefittedEnginePath(std::string engine_cache_path);

// Logical device representation.
class NvExecutionProvider : public IExecutionProvider {
 public:
  explicit NvExecutionProvider(const NvExecutionProviderInfo& info);
  // TODO: we might want to transition to this, it allows for an easier option specification:
  //  explicit NvExecutionProvider(const ProviderOptions& provider_options_map, const ConfigOptions* config_options);
  virtual ~NvExecutionProvider();

  // Returns the cuBLAS handle held by the PerThreadContext that GetPerThreadContext() yields.
  cublasHandle_t PerThreadDefaultCublasHandle() {
    auto& thread_ctx = GetPerThreadContext();
    return thread_ctx.CublasHandle();
  }

  // Returns the cuDNN handle held by the PerThreadContext that GetPerThreadContext() yields.
  cudnnHandle_t PerThreadDefaultCudnnHandle() {
    auto& thread_ctx = GetPerThreadContext();
    return thread_ctx.CudnnHandle();
  }

  virtual std::shared_ptr<KernelRegistry> GetKernelRegistry() const override;
  std::unique_ptr<IDataTransfer> GetDataTransfer() const override;

  std::vector<std::unique_ptr<ComputeCapability>>
  GetCapability(const GraphViewer& graph,
                const IKernelLookup& /*kernel_lookup*/,
                const GraphOptimizerRegistry& graph_optimizer_registry,
                IResourceAccountant* /* resource_accountant */) const override;

  // Returns the device id stored in this provider instance.
  int GetDeviceId() const {
    return device_id_;
  }
  Status Sync() const;

  common::Status Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
                         std::vector<NodeComputeInfo>& node_compute_funcs) override;

  Status OnRunStart(const onnxruntime::RunOptions& run_options) override;
  Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override;

  // Converts the stored provider info into a ProviderOptions value via
  // NvExecutionProviderInfo::ToProviderOptions and returns it.
  ProviderOptions GetProviderOptions() const override {
    ProviderOptions options = NvExecutionProviderInfo::ToProviderOptions(info_);
    return options;
  }

  void RegisterStreamHandlers(IStreamCommandHandleRegistry& stream_handle_registry, AllocatorMap& allocators) const override;

  void GetCustomOpDomainList(std::vector<OrtCustomOpDomain*>& custom_op_domain_list) const override;

  OrtDevice GetOrtDeviceByMemType(OrtMemType mem_type) const override;

  std::vector<AllocatorPtr> CreatePreferredAllocators() override;

  // CUDA Graph support
  bool IsGraphCaptureEnabled() const override;
  bool IsGraphCaptured(int graph_annotation_id) const override;
  Status ReplayGraph(int graph_annotation_id) override;
  void HandleCudaGraphStart(cudaStream_t stream, bool require_io_binding, CudaGraphAnnotation_t cuda_graph_annotation_id, bool& graph_replay_on_this_run, bool& should_start_capture);

  static common::Status RefitEngine(std::string onnx_model_filename,
                                    std::string& onnx_model_folder_path,
                                    bool path_check,
                                    const void* onnx_model_bytestream,
                                    size_t onnx_model_bytestream_size,
                                    const void* onnx_external_data_bytestream,
                                    size_t onnx_external_data_bytestream_size,
                                    nvinfer1::ICudaEngine* trt_engine,
                                    bool detailed_build_log);

  const InlinedVector<const Node*> GetEpContextNodes() const override;

  // Engine compatibility validation methods
  std::string GetCompiledModelCompatibilityInfo(const onnxruntime::GraphViewer& graph_viewer) const override;

  common::Status ValidateCompiledModelCompatibilityInfo(
      const std::string& compatibility_info,
      OrtCompiledModelCompatibility& model_compatibility) const override;

 private:
  mutable NvExecutionProviderInfo info_;
  bool external_stream_ = false;
  cudaStream_t stream_ = nullptr;
  bool external_aux_streams_ = false;
  cudaStream_t* aux_streams_ = nullptr;
  int max_partition_iterations_ = 1000;
  size_t min_subgraph_size_ = 1;
  size_t max_workspace_size_ = 0;
  size_t max_shared_mem_size_ = 0;
  bool force_sequential_engine_build_ = false;
  bool dump_subgraphs_ = false;
  bool engine_cache_enable_ = false;
  bool weight_stripped_engine_enable_ = false;
  bool weight_stripped_engine_refit_ = false;
  std::string onnx_model_folder_path_;
  const void* onnx_model_bytestream_;
  size_t onnx_model_bytestream_size_;
  bool use_external_data_initializer_ = false;
  const void* onnx_external_data_bytestream_ = nullptr;
  size_t onnx_external_data_bytestream_size_ = 0;
  bool sparsity_enable_ = false;
  int auxiliary_streams_ = -1;
  std::string cache_path_, engine_decryption_lib_path_;
  std::unique_ptr<nvinfer1::IRuntime> runtime_ = nullptr;
  std::mutex tensorrt_mu_;
  int device_id_;
  std::string compute_capability_;
  size_t max_ctx_mem_size_ = 0;
  mutable char model_path_[4096] = {};  // Reserved for max path length
  bool engine_decryption_enable_ = false;
  int (*engine_decryption_)(const char*, char*, size_t*) = nullptr;
  int (*engine_encryption_)(const char*, char*, size_t) = nullptr;
  bool detailed_build_log_ = false;
  bool cuda_graph_enable_ = false;
  bool multi_profile_enable_ = false;
  std::filesystem::path runtime_cache_;
  std::string cache_prefix_;
  std::string op_types_to_exclude_;
  int nv_profile_index_ = 0;
  std::unique_ptr<onnxruntime::Model> ep_context_model_;

  // The format is as for TENSORRT_VERSION: (MAJOR * 100 + MINOR) * 100 + PATCH
  int32_t trt_version_;
  int32_t cuda_version_;

  // The OrtAllocator object is obtained during EP compute time
  // and should be kept for the lifetime of the TRT EP object.
  OrtAllocator* alloc_ = nullptr;

  // For create/dump EP context node model
  bool dump_ep_context_model_ = false;
  std::string ep_context_file_path_;
  int ep_context_embed_mode_ = 0;
  std::string ctx_model_path_;
  std::string engine_cache_relative_path_to_context_model_dir;

  std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"};
  mutable std::unordered_map<std::string, std::unique_ptr<SubGraphContext>> subgraph_context_map_;

  mutable std::unique_ptr<nvinfer1::IBuilder> builder_;

  // Following maps that hold TRT objects will be accessible by different threads if ORT is using multithreading.
  // In general, TensorRT objects are not thread safe; accesses to an object from different threads must be serialized by the client.
  // But there are still some thread safe operations, please see here https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading
  // For those non thread safe operations, TRT EP uses (1) lock_guard or (2) PerThreadContext to make sure synchronization.
  std::unordered_map<std::string, std::unique_ptr<nvinfer1::ICudaEngine>> engines_;
  std::unordered_map<std::string, tensorrt_ptr::unique_pointer_exec_ctx> contexts_;
  std::unordered_map<std::string, std::unique_ptr<nvinfer1::IBuilder>> builders_;
  std::unordered_map<std::string, std::unique_ptr<nvinfer1::INetworkDefinition>> networks_;
  std::unordered_map<std::string, std::vector<std::unordered_map<std::string, size_t>>> input_info_;
  std::unordered_map<std::string, std::vector<std::unordered_map<std::string, size_t>>> output_info_;
  std::unordered_map<std::string, std::vector<std::vector<int64_t>>> profile_min_shapes_;
  std::unordered_map<std::string, std::vector<std::vector<int64_t>>> profile_max_shapes_;
  std::unordered_map<std::string, std::vector<std::vector<int64_t>>> profile_opt_shapes_;
  std::unordered_map<std::string, ShapeRangesMap> input_shape_ranges_;  // The profile shape ranges that the engine is built with
  std::unordered_map<std::string, std::vector<nvinfer1::IOptimizationProfile*>> profiles_;
  std::unordered_map<std::string, DDSOutputAllocatorMap> dds_output_allocator_maps_;

  // Storage for engine headers (64 bytes) for compatibility validation
  // Maps fused_node_name -> hex-encoded engine header
  mutable std::unordered_map<std::string, std::string> engine_headers_;

  // For an external stream, we need to create its cuDNN/cuBLAS handles before the CUDA EP enables CUDA graph capture.
  cudnnHandle_t external_cudnn_handle_ = nullptr;
  cublasHandle_t external_cublas_handle_ = nullptr;

  // Call cudaStreamSynchronize() after TRT enqueueV3()
  mutable bool sync_stream_after_enqueue_ = true;

  // [Note] We don't use PerThreadContext for now since it has issue with multithreading
  //
  // TRT or CUDA objects that must be maintained on a per thread basis will be put under this PerThreadContext data structure.
  // For example, TensorRT execution context and CUDA graph are the ones to be put here.
  // Bundles the state kept per PerThreadContext instance: cuDNN/cuBLAS handles, TensorRT execution contexts
  // stored in a string-keyed map, and CUDA graph capture/replay bookkeeping.
  // Only the two handle getters and SetCudaGraphStream() are defined inline; every other member function is
  // defined outside this header.
  class PerThreadContext final {
   public:
    PerThreadContext(OrtDevice::DeviceId device_id, bool has_user_compute_stream, cudaStream_t stream);
    ~PerThreadContext();

    // Returns the stored cuBLAS handle (nullptr unless it was set elsewhere).
    cublasHandle_t CublasHandle() const {
      return external_cublas_handle_;
    }

    // Returns the stored cuDNN handle (nullptr unless it was set elsewhere).
    cudnnHandle_t CudnnHandle() const {
      return external_cudnn_handle_;
    }

    // Execution context management, addressed by `fused_node` (presumably the fused node name used as the
    // key of trt_context_map_ — confirm in the definitions).
    bool IsTensorRTContextInMap(std::string fused_node);
    nvinfer1::IExecutionContext& GetTensorRTContext(std::string fused_node);
    bool UpdateTensorRTContext(std::string fused_node, tensorrt_ptr::unique_pointer_exec_ctx context);
    void ResetTensorRTContext(std::string fused_node);

    // CUDA Graph management
    // Forwards the stream to the owned CUDAGraph instance.
    void SetCudaGraphStream(cudaStream_t stream) { cuda_graph_.SetStream(stream); }
    bool IsGraphCaptureAllowed(CudaGraphAnnotation_t cuda_graph_annotation_id) const;
    bool IsGraphCaptureAllowedOnRun(CudaGraphAnnotation_t cuda_graph_annotation_id) const;
    CudaGraphAnnotation_t GetCudaGraphAnnotationId(const onnxruntime::RunOptions& run_options) const;
    void SetCurrentGraphAnnotationId(CudaGraphAnnotation_t cuda_graph_annotation_id);
    CudaGraphAnnotation_t GetCurrentGraphAnnotationId() const;
    void CaptureBegin(CudaGraphAnnotation_t cuda_graph_annotation_id);
    void CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id);
    bool IsGraphCaptured(CudaGraphAnnotation_t cuda_graph_annotation_id) const;
    Status ReplayGraph(CudaGraphAnnotation_t cuda_graph_annotation_id, bool sync_status_flag);
    void IncrementRegularRunCountBeforeGraphCapture(CudaGraphAnnotation_t cuda_graph_annotation_id);
    void ResetWarmupRuns(CudaGraphAnnotation_t cuda_graph_annotation_id);
    void DeleteCapturedGraph(CudaGraphAnnotation_t cuda_graph_annotation_id);

   private:
    // Handles returned by CudnnHandle() / CublasHandle(); both default to nullptr.
    cudnnHandle_t external_cudnn_handle_ = nullptr;
    cublasHandle_t external_cublas_handle_ = nullptr;

    // Maintaining execution context on a per thread basis is suggested by TRT doc.
    // Also, for enqueueV2() in execution context, to perform inference concurrently in multiple streams, use one execution context per stream.
    // ORT multi-streams feature uses one stream for one thread, therefore maintaining execution context on a per thread basis is necessary for TRT EP,
    // otherwise it may result in undefined behavior or synchronization issues.
    //
    // See more details here:
    // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading
    // https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_execution_context.html#a63cd95430852038ce864e17c670e0b36
    std::unordered_map<std::string, tensorrt_ptr::unique_pointer_exec_ctx> trt_context_map_;

    // The profile shape ranges for the engine that the execution context maintained by the PerThreadContext is built with.
    // TRT EP needs this info to determine whether to rebuild the execution context.
    std::unordered_map<std::string, ShapeRangesMap> input_shape_ranges_;

    // Cuda graph with multi threads will be supported in the future, so cuda_graph_ is put under PerThreadContext.
    // ORT TRT only supports CUDA graph when whole model is supported by TRT, so simply maintaining a CUDAGraph instance is enough (no need to maintain one CUDAGraph instance per TRT subgraph)
    CUDAGraph cuda_graph_;
    // Map of graph id to regular_run_count_before_graph_capture
    std::unordered_map<CudaGraphAnnotation_t, int> graph_id_to_run_count_;
    // NOTE(review): the two members below look like single-graph counterparts of graph_id_to_run_count_;
    // whether they are still used cannot be determined from this header — confirm in the definitions.
    bool is_graph_captured_ = false;
    int regular_run_count_before_graph_capture_ = 0;
    // Current graph annotation ID for this run
    CudaGraphAnnotation_t current_graph_annotation_id_ = kCudaGraphAnnotationDefault;
    // There is chance (currently only happens in CUDA EP) that the second regular run allocates GPU memory for causes like:
    // (1) memory pattern is enabled. (2) arena allocation for stream.
    // Since no GPU memory allocation is allowed during graph capturing, we need at least two regular runs
    // to allocate enough memory in Arena before graph capturing.
    const int min_num_runs_before_cuda_graph_capture_ = 2;  // required min regular runs before graph capture for the necessary memory allocations.
    // https://github.com/NVIDIA/TensorRT/blob/main/samples/common/sampleInference.cpp#L1258-L1291 Based on the trtexec code
  };

  using PerThreadContextMap = std::unordered_map<const NvExecutionProvider*, std::weak_ptr<PerThreadContext>>;
  // thread local PerThreadContext cache

  struct ContextCacheHolder {
    ContextCacheHolder() {
      // Keep a weak pointer to the object, if the weak pointer can be locked, then the shared pointer is still around, so we can reset it
      RunOnUnload([&, weak_p_ = std::weak_ptr<PerThreadContextMap>(p)] {
        if (auto lock = weak_p_.lock()) {
          p.reset();
        }
      });
    }

    std::shared_ptr<PerThreadContextMap> p = std::make_shared<PerThreadContextMap>();
  };

  static const std::shared_ptr<PerThreadContextMap>& PerThreadContextCache() {
    thread_local const ContextCacheHolder per_thread_context_cache;
    return per_thread_context_cache.p;
  }

  // Bookkeeping for all PerThreadContext instances that belong to one NvExecutionProvider.
  // Every member other than `mutex` is meant to be accessed while holding `mutex`.
  struct PerThreadContextState {
    // contexts that are currently active
    // (ordered with std::owner_less, i.e. by ownership rather than by the stored pointer value)
    std::set<std::shared_ptr<PerThreadContext>, std::owner_less<std::shared_ptr<PerThreadContext>>> active_contexts;
    // contexts available for reuse
    std::vector<std::shared_ptr<PerThreadContext>> retired_context_pool;
    // weak references to thread local caches from which this NvExecutionProvider instance's entry should be removed
    // upon destruction
    std::set<std::weak_ptr<PerThreadContextMap>, std::owner_less<std::weak_ptr<PerThreadContextMap>>>
        caches_to_update_on_destruction;
    // synchronizes access to PerThreadContextState members
    std::mutex mutex;
  };

  // The execution provider maintains the PerThreadContexts in this structure.
  // Synchronization is required to update the contained structures.
  // On the other hand, access to an individual PerThreadContext is assumed to be from a single thread at a time,
  // so synchronization is not required for that.
  mutable PerThreadContextState context_state_;

  PerThreadContext& GetPerThreadContext() const;
  void ReleasePerThreadContext() const;

  /**Get IndexedSubGraph based on node list of the subgraph*/
  std::unique_ptr<IndexedSubGraph> GetSubGraph(SubGraph_t graph_nodes_index,
                                               const GraphViewer& graph, const HashValue& model_hash, int subgraph_index) const;

  /**
  Get TensorRT supported node lists by calling Onnx-TensorRT parser recursively. Since each time the parser
  can only detect first unsupported node failure, it needs to wait for Onnxruntime to partition the graph
  and then detect next failure again. If there are too many iterations, which means many nodes in the graph
  are not supported by TensorRT, the process will be terminated and the whole graph is simply assigned to
  other execution provider.
  */
  SubGraphCollection_t GetSupportedList(SubGraphCollection_t supported_nodes_list, int iterations, const int max_iterations,
                                        const GraphViewer& graph, bool* early_termination) const;

  bool DetectTensorRTGraphCycles(SubGraphCollection_t& supported_nodes_vector, const GraphViewer& graph, const HashValue& model_hash, bool remove_cycles = true) const;

  /**
  Get a unique_lock object to control the concurrency behavior.
  Every api call not in the thread-safe operations(https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading)
  should be protected by a lock when invoked by multiple threads concurrently.
  */
  std::unique_lock<std::mutex> GetApiLock() const;

  /**Check the graph is the subgraph of control flow op*/
  bool IsSubGraphOfControlFlowOp(const GraphViewer& graph) const;

  /**Check whether all the nodes of the graph are assigned to specific ep*/
  bool AllNodesAssignedToSpecificEP(const GraphViewer& graph, const std::string& provider_type) const;

  /**Check whether all the nodes of subgraph are supported*/
  bool IsSubGraphFullySupported(SubGraphCollection_t supported_nodes_vector, const int number_of_ort_nodes) const;

  /**
   * Set inputs, initializers and outputs for all subgraphs during NvExecutionProvider::GetSupportedList()
   * and save those information in subgraph context data structure. It's useful for building a valid graph and
   * make Graph::Resolve() happy especially when dealing with nested control-flow op graph.
   */
  void BuildSubGraphContext(const Graph& build_graph) const;

  /**
   * Set outer scope values for subgraphs and add those values as top-level graph's inputs if needed.
   */
  void SetGraphOuterScopeValuesAndInputs(Graph& build_graph, const Graph& graph) const;

  /**
   * If ORT TRT manually sets graph input in NvExecutionProvider::SetGraphOuterScopeValuesAndInputs(),
   * we have to manually set all the graph inputs in order to pass Graph::Resolve().
   */
  void SetAllGraphInputs(Graph& graph) const;

  /**
   * The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
   * Graph::ResolveContext::IsInputInitializerOrOutput(). We have to implement this function again.
   */
  bool IsInputInitializerOrOutput(const Graph& graph, const std::string& name, bool check_ancestors) const;

  /**
   * The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
   * Graph::ResolveContext::IsOuterScopeValue(). We have to implement this function again.
   */
  bool IsOuterScopeValue(const Graph& graph, const std::string& name) const;

  /**
   * The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
   * Graph::ResolveContext::IsLocalValue(). We have to implement this function again.
   */
  bool IsLocalValue(const Graph& graph, const std::string& name) const;

  /**
   * Create a vector of NodeComputeInfo instances directly from "TRT engine" wrapped onnx model without
   * going through the time-consuming processes of model parsing and engine building.
   */
  Status CreateNodeComputeInfoFromPrecompiledEngine(const GraphViewer& graph_body_viewer,
                                                    size_t node_idx,
                                                    const Node& fused_node,
                                                    std::unordered_map<std::string, size_t>& input_map,
                                                    std::unordered_map<std::string, size_t>& output_map,
                                                    std::vector<NodeComputeInfo>& node_compute_funcs);

  /**
   * Create a vector of NodeComputeInfo instances from graph.
   */
  Status CreateNodeComputeInfoFromGraph(const GraphViewer& graph_body_viewer,
                                        const Node& fused_node,
                                        std::unordered_map<std::string, size_t>& input_map,
                                        std::unordered_map<std::string, size_t>& output_map,
                                        std::vector<NodeComputeInfo>& node_compute_funcs);

  /**
   * Get the pointer to the IBuilder instance.
   * This function only creates the instance the first time it is called.
   */
  nvinfer1::IBuilder* GetBuilder(TensorrtLogger& trt_logger) const;
};
}  // namespace onnxruntime
