From 37e33881b9e995f8328da0a149d71f4314bd7aa1 Mon Sep 17 00:00:00 2001
From: Samaresh Kumar Singh <ssam3003@gmail.com>
Date: Tue, 31 Mar 2026 21:39:08 -0500
Subject: [PATCH 1/3] profiler: fix USE_KINETO=OFF build failure due to
 unconditional ActivityType.h include

kineto_shim.h unconditionally included <ActivityType.h>, a Kineto header, which caused a fatal compile error when building with USE_KINETO=OFF on a system where Kineto is not installed (e.g. on Gentoo with system libraries).

Guard the include with #ifdef USE_KINETO and provide a minimal stub (enum class ActivityType : uint8_t { NONE = 0 }) so the data structures and function signatures that reference libkineto::ActivityType still compile. Guard the corresponding function bodies in kineto_shim.cpp (addCPUActivity, deviceTypeFromActivity) and collection.cpp (scopeToType, kinetoType) that use concrete enum values, adding no-op stubs for the non-Kineto paths.

Fixes #178939
---
 torch/csrc/profiler/collection.cpp  |  8 ++++++++
 torch/csrc/profiler/kineto_shim.cpp | 16 ++++++++++++++--
 torch/csrc/profiler/kineto_shim.h   |  7 +++++++
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp
index 1be2d80310910..e8054d4f44df7 100644
--- a/torch/csrc/profiler/collection.cpp
+++ b/torch/csrc/profiler/collection.cpp
@@ -573,11 +573,13 @@ std::string toString(const ExtraFields<EventType::PyCall>& e) {
       e.callsite_.funcname_.str());
 }
 
+#ifdef USE_KINETO
 auto scopeToType(at::RecordScope scope) {
   return scope == at::RecordScope::USER_SCOPE
       ? libkineto::ActivityType::USER_ANNOTATION
       : libkineto::ActivityType::CPU_OP;
 }
+#endif
 
 int64_t torchOpEndNS(
     const ExtraFields<EventType::TorchOp>& e,
@@ -626,6 +628,7 @@ std::string Result::overload_name() const {
       [](const auto& e) -> std::string { return ""; }));
 }
 
+#ifdef USE_KINETO
 libkineto::ActivityType Result::kinetoType() const {
   return visit(c10::overloaded(
       ATTRIBUTE(TorchOp, scopeToType(e.scope_)),
@@ -638,6 +641,11 @@ libkineto::ActivityType Result::kinetoType() const {
       ATTRIBUTE(PythonGC, libkineto::ActivityType::PYTHON_FUNCTION),
       ATTRIBUTE(Kineto, e.activity_type_)));
 }
+#else
+libkineto::ActivityType Result::kinetoType() const {
+  return libkineto::ActivityType::NONE;
+}
+#endif
 
 uint64_t Result::correlationID() const {
   return visit(c10::overloaded(
diff --git a/torch/csrc/profiler/kineto_shim.cpp b/torch/csrc/profiler/kineto_shim.cpp
index fa232e1a01016..0ddf62997a07e 100644
--- a/torch/csrc/profiler/kineto_shim.cpp
+++ b/torch/csrc/profiler/kineto_shim.cpp
@@ -138,6 +138,7 @@ TraceWrapper::TraceWrapper(const int64_t start_time, const std::string& name)
 }
 #endif // USE_KINETO
 
+#ifdef USE_KINETO
 activity_t* TraceWrapper::addCPUActivity(
     const std::string& name,
     const libkineto::ActivityType type,
@@ -145,7 +146,6 @@ activity_t* TraceWrapper::addCPUActivity(
     const uint64_t correlation_id,
     const int64_t start_time,
     const int64_t end_time) {
-#ifdef USE_KINETO
   TORCH_CHECK((bool)(*this), "Cannot add event to non-existent trace.");
   cpu_trace_->emplace_activity(cpu_trace_->span, type, name);
   auto& act = libkineto::CpuTraceBuffer::toRef(cpu_trace_->activities.back());
@@ -157,10 +157,18 @@ activity_t* TraceWrapper::addCPUActivity(
     act.endTime = end_time;
   }
   return cpu_trace_->activities.back().get();
+}
 #else
+activity_t* TraceWrapper::addCPUActivity(
+    const std::string& name,
+    const libkineto::ActivityType type,
+    const DeviceAndResource device_and_resource,
+    const uint64_t correlation_id,
+    const int64_t start_time,
+    const int64_t end_time) {
   return nullptr;
-#endif // USE_KINETO
 }
+#endif // USE_KINETO
 
 void TraceWrapper::transferCpuTrace(int64_t end_time) {
 #ifdef USE_KINETO
@@ -473,6 +481,7 @@ void logInvariantViolation(
 
 namespace autograd::profiler {
 c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) {
+#ifdef USE_KINETO
   // PrivateUse1 kineto backend reuse some ActivityTypes,
   // If PrivateUse1 backend is enabled, this should return
   // c10::DeviceType::PrivateUse1.
@@ -524,6 +533,9 @@ c10::DeviceType deviceTypeFromActivity(libkineto::ActivityType activity_type) {
       return c10::DeviceType::CPU;
     }
   }
+#else
+  return c10::DeviceType::CPU;
+#endif // USE_KINETO
 }
 
 void addMetadataJson(const std::string& key, const std::string& value) {
diff --git a/torch/csrc/profiler/kineto_shim.h b/torch/csrc/profiler/kineto_shim.h
index 4f9bdc6770507..44bcb0e18a6e0 100644
--- a/torch/csrc/profiler/kineto_shim.h
+++ b/torch/csrc/profiler/kineto_shim.h
@@ -12,7 +12,14 @@
 #undef USE_KINETO
 #endif
 
+#ifdef USE_KINETO
 #include <ActivityType.h>
+#else
+// Minimal stub so non-Kineto builds can compile types that hold ActivityType.
+namespace libkineto {
+enum class ActivityType : uint8_t { NONE = 0 };
+} // namespace libkineto
+#endif
 
 #include <torch/csrc/Export.h>
 #include <torch/csrc/profiler/api.h>

From f5e3a9d9aa12c6e2006a9243c97ef7703fcbbe2f Mon Sep 17 00:00:00 2001
From: Samaresh Kumar Singh <ssam3003@gmail.com>
Date: Wed, 1 Apr 2026 10:34:24 -0500
Subject: [PATCH 2/3] profiler: fix USER_ANNOTATION/GPU_USER_ANNOTATION build
 errors with external kineto

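When building against a system-installed kineto (e.g. Gentoo's sci-ml/kineto) that lacks USER_ANNOTATION and GPU_USER_ANNOTATION in its ActivityType enum,
init.cpp failed to compile because it referenced those enum member names directly.

Move the check out of the init.cpp binding into a new KinetoEvent::isUserAnnotation() method that compares activityType() against the numeric values 1 and 2 instead of naming the enumerators. Also extend the non-Kineto ActivityType stub in kineto_shim.h with CPU_OP, USER_ANNOTATION and GPU_USER_ANNOTATION, and add a stub toString(ActivityType) overload.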
---
 torch/csrc/autograd/init.cpp            |  7 +------
 torch/csrc/autograd/profiler_kineto.cpp |  7 +++++++
 torch/csrc/autograd/profiler_kineto.h   |  1 +
 torch/csrc/profiler/kineto_shim.h       | 11 +++++++++--
 4 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 1e2442a5d99db..f8e8795c5805e 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -305,12 +305,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
       .def("privateuse1_elapsed_us", &KinetoEvent::privateuse1ElapsedUs)
       .def(
           "is_user_annotation",
-          [](const KinetoEvent& e) {
-            return e.activityType() ==
-                (uint8_t)libkineto::ActivityType::USER_ANNOTATION ||
-                e.activityType() ==
-                (uint8_t)libkineto::ActivityType::GPU_USER_ANNOTATION;
-          })
+          [](const KinetoEvent& e) { return e.isUserAnnotation(); })
       .def(
           "is_python_function",
           [](const KinetoEvent& e) { return e.isPythonFunction(); })
diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 9a076f58d7143..6b5165f0872fb 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -1081,6 +1081,13 @@ int64_t KinetoEvent::privateuse1ElapsedUs() const {
   return -1;
 }
 
+bool KinetoEvent::isUserAnnotation() const {
+  constexpr uint8_t kUserAnnotation = 1;
+  constexpr uint8_t kGpuUserAnnotation = 2;
+  const auto type = activityType();
+  return type == kUserAnnotation || type == kGpuUserAnnotation;
+}
+
 void KinetoEvent::getPerfEventCounters(std::vector<uint64_t>& in) const {
   return result_->visit(c10::overloaded(
       [&in](const ExtraFields<EventType::TorchOp>& e) -> void {
diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h
index 777b8a5851ed5..c3135c61245d4 100644
--- a/torch/csrc/autograd/profiler_kineto.h
+++ b/torch/csrc/autograd/profiler_kineto.h
@@ -67,6 +67,7 @@ struct TORCH_API KinetoEvent {
   bool isPythonFunction() const;
   int64_t cudaElapsedUs() const;
   int64_t privateuse1ElapsedUs() const;
+  bool isUserAnnotation() const;
   void getPerfEventCounters(torch::profiler::perf_counters_t& /*in*/) const;
   extra_meta_t extraMeta() const;
   std::string metadataJson() const;
diff --git a/torch/csrc/profiler/kineto_shim.h b/torch/csrc/profiler/kineto_shim.h
index 44bcb0e18a6e0..ddc00c03fa9cc 100644
--- a/torch/csrc/profiler/kineto_shim.h
+++ b/torch/csrc/profiler/kineto_shim.h
@@ -15,9 +15,16 @@
 #ifdef USE_KINETO
 #include <ActivityType.h>
 #else
-// Minimal stub so non-Kineto builds can compile types that hold ActivityType.
 namespace libkineto {
-enum class ActivityType : uint8_t { NONE = 0 };
+enum class ActivityType : uint8_t {
+  CPU_OP = 0,
+  USER_ANNOTATION,
+  GPU_USER_ANNOTATION,
+  NONE = CPU_OP,
+};
+inline const char* toString(ActivityType) {
+  return "CPU_OP";
+}
 } // namespace libkineto
 #endif
 

From 717526b7ce90d242cb917db90f4cb13c85c93661 Mon Sep 17 00:00:00 2001
From: Samaresh Kumar Singh <ssam3003@gmail.com>
Date: Fri, 3 Apr 2026 13:01:01 -0500
Subject: [PATCH 3/3] profiler: guard GPU ActivityType checks with KINETO GPU
 backend macros

The externalId() function referenced GPU-specific libkineto::ActivityType
values (GPU_MEMCPY, GPU_MEMSET, CONCURRENT_KERNEL, CUDA_RUNTIME,
CUDA_DRIVER, PRIVATEUSE1_RUNTIME, PRIVATEUSE1_DRIVER) that are only
present in kineto builds with GPU backend support.

System-installed kineto packages built without CUDA/ROCm support
(LIBKINETO_NOCUPTI + LIBKINETO_NOROCTRACER) omit these enum values,
causing compile errors. Guard the check with the existing GPU backend
macros. Also restructure the condition into an early return so that the
correlation ID lookup stays outside the guard and is not duplicated.
---
 torch/csrc/autograd/profiler_kineto.cpp | 38 ++++++++++++++-----------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 6b5165f0872fb..466e8d66ab518 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -1125,27 +1125,33 @@ int64_t KinetoEvent::externalId() const {
     return static_cast<int64_t>(linked);
   }
 
+#if defined(USE_KINETO) && \
+    (!defined(LIBKINETO_NOCUPTI) || !defined(LIBKINETO_NOROCTRACER))
   // Orphaned GPU activities (no linked CPU op) in these types should not get
   // an External id, to avoid incorrect cross-linking in trace viewers.
+  // These GPU-specific ActivityType values are only present when kineto is
+  // built with GPU backend support (CUPTI or ROCtracer). CPU-only kineto
+  // builds (e.g. system packages without GPU support) omit them.
   auto type = static_cast<libkineto::ActivityType>(activityType());
-  if (type != libkineto::ActivityType::GPU_MEMCPY &&
-      type != libkineto::ActivityType::GPU_MEMSET &&
-      type != libkineto::ActivityType::CONCURRENT_KERNEL &&
-      type != libkineto::ActivityType::CUDA_RUNTIME &&
-      type != libkineto::ActivityType::CUDA_DRIVER &&
-      type != libkineto::ActivityType::PRIVATEUSE1_RUNTIME &&
-      type != libkineto::ActivityType::PRIVATEUSE1_DRIVER) {
-    return static_cast<int64_t>(result_->visit(c10::overloaded(
-        [](const ExtraFields<EventType::TorchOp>& e) -> uint64_t {
-          return e.correlation_id_;
-        },
-        [](const ExtraFields<EventType::Kineto>& e) -> uint64_t {
-          return e.correlation_id_;
-        },
-        [](const auto&) -> uint64_t { return 0; })));
+  if (type == libkineto::ActivityType::GPU_MEMCPY ||
+      type == libkineto::ActivityType::GPU_MEMSET ||
+      type == libkineto::ActivityType::CONCURRENT_KERNEL ||
+      type == libkineto::ActivityType::CUDA_RUNTIME ||
+      type == libkineto::ActivityType::CUDA_DRIVER ||
+      type == libkineto::ActivityType::PRIVATEUSE1_RUNTIME ||
+      type == libkineto::ActivityType::PRIVATEUSE1_DRIVER) {
+    return 0;
   }
+#endif
 
-  return 0;
+  return static_cast<int64_t>(result_->visit(c10::overloaded(
+      [](const ExtraFields<EventType::TorchOp>& e) -> uint64_t {
+        return e.correlation_id_;
+      },
+      [](const ExtraFields<EventType::Kineto>& e) -> uint64_t {
+        return e.correlation_id_;
+      },
+      [](const auto&) -> uint64_t { return 0; })));
 }
 
 #define FORWARD_FROM_RESULT(method_name, result_expr)                        \
