diff --git a/cli_flags.go b/cli_flags.go
index ac1ca806..724f1890 100644
--- a/cli_flags.go
+++ b/cli_flags.go
@@ -24,6 +24,7 @@ const (
 	defaultProbabilisticThreshold = tracer.ProbabilisticThresholdMax
 	defaultProbabilisticInterval  = 1 * time.Minute
 	defaultArgSendErrorFrames     = false
+	defaultOffCPUThreshold        = tracer.OffCPUThresholdMax
 
 	// This is the X in 2^(n + x) where n is the default hardcoded map size value
 	defaultArgMapScaleFactor = 0
@@ -61,6 +62,11 @@ var (
 		"If zero, monotonic-realtime clock sync will be performed once, " +
 		"on agent startup, but not periodically."
 	sendErrorFramesHelp = "Send error frames (devfiler only, breaks Kibana)"
+	offCPUThresholdHelp = fmt.Sprintf("A value between 1 and %d enables off-CPU profiling: "+
+		"every time an off-CPU entry point is hit, a random number between "+
+		"0 and %d is drawn. If the configured threshold is greater than this random number, "+
+		"the off-CPU trace is collected and reported.",
+		tracer.OffCPUThresholdMax-1, tracer.OffCPUThresholdMax-1)
 )
 
 // Package-scope variable, so that conditionally compiled other components can refer
@@ -114,6 +120,9 @@ func parseArgs() (*controller.Config, error) {
 	fs.BoolVar(&args.VerboseMode, "verbose", false, verboseModeHelp)
 	fs.BoolVar(&args.Version, "version", false, versionHelp)
 
+	fs.UintVar(&args.OffCPUThreshold, "off-cpu-threshold",
+		defaultOffCPUThreshold, offCPUThresholdHelp)
+
 	fs.Usage = func() {
 		fs.PrintDefaults()
 	}
diff --git a/internal/controller/config.go b/internal/controller/config.go
index 16daddc6..6885dc4a 100644
--- a/internal/controller/config.go
+++ b/internal/controller/config.go
@@ -30,6 +30,7 @@ type Config struct {
 	Tracers     string
 	VerboseMode bool
 	Version     bool
+	OffCPUThreshold uint
 
 	Reporter reporter.Reporter
diff --git a/internal/controller/controller.go b/internal/controller/controller.go
index 12628306..587994f0 100644
--- a/internal/controller/controller.go
+++ b/internal/controller/controller.go
@@ -129,6 +129,13 @@ func (c *Controller) Start(ctx context.Context) error {
 	}
 	log.Info("Attached tracer program")
 
+	if c.config.OffCPUThreshold < tracer.OffCPUThresholdMax {
+		if err := trc.StartOffCPUProfiling(); err != nil {
+			return fmt.Errorf("failed to start off-cpu profiling: %v", err)
+		}
+		log.Printf("Enabled off-cpu profiling")
+	}
+
 	if c.config.ProbabilisticThreshold < tracer.ProbabilisticThresholdMax {
 		trc.StartProbabilisticProfiling(ctx)
 		log.Printf("Enabled probabilistic profiling")
diff --git a/support/ebpf/bpfdefs.h b/support/ebpf/bpfdefs.h
index 7171b3c2..1271f845 100644
--- a/support/ebpf/bpfdefs.h
+++ b/support/ebpf/bpfdefs.h
@@ -83,6 +83,8 @@ static int (*bpf_perf_event_output)(void *ctx, void *map, unsigned long long fla
     (void *)BPF_FUNC_perf_event_output;
 static int (*bpf_get_stackid)(void *ctx, void *map, u64 flags) =
     (void *)BPF_FUNC_get_stackid;
+static unsigned long long (*bpf_get_prandom_u32)(void) =
+    (void *) BPF_FUNC_get_prandom_u32;
 
 __attribute__ ((format (printf, 1, 3)))
 static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
diff --git a/support/ebpf/extmaps.h b/support/ebpf/extmaps.h
index 5922d9ed..56c58a67 100644
--- a/support/ebpf/extmaps.h
+++ b/support/ebpf/extmaps.h
@@ -6,8 +6,9 @@
 #include "bpf_map.h"
 
 // References to map definitions in *.ebpf.c.
-extern bpf_map_def progs; +extern bpf_map_def perf_progs; extern bpf_map_def per_cpu_records; +extern bpf_map_def kernel_stackmap; extern bpf_map_def pid_page_to_mapping_info; extern bpf_map_def metrics; extern bpf_map_def report_events; @@ -41,7 +42,6 @@ extern bpf_map_def exe_id_to_21_stack_deltas; extern bpf_map_def exe_id_to_22_stack_deltas; extern bpf_map_def exe_id_to_23_stack_deltas; extern bpf_map_def hotspot_procs; -extern bpf_map_def kernel_stackmap; extern bpf_map_def dotnet_procs; extern bpf_map_def perl_procs; extern bpf_map_def php_procs; diff --git a/support/ebpf/integration_test.ebpf.c b/support/ebpf/integration_test.ebpf.c index 510e72c6..dd01a060 100644 --- a/support/ebpf/integration_test.ebpf.c +++ b/support/ebpf/integration_test.ebpf.c @@ -80,10 +80,10 @@ void send_sample_traces(void *ctx, u64 pid, s32 kstack) { send_trace(ctx, trace); } -// tracepoint__sched_switch fetches the current kernel stack ID from kernel_stackmap and -// communicates it to userspace via kernel_stack_id map. -SEC("tracepoint/sched/sched_switch") -int tracepoint__sched_switch(void *ctx) { +// tracepoint_integration__sched_switch fetches the current kernel stack ID from +// kernel_stackmap and communicates it to userspace via kernel_stack_id map. +SEC("tracepoint/integration/sched_switch") +int tracepoint_integration__sched_switch(void *ctx) { u64 id = bpf_get_current_pid_tgid(); u64 pid = id >> 32; diff --git a/support/ebpf/interpreter_dispatcher.ebpf.c b/support/ebpf/interpreter_dispatcher.ebpf.c index fbc5c598..1589a60d 100644 --- a/support/ebpf/interpreter_dispatcher.ebpf.c +++ b/support/ebpf/interpreter_dispatcher.ebpf.c @@ -25,8 +25,8 @@ bpf_map_def SEC("maps") metrics = { .max_entries = metricID_Max, }; -// progs maps from a program ID to an eBPF program -bpf_map_def SEC("maps") progs = { +// perf_progs maps from a program ID to a perf eBPF program +bpf_map_def SEC("maps") perf_progs = { .type = BPF_MAP_TYPE_PROG_ARRAY, .key_size = sizeof(u32), .value_size = sizeof(u32), diff --git a/support/ebpf/native_stack_trace.ebpf.c b/support/ebpf/native_stack_trace.ebpf.c index 959099cb..53af3e4c 100644 --- a/support/ebpf/native_stack_trace.ebpf.c +++ b/support/ebpf/native_stack_trace.ebpf.c @@ -4,14 +4,6 @@ #include "tracemgmt.h" #include "stackdeltatypes.h" -#ifndef __USER32_CS - // defined in arch/x86/include/asm/segment.h - #define GDT_ENTRY_DEFAULT_USER32_CS 4 - #define GDT_ENTRY_DEFAULT_USER_DS 5 - #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) - #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) -#endif - // Macro to create a map named exe_id_to_X_stack_deltas that is a nested maps with a fileID for the // outer map and an array as inner map that holds up to 2^X stack delta entries for the given fileID. #define STACK_DELTA_BUCKET(X) \ @@ -607,156 +599,6 @@ static ErrorCode unwind_one_frame(u64 pid, u32 frame_idx, struct UnwindState *st #error unsupported architecture #endif -// Initialize state from pt_regs -static inline ErrorCode copy_state_regs(UnwindState *state, - struct pt_regs *regs, - bool interrupted_kernelmode) -{ -#if defined(__x86_64__) - // Check if the process is running in 32-bit mode on the x86_64 system. - // This check follows the Linux kernel implementation of user_64bit_mode() in - // arch/x86/include/asm/ptrace.h. 
- if (regs->cs == __USER32_CS) { - return ERR_NATIVE_X64_32BIT_COMPAT_MODE; - } - state->pc = regs->ip; - state->sp = regs->sp; - state->fp = regs->bp; - state->rax = regs->ax; - state->r9 = regs->r9; - state->r11 = regs->r11; - state->r13 = regs->r13; - state->r15 = regs->r15; - - // Treat syscalls as return addresses, but not IRQ handling, page faults, etc.. - // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/x86/include/asm/syscall.h#L31-L39 - // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/x86/entry/entry_64.S#L847 - state->return_address = interrupted_kernelmode && regs->orig_ax != -1; -#elif defined(__aarch64__) - // For backwards compatibility aarch64 can run 32-bit code. - // Check if the process is running in this 32-bit compat mod. - if (regs->pstate & PSR_MODE32_BIT) { - return ERR_NATIVE_AARCH64_32BIT_COMPAT_MODE; - } - state->pc = normalize_pac_ptr(regs->pc); - state->sp = regs->sp; - state->fp = regs->regs[29]; - state->lr = normalize_pac_ptr(regs->regs[30]); - state->r22 = regs->regs[22]; - - // Treat syscalls as return addresses, but not IRQ handling, page faults, etc.. - // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/arm64/include/asm/ptrace.h#L118 - // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/arm64/include/asm/ptrace.h#L206-L209 - // - // Note: We do not use `unwinder_mark_nonleaf_frame` here, - // because the frame is a leaf frame from the perspective of the user stack, - // regardless of whether we are in a syscall. - state->return_address = interrupted_kernelmode && regs->syscallno != -1; - state->lr_invalid = false; -#endif - - return ERR_OK; -} - -#ifndef TESTING_COREDUMP - -// Read the task's entry stack pt_regs. This has identical functionality -// to bpf_task_pt_regs which is emulated to support older kernels. -// Once kernel requirement is increased to 5.15 this can be replaced with -// the bpf_task_pt_regs() helper. -static inline -long get_task_pt_regs(struct task_struct *task, SystemConfig* syscfg) { - u64 stack_ptr = (u64)task + syscfg->task_stack_offset; - long stack_base; - if (bpf_probe_read_kernel(&stack_base, sizeof(stack_base), (void*) stack_ptr)) { - return 0; - } - return stack_base + syscfg->stack_ptregs_offset; -} - -// Determine whether the given pt_regs are from user-mode register context. -// This needs to detect also invalid pt_regs in case we its kernel thread stack -// without valid user mode pt_regs so is_kernel_address(pc) is not enough. -static inline -bool ptregs_is_usermode(struct pt_regs *regs) { -#if defined(__x86_64__) - // On x86_64 the user mode SS should always be __USER_DS. - if (regs->ss != __USER_DS) { - return false; - } - return true; -#elif defined(__aarch64__) - // Check if the processor state is in the EL0t what linux uses for usermode. - if ((regs->pstate & PSR_MODE_MASK) != PSR_MODE_EL0t) { - return false; - } - return true; -#else -#error add support for new architecture -#endif -} - -// Extract the usermode pt_regs for current task. Use context given pt_regs -// if it is usermode regs, or resolve it via struct task_struct. -// -// State registers are not touched (get_pristine_per_cpu_record already reset it) -// if something fails. has_usermode_regs is set to true if a user-mode register -// context was found: not every thread that we interrupt will actually have -// a user-mode context (e.g. kernel worker threads won't). 
-static inline ErrorCode get_usermode_regs(struct pt_regs *ctx, - UnwindState *state, - bool *has_usermode_regs) { - ErrorCode error; - - if (!ptregs_is_usermode(ctx)) { - u32 key = 0; - SystemConfig* syscfg = bpf_map_lookup_elem(&system_config, &key); - if (!syscfg) { - // Unreachable: array maps are always fully initialized. - return ERR_UNREACHABLE; - } - - // Use the current task's entry pt_regs - struct task_struct *task = (struct task_struct *) bpf_get_current_task(); - long ptregs_addr = get_task_pt_regs(task, syscfg); - - struct pt_regs regs; - if (!ptregs_addr || bpf_probe_read_kernel(®s, sizeof(regs), (void*) ptregs_addr)) { - increment_metric(metricID_UnwindNativeErrReadKernelModeRegs); - return ERR_NATIVE_READ_KERNELMODE_REGS; - } - - if (!ptregs_is_usermode(®s)) { - // No usermode registers context found. - return ERR_OK; - } - error = copy_state_regs(state, ®s, true); - } else { - // User mode code interrupted, registers are available via the ebpf context. - error = copy_state_regs(state, ctx, false); - } - if (error == ERR_OK) { - DEBUG_PRINT("Read regs: pc: %llx sp: %llx fp: %llx", state->pc, state->sp, state->fp); - *has_usermode_regs = true; - } - return error; -} - -#else // TESTING_COREDUMP - -static inline ErrorCode get_usermode_regs(struct pt_regs *ctx, - UnwindState *state, - bool *has_usermode_regs) { - // Coredumps provide always usermode pt_regs directly. - ErrorCode error = copy_state_regs(state, ctx, false); - if (error == ERR_OK) { - *has_usermode_regs = true; - } - return error; -} - -#endif - SEC("perf_event/unwind_native") int unwind_native(struct pt_regs *ctx) { PerCPURecord *record = get_per_cpu_record(); @@ -809,65 +651,11 @@ int unwind_native(struct pt_regs *ctx) { return -1; } -static inline -int collect_trace(struct pt_regs *ctx) { +SEC("perf_event/native_tracer_entry") +int native_tracer_entry(struct bpf_perf_event_data *ctx) { // Get the PID and TGID register. u64 id = bpf_get_current_pid_tgid(); u32 pid = id >> 32; u32 tid = id & 0xFFFFFFFF; - - if (pid == 0) { - return 0; - } - - u64 ktime = bpf_ktime_get_ns(); - - DEBUG_PRINT("==== do_perf_event ===="); - - // The trace is reused on each call to this function so we have to reset the - // variables used to maintain state. 
- DEBUG_PRINT("Resetting CPU record"); - PerCPURecord *record = get_pristine_per_cpu_record(); - if (!record) { - return -1; - } - - Trace *trace = &record->trace; - trace->pid = pid; - trace->tid = tid; - trace->ktime = ktime; - if (bpf_get_current_comm(&(trace->comm), sizeof(trace->comm)) < 0) { - increment_metric(metricID_ErrBPFCurrentComm); - } - - // Get the kernel mode stack trace first - trace->kernel_stack_id = bpf_get_stackid(ctx, &kernel_stackmap, BPF_F_REUSE_STACKID); - DEBUG_PRINT("kernel stack id = %d", trace->kernel_stack_id); - - // Recursive unwind frames - int unwinder = PROG_UNWIND_STOP; - bool has_usermode_regs = false; - ErrorCode error = get_usermode_regs(ctx, &record->state, &has_usermode_regs); - if (error || !has_usermode_regs) { - goto exit; - } - - if (!pid_information_exists(ctx, pid)) { - if (report_pid(ctx, pid, RATELIMIT_ACTION_DEFAULT)) { - increment_metric(metricID_NumProcNew); - } - return 0; - } - error = get_next_unwinder_after_native_frame(record, &unwinder); - -exit: - record->state.unwind_error = error; - tail_call(ctx, unwinder); - DEBUG_PRINT("bpf_tail call failed for %d in native_tracer_entry", unwinder); - return -1; -} - -SEC("perf_event/native_tracer_entry") -int native_tracer_entry(struct bpf_perf_event_data *ctx) { - return collect_trace((struct pt_regs*) &ctx->regs); + return collect_trace((struct pt_regs*) &ctx->regs, TRACE_SAMPLING, pid, tid, 0); } diff --git a/support/ebpf/off_cpu.ebpf.c b/support/ebpf/off_cpu.ebpf.c new file mode 100644 index 00000000..8132b148 --- /dev/null +++ b/support/ebpf/off_cpu.ebpf.c @@ -0,0 +1,86 @@ +#include "bpfdefs.h" +#include "types.h" +#include "tracemgmt.h" + +// kprobe_progs maps from a program ID to a kprobe eBPF program +bpf_map_def SEC("maps") kprobe_progs = { + .type = BPF_MAP_TYPE_PROG_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = NUM_TRACER_PROGS, +}; + +// profile_off_cpu communicates scheduler tasks. +bpf_map_def SEC("maps") profile_off_cpu = { + .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, + .key_size = sizeof(u64), // pid_tgid + .value_size = sizeof(u64), // time in ns + .max_entries = 256, +}; + +// tracepoint__sched_switch serves as entry point for off cpu profiling. +SEC("tracepoint/sched/sched_switch") +int tracepoint__sched_switch(void *ctx) { + u64 pid_tgid = bpf_get_current_pid_tgid(); + u32 pid = pid_tgid >> 32; + u32 tid = pid_tgid & 0xFFFFFFFF; + + if (pid == 0 || tid == 0) { + return 0; + } + + u32 key = 0; + SystemConfig* syscfg = bpf_map_lookup_elem(&system_config, &key); + if (!syscfg) { + // Unreachable: array maps are always fully initialized. + return ERR_UNREACHABLE; + } + + if (bpf_get_prandom_u32()%OFF_CPU_THRESHOLD_MAX > syscfg->off_cpu_threshold) { + return 0; + } + + u64 ts = bpf_ktime_get_ns(); + + if (bpf_map_update_elem(&profile_off_cpu, &pid_tgid, &ts, BPF_ANY)<0){ + return 0; + } + + return 0; +} + +// dummy is never loaded or called. It just makes sure kprobe_progs is referenced +// and make the compiler and linker happy. +SEC("kprobe/dummy") +int dummy(struct pt_regs *ctx) { + bpf_tail_call(ctx, &kprobe_progs,0); + return 0; +} + +// kp__finish_task_switch is triggered right after the scheduler updated +// the CPU registers. +SEC("kprobe/finish_task_switch") +int finish_task_switch(struct pt_regs *ctx) { + // Get the PID and TGID register. 
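+  // bpf_get_current_pid_tgid() packs the process ID (TGID) into the upper 32 bits
+  // and the thread ID into the lower 32 bits.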
+ u64 pid_tgid = bpf_get_current_pid_tgid(); + u32 pid = pid_tgid >> 32; + u32 tid = pid_tgid & 0xFFFFFFFF; + + if (pid == 0 || tid == 0) { + return 0; + } + + u64 ts = bpf_ktime_get_ns(); + + u64 *start_ts = bpf_map_lookup_elem(&profile_off_cpu, &pid_tgid); + if (!start_ts){ + // There is no information from the sched/sched_switch entry hook. + return 0; + } + + DEBUG_PRINT("==== finish_task_switch ===="); + + u64 diff = ts - *start_ts; + + return collect_trace(ctx, TRACE_OFF_CPU, pid, tid, diff); +} \ No newline at end of file diff --git a/support/ebpf/tracemgmt.h b/support/ebpf/tracemgmt.h index c0e08a45..2965635d 100644 --- a/support/ebpf/tracemgmt.h +++ b/support/ebpf/tracemgmt.h @@ -443,10 +443,10 @@ int get_next_unwinder_after_interpreter(const PerCPURecord *record) { // tail_call is a wrapper around bpf_tail_call() and ensures that the number of tail calls is not // reached while unwinding the stack. static inline __attribute__((__always_inline__)) -void tail_call(void *ctx, int next) { +void tail_call(void *ctx, int next) { PerCPURecord *record = get_per_cpu_record(); if (!record) { - bpf_tail_call(ctx, &progs, PROG_UNWIND_STOP); + bpf_tail_call(ctx, &perf_progs, PROG_UNWIND_STOP); // In theory bpf_tail_call() should never return. But due to instruction reordering by the // compiler we have to place return here to bribe the verifier to accept this. return; @@ -464,7 +464,218 @@ void tail_call(void *ctx, int next) { } record->tailCalls += 1 ; - bpf_tail_call(ctx, &progs, next); + bpf_tail_call(ctx, &perf_progs, next); +} + +#ifndef __USER32_CS + // defined in arch/x86/include/asm/segment.h + #define GDT_ENTRY_DEFAULT_USER32_CS 4 + #define GDT_ENTRY_DEFAULT_USER_DS 5 + #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) + #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) +#endif + +// Initialize state from pt_regs +static inline ErrorCode copy_state_regs(UnwindState *state, + struct pt_regs *regs, + bool interrupted_kernelmode) +{ +#if defined(__x86_64__) + // Check if the process is running in 32-bit mode on the x86_64 system. + // This check follows the Linux kernel implementation of user_64bit_mode() in + // arch/x86/include/asm/ptrace.h. + if (regs->cs == __USER32_CS) { + return ERR_NATIVE_X64_32BIT_COMPAT_MODE; + } + state->pc = regs->ip; + state->sp = regs->sp; + state->fp = regs->bp; + state->rax = regs->ax; + state->r9 = regs->r9; + state->r11 = regs->r11; + state->r13 = regs->r13; + state->r15 = regs->r15; + + // Treat syscalls as return addresses, but not IRQ handling, page faults, etc.. + // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/x86/include/asm/syscall.h#L31-L39 + // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/x86/entry/entry_64.S#L847 + state->return_address = interrupted_kernelmode && regs->orig_ax != -1; +#elif defined(__aarch64__) + // For backwards compatibility aarch64 can run 32-bit code. + // Check if the process is running in this 32-bit compat mod. + if (regs->pstate & PSR_MODE32_BIT) { + return ERR_NATIVE_AARCH64_32BIT_COMPAT_MODE; + } + state->pc = normalize_pac_ptr(regs->pc); + state->sp = regs->sp; + state->fp = regs->regs[29]; + state->lr = normalize_pac_ptr(regs->regs[30]); + state->r22 = regs->regs[22]; + + // Treat syscalls as return addresses, but not IRQ handling, page faults, etc.. 
+ // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/arm64/include/asm/ptrace.h#L118 + // https://github.com/torvalds/linux/blob/2ef5971ff3/arch/arm64/include/asm/ptrace.h#L206-L209 + // + // Note: We do not use `unwinder_mark_nonleaf_frame` here, + // because the frame is a leaf frame from the perspective of the user stack, + // regardless of whether we are in a syscall. + state->return_address = interrupted_kernelmode && regs->syscallno != -1; + state->lr_invalid = false; +#endif + + return ERR_OK; +} + +#ifndef TESTING_COREDUMP + +// Read the task's entry stack pt_regs. This has identical functionality +// to bpf_task_pt_regs which is emulated to support older kernels. +// Once kernel requirement is increased to 5.15 this can be replaced with +// the bpf_task_pt_regs() helper. +static inline +long get_task_pt_regs(struct task_struct *task, SystemConfig* syscfg) { + u64 stack_ptr = (u64)task + syscfg->task_stack_offset; + long stack_base; + if (bpf_probe_read_kernel(&stack_base, sizeof(stack_base), (void*) stack_ptr)) { + return 0; + } + return stack_base + syscfg->stack_ptregs_offset; +} + +// Determine whether the given pt_regs are from user-mode register context. +// This needs to detect also invalid pt_regs in case we its kernel thread stack +// without valid user mode pt_regs so is_kernel_address(pc) is not enough. +static inline +bool ptregs_is_usermode(struct pt_regs *regs) { +#if defined(__x86_64__) + // On x86_64 the user mode SS should always be __USER_DS. + if (regs->ss != __USER_DS) { + return false; + } + return true; +#elif defined(__aarch64__) + // Check if the processor state is in the EL0t what linux uses for usermode. + if ((regs->pstate & PSR_MODE_MASK) != PSR_MODE_EL0t) { + return false; + } + return true; +#else +#error add support for new architecture +#endif +} + +// Extract the usermode pt_regs for current task. Use context given pt_regs +// if it is usermode regs, or resolve it via struct task_struct. +// +// State registers are not touched (get_pristine_per_cpu_record already reset it) +// if something fails. has_usermode_regs is set to true if a user-mode register +// context was found: not every thread that we interrupt will actually have +// a user-mode context (e.g. kernel worker threads won't). +static inline ErrorCode get_usermode_regs(struct pt_regs *ctx, + UnwindState *state, + bool *has_usermode_regs) { + ErrorCode error; + + if (!ptregs_is_usermode(ctx)) { + u32 key = 0; + SystemConfig* syscfg = bpf_map_lookup_elem(&system_config, &key); + if (!syscfg) { + // Unreachable: array maps are always fully initialized. + return ERR_UNREACHABLE; + } + + // Use the current task's entry pt_regs + struct task_struct *task = (struct task_struct *) bpf_get_current_task(); + long ptregs_addr = get_task_pt_regs(task, syscfg); + + struct pt_regs regs; + if (!ptregs_addr || bpf_probe_read_kernel(®s, sizeof(regs), (void*) ptregs_addr)) { + increment_metric(metricID_UnwindNativeErrReadKernelModeRegs); + return ERR_NATIVE_READ_KERNELMODE_REGS; + } + + if (!ptregs_is_usermode(®s)) { + // No usermode registers context found. + return ERR_OK; + } + error = copy_state_regs(state, ®s, true); + } else { + // User mode code interrupted, registers are available via the ebpf context. 
+ error = copy_state_regs(state, ctx, false); + } + if (error == ERR_OK) { + DEBUG_PRINT("Read regs: pc: %llx sp: %llx fp: %llx", state->pc, state->sp, state->fp); + *has_usermode_regs = true; + } + return error; +} + +#else // TESTING_COREDUMP + +static inline ErrorCode get_usermode_regs(struct pt_regs *ctx, + UnwindState *state, + bool *has_usermode_regs) { + // Coredumps provide always usermode pt_regs directly. + ErrorCode error = copy_state_regs(state, ctx, false); + if (error == ERR_OK) { + *has_usermode_regs = true; + } + return error; +} + +#endif // TESTING_COREDUMP + +static inline +int collect_trace(struct pt_regs *ctx, TraceOrigin origin, u32 pid, u32 tid, u64 off_cpu_time) { + if (pid == 0) { + return 0; + } + + u64 ktime = bpf_ktime_get_ns(); + + // The trace is reused on each call to this function so we have to reset the + // variables used to maintain state. + DEBUG_PRINT("Resetting CPU record"); + PerCPURecord *record = get_pristine_per_cpu_record(); + if (!record) { + return -1; + } + + Trace *trace = &record->trace; + trace->origin = origin; + trace->pid = pid; + trace->tid = tid; + trace->ktime = ktime; + trace->offtime = off_cpu_time; + if (bpf_get_current_comm(&(trace->comm), sizeof(trace->comm)) < 0) { + increment_metric(metricID_ErrBPFCurrentComm); + } + + // Get the kernel mode stack trace first + trace->kernel_stack_id = bpf_get_stackid(ctx, &kernel_stackmap, BPF_F_REUSE_STACKID); + DEBUG_PRINT("kernel stack id = %d", trace->kernel_stack_id); + + // Recursive unwind frames + int unwinder = PROG_UNWIND_STOP; + bool has_usermode_regs = false; + ErrorCode error = get_usermode_regs(ctx, &record->state, &has_usermode_regs); + if (error || !has_usermode_regs) { + goto exit; + } + + if (!pid_information_exists(ctx, pid)) { + if (report_pid(ctx, pid, RATELIMIT_ACTION_DEFAULT)) { + increment_metric(metricID_NumProcNew); + } + return 0; + } + error = get_next_unwinder_after_native_frame(record, &unwinder); + +exit: + record->state.unwind_error = error; + tail_call(ctx, unwinder); + DEBUG_PRINT("bpf_tail call failed for %d in native_tracer_entry", unwinder); + return -1; } #endif diff --git a/support/ebpf/tracer.ebpf.release.amd64 b/support/ebpf/tracer.ebpf.release.amd64 index 2a8cbb2f..3a2c8bb7 100644 Binary files a/support/ebpf/tracer.ebpf.release.amd64 and b/support/ebpf/tracer.ebpf.release.amd64 differ diff --git a/support/ebpf/types.h b/support/ebpf/types.h index e5592ff8..233e422f 100644 --- a/support/ebpf/types.h +++ b/support/ebpf/types.h @@ -331,6 +331,17 @@ typedef enum TracePrograms { NUM_TRACER_PROGS, } TracePrograms; +// TraceOrigin describes the source of the trace. This enables +// origin specific handling of traces in user space. +typedef enum TraceOrigin { + TRACE_UNKNOWN, + TRACE_SAMPLING, + TRACE_OFF_CPU, +} TraceOrigin; + +// OFF_CPU_THRESHOLD_MAX defines the maximum threshold. +#define OFF_CPU_THRESHOLD_MAX 1000 + // MAX_FRAME_UNWINDS defines the maximum number of frames per // Trace we can unwind and respect the limit of eBPF instructions, // limit of tail calls and limit of stack size per eBPF program. @@ -532,6 +543,14 @@ typedef struct Trace { s32 kernel_stack_id; // The number of frames in the stack. u32 stack_len; + + // origin indicates the source of the trace. + TraceOrigin origin; + + // Time in nanosecond for off-cpu profiling, + // for how long the trace was off cpu. + u64 offtime; + // The frames of the stack trace. 
Frame frames[MAX_FRAME_UNWINDS]; @@ -851,6 +870,9 @@ typedef struct SystemConfig { // The offset of struct pt_regs within the kernel entry stack. u32 stack_ptregs_offset; + // User defined threshold for off-cpu profiling. + u32 off_cpu_threshold; + // Enables the temporary hack that drops pure errors frames in unwind_stop. bool drop_error_only_traces; } SystemConfig; diff --git a/support/types.go b/support/types.go index 6387e4d0..a37f38aa 100644 --- a/support/types.go +++ b/support/types.go @@ -105,3 +105,11 @@ const ( // PerfMaxStackDepth is the bpf map data array length for BPF_MAP_TYPE_STACK_TRACE traces PerfMaxStackDepth = C.PERF_MAX_STACK_DEPTH ) + +const ( + TraceOriginUnknown = C.TRACE_UNKNOWN + TraceOriginSampling = C.TRACE_SAMPLING + TraceOriginOffCPU = C.TRACE_OFF_CPU +) + +const OffCPUThresholdMax = C.OFF_CPU_THRESHOLD_MAX diff --git a/tracer/ebpf_integration_test.go b/tracer/ebpf_integration_test.go index 19e3c648..d671aca2 100644 --- a/tracer/ebpf_integration_test.go +++ b/tracer/ebpf_integration_test.go @@ -56,7 +56,7 @@ func runKernelFrameProbe(t *testing.T, tracer *Tracer) { require.NoError(t, err) defer restoreRlimit() - prog, err := cebpf.NewProgram(coll.Programs["tracepoint__sched_switch"]) + prog, err := cebpf.NewProgram(coll.Programs["tracepoint_integration__sched_switch"]) require.NoError(t, err) defer prog.Close() diff --git a/tracer/systemconfig.go b/tracer/systemconfig.go index 0e3a3daf..b8dc3ad4 100644 --- a/tracer/systemconfig.go +++ b/tracer/systemconfig.go @@ -227,7 +227,7 @@ func determineStackLayout(coll *cebpf.CollectionSpec, maps map[string]*cebpf.Map func loadSystemConfig(coll *cebpf.CollectionSpec, maps map[string]*cebpf.Map, kernelSymbols *libpf.SymbolMap, includeTracers types.IncludedTracers, - filterErrorFrames bool) error { + offCPUThreshold uint32, filterErrorFrames bool) error { pacMask := pacmask.GetPACMask() if pacMask != 0 { log.Infof("Determined PAC mask to be 0x%016X", pacMask) @@ -237,6 +237,7 @@ func loadSystemConfig(coll *cebpf.CollectionSpec, maps map[string]*cebpf.Map, syscfg := C.SystemConfig{ inverse_pac_mask: ^C.u64(pacMask), drop_error_only_traces: C.bool(filterErrorFrames), + off_cpu_threshold: C.u32(offCPUThreshold), } if err := parseBTF(&syscfg); err != nil { diff --git a/tracer/tracer.go b/tracer/tracer.go index 2da8bff5..2f7d8352 100644 --- a/tracer/tracer.go +++ b/tracer/tracer.go @@ -19,6 +19,7 @@ import ( "unsafe" cebpf "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" "github.com/cilium/ebpf/link" "github.com/elastic/go-perf" log "github.com/sirupsen/logrus" @@ -64,6 +65,12 @@ const ( probProfilingDisable = -1 ) +const ( + // OffCpuThresholdMax defines the upper bound for the off-cpu profiling + // threshold. + OffCPUThresholdMax = support.OffCPUThresholdMax +) + // Intervals is a subset of config.IntervalsAndTimers. type Intervals interface { MonitorInterval() time.Duration @@ -153,6 +160,8 @@ type Config struct { ProbabilisticInterval time.Duration // ProbabilisticThreshold is the threshold for probabilistic profiling. ProbabilisticThreshold uint + // OffCPUThreshold is the user defined threshold for off-cpu profiling. + OffCPUThreshold uint32 } // hookPoint specifies the group and name of the hooked point in the kernel. @@ -160,6 +169,18 @@ type hookPoint struct { group, name string } +// progLoaderHelper supports the loading process of eBPF programs. +type progLoaderHelper struct { + // enable tells whether a prog shall be loaded. 
+ enable bool + // name of the eBPF program + name string + // progID defines the ID for the eBPF program that is used as key in the tailcallMap. + progID uint32 + // noTailCallTarget indicates if this eBPF program should be added to the tailcallMap. + noTailCallTarget bool +} + // processKernelModulesMetadata computes the FileID of kernel files and reports executable metadata // for all kernel modules and the vmlinux image. func processKernelModulesMetadata(rep reporter.SymbolReporter, kernelModules *libpf.SymbolMap, @@ -267,9 +288,7 @@ func NewTracer(ctx context.Context, cfg *Config) (*Tracer, error) { } // Based on includeTracers we decide later which are loaded into the kernel. - ebpfMaps, ebpfProgs, err := initializeMapsAndPrograms(cfg.IncludeTracers, kernelSymbols, - cfg.FilterErrorFrames, cfg.MapScaleFactor, cfg.KernelVersionCheck, cfg.DebugTracer, - cfg.BPFVerifierLogLevel) + ebpfMaps, ebpfProgs, err := initializeMapsAndPrograms(kernelSymbols, cfg) if err != nil { return nil, fmt.Errorf("failed to load eBPF code: %v", err) } @@ -369,9 +388,7 @@ func buildStackDeltaTemplates(coll *cebpf.CollectionSpec) error { // initializeMapsAndPrograms loads the definitions for the eBPF maps and programs provided // by the embedded elf file and loads these into the kernel. -func initializeMapsAndPrograms(includeTracers types.IncludedTracers, - kernelSymbols *libpf.SymbolMap, filterErrorFrames bool, mapScaleFactor int, - kernelVersionCheck bool, debugTracer bool, bpfVerifierLogLevel uint32) ( +func initializeMapsAndPrograms(kernelSymbols *libpf.SymbolMap, cfg *Config) ( ebpfMaps map[string]*cebpf.Map, ebpfProgs map[string]*cebpf.Program, err error) { // Loading specifications about eBPF programs and maps from the embedded elf file // does not load them into the kernel. @@ -379,7 +396,7 @@ func initializeMapsAndPrograms(includeTracers types.IncludedTracers, // References to eBPF maps in the eBPF programs are just placeholders that need to be // replaced by the actual loaded maps later on with RewriteMaps before loading the // programs into the kernel. - coll, err := support.LoadCollectionSpec(debugTracer) + coll, err := support.LoadCollectionSpec(cfg.DebugTracer) if err != nil { return nil, nil, fmt.Errorf("failed to load specification for tracers: %v", err) } @@ -395,7 +412,7 @@ func initializeMapsAndPrograms(includeTracers types.IncludedTracers, // Load all maps into the kernel that are used later on in eBPF programs. So we can rewrite // in the next step the placesholders in the eBPF programs with the file descriptors of the // loaded maps in the kernel. 
- if err = loadAllMaps(coll, ebpfMaps, mapScaleFactor); err != nil { + if err = loadAllMaps(coll, ebpfMaps, cfg.MapScaleFactor); err != nil { return nil, nil, fmt.Errorf("failed to load eBPF maps: %v", err) } @@ -406,7 +423,7 @@ func initializeMapsAndPrograms(includeTracers types.IncludedTracers, return nil, nil, fmt.Errorf("failed to rewrite maps: %v", err) } - if kernelVersionCheck { + if cfg.KernelVersionCheck { var major, minor, patch uint32 major, minor, patch, err = GetCurrentKernelVersion() if err != nil { @@ -426,13 +443,68 @@ func initializeMapsAndPrograms(includeTracers types.IncludedTracers, } } - if err = loadUnwinders(coll, ebpfProgs, ebpfMaps["progs"], includeTracers, - bpfVerifierLogLevel); err != nil { - return nil, nil, fmt.Errorf("failed to load eBPF programs: %v", err) + tailCallProgs := []progLoaderHelper{ + { + progID: uint32(support.ProgUnwindStop), + name: "unwind_stop", + enable: true, + }, + { + progID: uint32(support.ProgUnwindNative), + name: "unwind_native", + enable: true, + }, + { + progID: uint32(support.ProgUnwindHotspot), + name: "unwind_hotspot", + enable: cfg.IncludeTracers.Has(types.HotspotTracer), + }, + { + progID: uint32(support.ProgUnwindPerl), + name: "unwind_perl", + enable: cfg.IncludeTracers.Has(types.PerlTracer), + }, + { + progID: uint32(support.ProgUnwindPHP), + name: "unwind_php", + enable: cfg.IncludeTracers.Has(types.PHPTracer), + }, + { + progID: uint32(support.ProgUnwindPython), + name: "unwind_python", + enable: cfg.IncludeTracers.Has(types.PythonTracer), + }, + { + progID: uint32(support.ProgUnwindRuby), + name: "unwind_ruby", + enable: cfg.IncludeTracers.Has(types.RubyTracer), + }, + { + progID: uint32(support.ProgUnwindV8), + name: "unwind_v8", + enable: cfg.IncludeTracers.Has(types.V8Tracer), + }, + { + progID: uint32(support.ProgUnwindDotnet), + name: "unwind_dotnet", + enable: cfg.IncludeTracers.Has(types.DotnetTracer), + }, + } + + if err = loadPerfUnwinders(coll, ebpfProgs, ebpfMaps["perf_progs"], tailCallProgs, + cfg.BPFVerifierLogLevel); err != nil { + return nil, nil, fmt.Errorf("failed to load perf eBPF programs: %v", err) } - if err = loadSystemConfig(coll, ebpfMaps, kernelSymbols, includeTracers, - filterErrorFrames); err != nil { + if cfg.OffCPUThreshold < OffCPUThresholdMax { + if err = loadKProbeUnwinders(coll, ebpfProgs, ebpfMaps["kprobe_progs"], tailCallProgs, + cfg.BPFVerifierLogLevel, ebpfMaps["perf_progs"].FD()); err != nil { + return nil, nil, fmt.Errorf("failed to load kprobe eBPF programs: %v", err) + } + } + + if err = loadSystemConfig(coll, ebpfMaps, kernelSymbols, cfg.IncludeTracers, + cfg.OffCPUThreshold, cfg.FilterErrorFrames); err != nil { return nil, nil, fmt.Errorf("failed to load system config: %v", err) } @@ -503,129 +575,172 @@ func loadAllMaps(coll *cebpf.CollectionSpec, ebpfMaps map[string]*cebpf.Map, return nil } -// loadUnwinders just satisfies the proof of concept and loads all eBPF programs -func loadUnwinders(coll *cebpf.CollectionSpec, ebpfProgs map[string]*cebpf.Program, - tailcallMap *cebpf.Map, includeTracers types.IncludedTracers, +// loadPerfUnwinders just satisfies the proof of concept and loads all eBPF programs +func loadPerfUnwinders(coll *cebpf.CollectionSpec, ebpfProgs map[string]*cebpf.Program, + tailcallMap *cebpf.Map, tailCallProgs []progLoaderHelper, bpfVerifierLogLevel uint32) error { - restoreRlimit, err := rlimit.MaximizeMemlock() - if err != nil { - return fmt.Errorf("failed to adjust rlimit: %v", err) + programOptions := cebpf.ProgramOptions{ + LogLevel: 
cebpf.LogLevel(bpfVerifierLogLevel), } - defer restoreRlimit() - type prog struct { - // enable tells whether a prog shall be loaded. - enable bool - // name of the eBPF program - name string - // progID defines the ID for the eBPF program that is used as key in the tailcallMap. - progID uint32 - // noTailCallTarget indicates if this eBPF program should be added to the tailcallMap. - noTailCallTarget bool + progs := make([]progLoaderHelper, len(tailCallProgs)+2) + copy(progs, tailCallProgs) + progs = append(progs, + progLoaderHelper{ + name: "tracepoint__sched_process_exit", + noTailCallTarget: true, + enable: true, + }, + progLoaderHelper{ + name: "native_tracer_entry", + noTailCallTarget: true, + enable: true, + }) + + for _, unwindProg := range progs { + if !unwindProg.enable { + continue + } + + progSpec, ok := coll.Programs[unwindProg.name] + if !ok { + return fmt.Errorf("program %s does not exist", unwindProg.name) + } + + if err := loadProgram(ebpfProgs, tailcallMap, unwindProg.progID, progSpec, + programOptions, unwindProg.noTailCallTarget); err != nil { + return err + } + } + + return nil +} + +// progArrayReferences returns a list of instructions which load a specified tail +// call FD. +func progArrayReferences(perfTailCallMapFD int, insns asm.Instructions) []int { + insNos := []int{} + for i := range insns { + ins := &insns[i] + if asm.OpCode(ins.OpCode.Class()) != asm.OpCode(asm.LdClass) { + continue + } + m := ins.Map() + if m == nil { + continue + } + if perfTailCallMapFD == m.FD() { + insNos = append(insNos, i) + } } + return insNos +} +// loadKProbeUnwinders reuses large parts of loadPerfUnwinders. By default all eBPF programs +// are written as perf event eBPF programs. loadKProbeUnwinders dynamically rewrites the +// specification of these programs to krpobe eBPF programs and adjusts tail call maps. 
+func loadKProbeUnwinders(coll *cebpf.CollectionSpec, ebpfProgs map[string]*cebpf.Program, + tailcallMap *cebpf.Map, tailCallProgs []progLoaderHelper, + bpfVerifierLogLevel uint32, perfTailCallMapFD int) error { programOptions := cebpf.ProgramOptions{ LogLevel: cebpf.LogLevel(bpfVerifierLogLevel), } - for _, unwindProg := range []prog{ - { - progID: uint32(support.ProgUnwindStop), - name: "unwind_stop", - enable: true, - }, - { - progID: uint32(support.ProgUnwindNative), - name: "unwind_native", - enable: true, - }, - { - progID: uint32(support.ProgUnwindHotspot), - name: "unwind_hotspot", - enable: includeTracers.Has(types.HotspotTracer), - }, - { - progID: uint32(support.ProgUnwindPerl), - name: "unwind_perl", - enable: includeTracers.Has(types.PerlTracer), - }, - { - progID: uint32(support.ProgUnwindPHP), - name: "unwind_php", - enable: includeTracers.Has(types.PHPTracer), - }, - { - progID: uint32(support.ProgUnwindPython), - name: "unwind_python", - enable: includeTracers.Has(types.PythonTracer), - }, - { - progID: uint32(support.ProgUnwindRuby), - name: "unwind_ruby", - enable: includeTracers.Has(types.RubyTracer), - }, - { - progID: uint32(support.ProgUnwindV8), - name: "unwind_v8", - enable: includeTracers.Has(types.V8Tracer), - }, - { - progID: uint32(support.ProgUnwindDotnet), - name: "unwind_dotnet", - enable: includeTracers.Has(types.DotnetTracer), - }, - { - name: "tracepoint__sched_process_exit", + progs := make([]progLoaderHelper, len(tailCallProgs)+2) + copy(progs, tailCallProgs) + progs = append(progs, + progLoaderHelper{ + name: "finish_task_switch", noTailCallTarget: true, enable: true, }, - { - name: "native_tracer_entry", + progLoaderHelper{ + name: "tracepoint__sched_switch", noTailCallTarget: true, enable: true, }, - } { + ) + + for _, unwindProg := range progs { if !unwindProg.enable { continue } - // Load the eBPF program into the kernel. If no error is returned, - // the eBPF program can be used/called/triggered from now on. - unwinder, err := cebpf.NewProgramWithOptions(coll.Programs[unwindProg.name], - programOptions) - if err != nil { - // These errors tend to have hundreds of lines (or more), - // so we print each line individually. - if ve, ok := err.(*cebpf.VerifierError); ok { - for _, line := range ve.Log { - log.Error(line) - } - } else { - scanner := bufio.NewScanner(strings.NewReader(err.Error())) - for scanner.Scan() { - log.Error(scanner.Text()) - } + progSpec, ok := coll.Programs[unwindProg.name] + if !ok { + return fmt.Errorf("program %s does not exist", unwindProg.name) + } + + // Replace the prog array for the tail calls. + insns := progArrayReferences(perfTailCallMapFD, progSpec.Instructions) + for _, ins := range insns { + if err := progSpec.Instructions[ins].AssociateMap(tailcallMap); err != nil { + return fmt.Errorf("failed to rewrite map ptr: %v", err) } - return fmt.Errorf("failed to load %s", unwindProg.name) } - ebpfProgs[unwindProg.name] = unwinder - fd := uint32(unwinder.FD()) - if unwindProg.noTailCallTarget { - continue + // All the tail call targets are perf event programs. To be able to tail call them + // from a kprobe, adjust their specification. 
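+		// The kprobe and tracepoint entry points (noTailCallTarget) already carry the
+		// correct program type from their SEC() annotations and are left untouched.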
+ if !unwindProg.noTailCallTarget { + // Adjust program type + progSpec.Type = cebpf.Kprobe + + // Adjust program name for easier debugging + progSpec.Name = "kp_" + progSpec.Name } - if err := tailcallMap.Update(unsafe.Pointer(&unwindProg.progID), unsafe.Pointer(&fd), - cebpf.UpdateAny); err != nil { - // Every eBPF program that is loaded within loadUnwinders can be the - // destination of a tail call of another eBPF program. If we can not update - // the eBPF map that manages these destinations our unwinding will fail. - return fmt.Errorf("failed to update tailcall map: %v", err) + if err := loadProgram(ebpfProgs, tailcallMap, unwindProg.progID, progSpec, + programOptions, unwindProg.noTailCallTarget); err != nil { + return err } } return nil } +// loadProgram loads an eBPF program from progSpec and populates the related maps. +func loadProgram(ebpfProgs map[string]*cebpf.Program, tailcallMap *cebpf.Map, + progID uint32, progSpec *cebpf.ProgramSpec, programOptions cebpf.ProgramOptions, + noTailCallTarget bool) error { + restoreRlimit, err := rlimit.MaximizeMemlock() + if err != nil { + return fmt.Errorf("failed to adjust rlimit: %v", err) + } + defer restoreRlimit() + + // Load the eBPF program into the kernel. If no error is returned, + // the eBPF program can be used/called/triggered from now on. + unwinder, err := cebpf.NewProgramWithOptions(progSpec, programOptions) + if err != nil { + // These errors tend to have hundreds of lines (or more), + // so we print each line individually. + if ve, ok := err.(*cebpf.VerifierError); ok { + for _, line := range ve.Log { + log.Error(line) + } + } else { + scanner := bufio.NewScanner(strings.NewReader(err.Error())) + for scanner.Scan() { + log.Error(scanner.Text()) + } + } + return fmt.Errorf("failed to load %s", progSpec.Name) + } + ebpfProgs[progSpec.Name] = unwinder + + if noTailCallTarget { + return nil + } + fd := uint32(unwinder.FD()) + if err := tailcallMap.Update(unsafe.Pointer(&progID), unsafe.Pointer(&fd), + cebpf.UpdateAny); err != nil { + // Every eBPF program that is loaded within loadUnwinders can be the + // destination of a tail call of another eBPF program. If we can not update + // the eBPF map that manages these destinations our unwinding will fail. + return fmt.Errorf("failed to update tailcall map: %v", err) + } + return nil +} + // insertKernelFrames fetches the kernel stack frames for a particular kstackID and populates // the trace with these kernel frames. It also allocates the memory for the frames of the trace. // It returns the number of kernel frames for kstackID or an error. @@ -1158,6 +1273,34 @@ func (t *Tracer) StartProbabilisticProfiling(ctx context.Context) { }) } +// StartOffCPUProfiling starts off-cpu profiling by attaching the programs to the hooks. +func (t *Tracer) StartOffCPUProfiling() error { + // Attach the second hook for off-cpu profiling first. + kprobeProg, ok := t.ebpfProgs["finish_task_switch"] + if !ok { + return errors.New("off-cpu program finish_task_switch is not available") + } + + kprobeLink, err := link.Kprobe("finish_task_switch.isra.0", kprobeProg, nil) + if err != nil { + return err + } + t.hooks[hookPoint{group: "kprobe", name: "finish_task_switch"}] = kprobeLink + + // Attach the first hook that enables off-cpu profiling. 
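+	// Once sched/sched_switch is attached, sampled context switches store a timestamp
+	// in profile_off_cpu and are picked up by the finish_task_switch kprobe above.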
+	tpProg, ok := t.ebpfProgs["tracepoint__sched_switch"]
+	if !ok {
+		return errors.New("tracepoint__sched_switch is not available")
+	}
+	tpLink, err := link.Tracepoint("sched", "sched_switch", tpProg, nil)
+	if err != nil {
+		return err
+	}
+	t.hooks[hookPoint{group: "sched", name: "sched_switch"}] = tpLink
+
+	return nil
+}
+
 // TraceProcessor gets the trace processor.
 func (t *Tracer) TraceProcessor() tracehandler.TraceProcessor {
 	return t.processManager
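
Note on the new -off-cpu-threshold flag: tracepoint__sched_switch only keeps a context switch when the random draw bpf_get_prandom_u32()%OFF_CPU_THRESHOLD_MAX does not exceed the configured threshold, so the flag value is effectively a sampling rate in units of 1/OffCPUThresholdMax (1/1000). The sketch below is illustrative only and not part of this patch; samplingProbability is a hypothetical helper that mirrors that check, and the OffCPUThresholdMax constant mirrors OFF_CPU_THRESHOLD_MAX from support/ebpf/types.h.

    package main

    import "fmt"

    // OffCPUThresholdMax mirrors OFF_CPU_THRESHOLD_MAX from support/ebpf/types.h.
    const OffCPUThresholdMax = 1000

    // samplingProbability returns the approximate fraction of scheduler switches
    // that the sched_switch hook samples for a given -off-cpu-threshold value.
    func samplingProbability(threshold uint) float64 {
        if threshold >= OffCPUThresholdMax {
            return 0 // default value: off-CPU profiling stays disabled
        }
        return float64(threshold) / float64(OffCPUThresholdMax) // approximate
    }

    func main() {
        for _, t := range []uint{1, 10, 100, 999} {
            fmt.Printf("-off-cpu-threshold=%d -> ~%.1f%% of scheduler switches\n",
                t, 100*samplingProbability(t))
        }
    }

For example, -off-cpu-threshold=10 samples roughly 1% of scheduler switches; each reported trace carries its off-CPU duration in the new offtime field filled in by finish_task_switch.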