diff --git a/drivers/gpu/drm/panthor/panthor_perfcnt.c b/drivers/gpu/drm/panthor/panthor_perfcnt.c
index ac34c11a5fc19bcce109890d7a8283efaa5edabe..b474697d3dfdba0a90488590ffd68f7cc2a81ea0 100644
--- a/drivers/gpu/drm/panthor/panthor_perfcnt.c
+++ b/drivers/gpu/drm/panthor/panthor_perfcnt.c
@@ -74,6 +74,16 @@ struct perfcnt_counters {
 	u32 mmu_l2_enable;
 };
 
+/* Header the hardware writes in the first 16 bytes of each sample block */
+struct perfcnt_slot_header {
+	u64 timestamp;
+	/* Bitmask of counter groups that were enabled in this sample. If bit N
+	 * is set, then counters N*4 to N*4+3 were written. Bit 0 refers to the
+	 * header itself rather than to an actual group of counters. */
+	u32 enabled_groups;
+	u32 _reserved;
+};
+
 static int panthor_perfcnt_enable_counters(struct panthor_device *ptdev,
 					   struct perfcnt_counters *counters)
 {
@@ -145,6 +155,67 @@ static void perfcnt_copy_sample(struct panthor_device *ptdev,
 	       perfcnt->sample_size);
 }
 
+/* Adds enabled counter values from every sample slot between first_idx and
+ * last_idx-1 into the sample slot at last_idx, which ends up holding the sum.
+ *
+ * This is needed because the hardware will automatically trigger samples in
+ * some situations, particularly when a HW unit is powered up or down. We hide
+ * this detail from userspace by accumulating counters from both the manual and
+ * HW-triggered samples into a single sample that counts all events since the
+ * previous manual sample.
+ *
+ * Both indices are free-running; they are masked with perfcnt->ringslots - 1
+ * (so ringslots is assumed to be a power of two) to locate the ring slot. */
+static void accumulate_samples(struct panthor_device *ptdev, u32 first_idx, u32 last_idx)
+{
+	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+	/* Each bit of a block's enable mask covers a group of four u32 counters */
+	const unsigned int group_len = 4;
+	const unsigned int group_size = sizeof(u32) * group_len;
+	const unsigned int group_count = SAMPLE_BLOCK_SIZE / group_size;
+	const u32 slot_mask = perfcnt->ringslots - 1;
+	void *last_slot = perfcnt->bo->kmap + (last_idx & slot_mask) * perfcnt->sample_size;
+	unsigned int block_offset;
+	u32 sample_idx;
+	unsigned int i;
+
+	if (last_idx != first_idx)
+		drm_dbg(&ptdev->base, "Accumulating %u hw-triggered samples\n", last_idx - first_idx);
+
+	/* Disabled counters are not written by the hardware, so we need to zero
+	 * them before accumulating previous samples where the counter may have
+	 * been enabled */
+	for (block_offset = 0; block_offset < perfcnt->sample_size; block_offset += SAMPLE_BLOCK_SIZE) {
+		struct perfcnt_slot_header *hdr = last_slot + block_offset;
+		unsigned long disabled_groups = ~hdr->enabled_groups;
+		/* Start at 1 to skip the first group (the header) */
+		unsigned int group = 1;
+
+		for_each_set_bit_from(group, &disabled_groups, group_count)
+			memset(last_slot + block_offset + group_size * group, 0, group_size);
+	}
+
+	for (sample_idx = first_idx; sample_idx < last_idx; sample_idx++) {
+		void *slot = perfcnt->bo->kmap + (sample_idx & slot_mask) * perfcnt->sample_size;
+
+		for (block_offset = 0; block_offset < perfcnt->sample_size; block_offset += SAMPLE_BLOCK_SIZE) {
+			const u32 *block = slot + block_offset;
+			u32 *last_block = last_slot + block_offset;
+			const struct perfcnt_slot_header *hdr = slot + block_offset;
+			unsigned long enabled_groups = hdr->enabled_groups;
+			/* Start at 1 to skip the first group (the header) */
+			unsigned int group = 1;
+
+			/* Sum only the groups this older sample wrote. Indexing by
+			 * group keeps headers intact; the total lands in last_slot. */
+			for_each_set_bit_from(group, &enabled_groups, group_count) {
+				for (i = group * group_len; i < (group + 1) * group_len; i++)
+					last_block[i] += block[i];
+			}
+		}
+	}
+}
+
 static void clear_slot_headers(struct panthor_device *ptdev, u32 ext_idx, u32 ins_idx)
 {
 	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
@@ -168,17 +239,16 @@ static void clear_slot_headers(struct panthor_device *ptdev, u32 ext_idx, u32 in
 	}
 }
 
-static void clean_dumper_list(struct panthor_device *ptdev, unsigned int status)
+static void clean_dumper_list(struct panthor_device *ptdev, unsigned int status, u32 sample_index)
 {
 	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
-	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
 	struct panthor_perfcnt_dumper *dumper, *dumper_tmp;
 
 	mutex_lock(&perfcnt->lock);
 	list_for_each_entry_safe(dumper, dumper_tmp, &perfcnt->dumper_list, list) {
 		if (status == PERFCNT_STATUS_SUCCEEDED)
 			perfcnt_copy_sample(ptdev, perfcnt, dumper->user_bo,
-					    glb_iface->output->perfcnt_insert - 1);
+					    sample_index);
 		list_del(&dumper->list);
 		INIT_LIST_HEAD(&dumper->list);
 		dumper->last_status = status;
@@ -207,7 +277,7 @@ static void perfcnt_process_sample(struct work_struct *work)
 
 	if (glb_iface->output->perfcnt_status & GLB_PERFCNT_STATUS_FAILED) {
 		drm_err(&ptdev->base, "Perfcounter sampling failed\n");
-		clean_dumper_list(ptdev, PERFCNT_STATUS_FAILED);
+		clean_dumper_list(ptdev, PERFCNT_STATUS_FAILED, 0);
 		goto worker_exit;
 	}
 
@@ -218,26 +288,28 @@ static void perfcnt_process_sample(struct work_struct *work)
 		ret = panthor_fw_glb_wait_acks(ptdev, flipped, &acked, 100);
 		if (ret)
 			drm_err(&ptdev->base, "Resetting Overflow flags failed\n");
-		clean_dumper_list(ptdev, PERFCNT_STATUS_OVERFLOW);
+		clean_dumper_list(ptdev, PERFCNT_STATUS_OVERFLOW, 0);
 		goto clear_inc_idx;
 	}
 
 	if (glb_iface->output->perfcnt_status & PERFCNT_OP_AFFECTED)
 		drm_warn(&ptdev->base, "Perfcnt sample operation might have been impacted by a power transition or protected session exec\n");
 
-	clean_dumper_list(ptdev, PERFCNT_STATUS_SUCCEEDED);
+	u32 ext_idx = READ_ONCE(glb_iface->input->perfcnt_extract);
+	u32 ins_idx = READ_ONCE(glb_iface->output->perfcnt_insert);
+
+	accumulate_samples(ptdev, ext_idx, ins_idx - 1);
+	clean_dumper_list(ptdev, PERFCNT_STATUS_SUCCEEDED, ins_idx - 1);
 
 clear_inc_idx:
-	clear_slot_headers(ptdev, glb_iface->input->perfcnt_extract,
-			   glb_iface->output->perfcnt_insert);
+	clear_slot_headers(ptdev, ext_idx, ins_idx);
 	/*
 	 * TRM recommends increasing the extract pointer by one after every sample
 	 * operation, but because sample requests are processed sequentially and we
-	 * discard samples triggered by the HW automatically, it's best if we simply
-	 * set it to the next insert slot index.
+	 * accumulate samples triggered by the HW automatically into the manual
+	 * sample, it's best if we simply set it to the next insert slot index.
 	 */
-	WRITE_ONCE(glb_iface->input->perfcnt_extract,
-		   READ_ONCE(glb_iface->output->perfcnt_insert));
+	WRITE_ONCE(glb_iface->input->perfcnt_extract, ins_idx);
 worker_exit:
 	wake_up_all(&perfcnt->wq);
 }
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index 3294240476e702669a8812186c5bee2a88521be0..d513704c134ae3b7f0b92ddd6a2b77b164857efd 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -1036,6 +1036,14 @@ struct drm_panthor_perfcnt_config {
 
 /**
  * struct drm_panthor_perfcnt_dump - Arguments passed to DRM_IOCTL_PANTHOR_PERFCNT_DUMP
+ *
+ * Counter dumps match the sample layout/semantics from the hardware, except for
+ * the behavior of the enable masks in the headers. The enable masks reflect
+ * which counter groups were enabled at the end of the sample period. This does
+ * not necessarily mean they were in that state for the entire period. Counters
+ * in a group reported as disabled still hold the number of events counted
+ * since the last dump, which may be nonzero if the group was enabled for part
+ * of the sample period.
  */
 struct drm_panthor_perfcnt_dump {
 	/** @handle: Handle of the BO to write perfcnt dump into */