From c33030d323eec1cb636c444626967e12be80dac8 Mon Sep 17 00:00:00 2001
From: Benjamin Lee <benjamin.lee@collabora.com>
Date: Thu, 20 Mar 2025 19:54:12 -0700
Subject: [PATCH] drm/panthor: accumulate counters from HW-triggered samples

Because counter values are written as deltas since the previous sample,
we need to sum the values from all samples since the previous manual
sample in order to pass accurate counts to userspace. This is
especially important when counters are disabled in between manual
samples.

The main alternative approach would be to pass the HW-triggered samples
to userspace and let the userspace program decide how it wants to
handle them. That would mean a more complicated uapi, and the ring
buffer could overflow if userspace did not sample frequently enough.

Signed-off-by: Benjamin Lee <benjamin.lee@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_perfcnt.c | 96 ++++++++++++++++++++---
 include/uapi/drm/panthor_drm.h            |  8 ++
 2 files changed, 92 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_perfcnt.c b/drivers/gpu/drm/panthor/panthor_perfcnt.c
index ac34c11a5fc1..b474697d3dfd 100644
--- a/drivers/gpu/drm/panthor/panthor_perfcnt.c
+++ b/drivers/gpu/drm/panthor/panthor_perfcnt.c
@@ -74,6 +74,16 @@ struct perfcnt_counters {
 	u32 mmu_l2_enable;
 };
 
+/* Header written by the hardware to the first 16 bytes of each sample block */
+struct perfcnt_slot_header {
+	u64 timestamp;
+	/* Bitmask of counter groups that were enabled in this sample. If bit N
+	 * is set, then counters N*4 to N*4+3 were written. Bit 0 references
+	 * the header, rather than an actual group of counters. */
+	u32 enabled_groups;
+	u32 _reserved;
+};
+
 static int panthor_perfcnt_enable_counters(struct panthor_device *ptdev,
 					   struct perfcnt_counters *counters)
 {
@@ -145,6 +155,67 @@ static void perfcnt_copy_sample(struct panthor_device *ptdev,
 			  perfcnt->sample_size);
 }
 
+/* Adds the enabled counter values from every sample slot between first_idx
+ * and last_idx-1 into the slot at last_idx.
+ *
+ * This is needed because the hardware will automatically trigger samples in
+ * some situations, particularly when a HW unit is powered up or down. We hide
+ * this detail from userspace by accumulating counters from both the manual and
+ * HW-triggered samples into a single sample that counts all events since the
+ * previous manual sample.
+ */
+static void accumulate_samples(struct panthor_device *ptdev, u32 first_idx, u32 last_idx)
+{
+	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+	unsigned int group_size = sizeof(u32) * 4;
+	unsigned int group_count = SAMPLE_BLOCK_SIZE / group_size;
+	void *last_slot = perfcnt->bo->kmap +
+		((last_idx & (perfcnt->ringslots - 1)) * perfcnt->sample_size);
+	unsigned int sample_idx;
+	unsigned int block_offset;
+	unsigned int i;
+
+	if (last_idx != first_idx)
+		drm_dbg(&ptdev->base, "Accumulating %u hw-triggered samples\n",
+			last_idx - first_idx);
+
+	/* Disabled counters are not written by the hardware, so we need to zero
+	 * them in the final slot before accumulating earlier samples where the
+	 * counters may have been enabled. */
+	for (block_offset = 0; block_offset < perfcnt->sample_size; block_offset += SAMPLE_BLOCK_SIZE) {
+		struct perfcnt_slot_header *hdr = last_slot + block_offset;
+		unsigned long disabled_groups = ~hdr->enabled_groups;
+		/* Skip the first group (the header) */
+		unsigned int group = 1;
+
+		for_each_set_bit_from(group, &disabled_groups, group_count) {
+			unsigned int offset = block_offset + group_size * group;
+
+			memset(last_slot + offset, 0, group_size);
+		}
+	}
+
+	for (sample_idx = first_idx; sample_idx < last_idx; sample_idx++) {
+		void *slot = perfcnt->bo->kmap +
+			((sample_idx & (perfcnt->ringslots - 1)) *
+			 perfcnt->sample_size);
+
+		for (block_offset = 0; block_offset < perfcnt->sample_size; block_offset += SAMPLE_BLOCK_SIZE) {
+			u32 *block = slot + block_offset;
+			u32 *last_block = last_slot + block_offset;
+			struct perfcnt_slot_header *hdr = slot + block_offset;
+			unsigned long enabled_groups = hdr->enabled_groups;
+			/* Skip the first group (the header) */
+			unsigned int group = 1;
+
+			for_each_set_bit_from(group, &enabled_groups, group_count) {
+				/* Add this sample's deltas into the final slot */
+				for (i = 0; i < 4; i++)
+					last_block[group * 4 + i] += block[group * 4 + i];
+			}
+		}
+	}
+}
+
 static void clear_slot_headers(struct panthor_device *ptdev, u32 ext_idx, u32 ins_idx)
 {
 	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
@@ -168,17 +239,16 @@ static void clear_slot_headers(struct panthor_device *ptdev, u32 ext_idx, u32 in
 	}
 }
 
-static void clean_dumper_list(struct panthor_device *ptdev, unsigned int status)
+static void clean_dumper_list(struct panthor_device *ptdev, unsigned int status, u32 sample_index)
 {
 	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
-	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
 	struct panthor_perfcnt_dumper *dumper, *dumper_tmp;
 
 	mutex_lock(&perfcnt->lock);
 	list_for_each_entry_safe(dumper, dumper_tmp, &perfcnt->dumper_list, list) {
 		if (status == PERFCNT_STATUS_SUCCEEDED)
 			perfcnt_copy_sample(ptdev, perfcnt, dumper->user_bo,
-					    glb_iface->output->perfcnt_insert - 1);
+					    sample_index);
 		list_del(&dumper->list);
 		INIT_LIST_HEAD(&dumper->list);
 		dumper->last_status = status;
@@ -207,7 +277,7 @@ static void perfcnt_process_sample(struct work_struct *work)
 
+	/* Snapshot the ring indices once, so every path below (including the
+	 * overflow path that jumps to clear_inc_idx) works on the same window
+	 * of samples. */
+	u32 ext_idx = READ_ONCE(glb_iface->input->perfcnt_extract);
+	u32 ins_idx = READ_ONCE(glb_iface->output->perfcnt_insert);
+
 	if (glb_iface->output->perfcnt_status & GLB_PERFCNT_STATUS_FAILED) {
 		drm_err(&ptdev->base, "Perfcounter sampling failed\n");
-		clean_dumper_list(ptdev, PERFCNT_STATUS_FAILED);
+		clean_dumper_list(ptdev, PERFCNT_STATUS_FAILED, 0);
 		goto worker_exit;
 	}
 
@@ -218,26 +288,28 @@ static void perfcnt_process_sample(struct work_struct *work)
 		ret = panthor_fw_glb_wait_acks(ptdev, flipped, &acked, 100);
 		if (ret)
 			drm_err(&ptdev->base, "Resetting Overflow flags failed\n");
-		clean_dumper_list(ptdev, PERFCNT_STATUS_OVERFLOW);
+		clean_dumper_list(ptdev, PERFCNT_STATUS_OVERFLOW, 0);
 		goto clear_inc_idx;
 	}
 
 	if (glb_iface->output->perfcnt_status &
	    PERFCNT_OP_AFFECTED)
 		drm_warn(&ptdev->base,
 			 "Perfcnt sample operation might have been impacted by a power transition or protected session exec\n");
 
-	clean_dumper_list(ptdev, PERFCNT_STATUS_SUCCEEDED);
+	accumulate_samples(ptdev, ext_idx, ins_idx - 1);
+	clean_dumper_list(ptdev, PERFCNT_STATUS_SUCCEEDED, ins_idx - 1);
 
 clear_inc_idx:
-	clear_slot_headers(ptdev, glb_iface->input->perfcnt_extract,
-			   glb_iface->output->perfcnt_insert);
+	clear_slot_headers(ptdev, ext_idx, ins_idx);
 
 	/*
 	 * TRM recommends increasing the extract pointer by one after every sample
 	 * operation, but because sample requests are processed sequentially and we
-	 * discard samples triggered by the HW automatically, it's best if we simply
-	 * set it to the next insert slot index.
+	 * accumulate samples triggered automatically by the HW into the manual
+	 * sample, it's best if we simply set it to the next insert slot index.
 	 */
-	WRITE_ONCE(glb_iface->input->perfcnt_extract,
-		   READ_ONCE(glb_iface->output->perfcnt_insert));
+	WRITE_ONCE(glb_iface->input->perfcnt_extract, ins_idx);
 
 worker_exit:
 	wake_up_all(&perfcnt->wq);
 }
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index 3294240476e7..d513704c134a 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -1036,6 +1036,14 @@ struct drm_panthor_perfcnt_config {
 
 /**
  * struct drm_panthor_perfcnt_dump - Arguments passed to DRM_IOCTL_PANTHOR_PERFCNT_DUMP
+ *
+ * Counter dumps match the sample layout and semantics used by the hardware,
+ * except for the behavior of the enable masks in the headers. The enable
+ * masks reflect which counter groups were enabled at the end of the sample
+ * period, not necessarily for the whole period. Counter values always count
+ * events since the last dump, so a counter that is disabled at dump time may
+ * still hold a nonzero value if it was enabled for part of the sample
+ * period.
  */
struct drm_panthor_perfcnt_dump {
	/** @handle: Handle of the BO to write perfcnt dump into */
-- 
GitLab
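
For reference, here is a minimal userspace sketch of how a dump written into the BO could be walked, following the block-header layout and enable-mask semantics documented above. The block size, block count, and helper names are illustrative assumptions, not values taken from this patch; real code would get the dump geometry from the uapi.

/* Illustrative only: walk one perfcnt dump that has been mapped into the
 * process. Each block is assumed to be block_size bytes and to start with the
 * 16-byte header described in this patch (u64 timestamp, u32 enabled_groups,
 * u32 reserved), followed by groups of four 32-bit counters. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void parse_perfcnt_dump(const void *dump, size_t block_size,
			       size_t num_blocks)
{
	/* enabled_groups is 32 bits, so a block has at most 32 groups */
	size_t num_groups = block_size / (4 * sizeof(uint32_t));

	for (size_t b = 0; b < num_blocks; b++) {
		const uint8_t *block = (const uint8_t *)dump + b * block_size;
		const uint32_t *vals = (const uint32_t *)block;
		uint32_t enabled;

		/* Header: bytes 0-7 timestamp, bytes 8-11 enabled_groups */
		memcpy(&enabled, block + 8, sizeof(enabled));

		/* Group 0 is the header; counters start at group 1. A clear
		 * bit only means the group was disabled at the end of the
		 * period; the accumulated values may still be nonzero. */
		for (size_t g = 1; g < num_groups && g < 32; g++) {
			if (!(enabled & (1u << g)))
				continue;
			for (size_t i = 0; i < 4; i++)
				printf("block %zu counter %zu: %u\n",
				       b, g * 4 + i, vals[g * 4 + i]);
		}
	}
}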