diff --git a/drivers/gpu/drm/panthor/panthor_perfcnt.c b/drivers/gpu/drm/panthor/panthor_perfcnt.c
index ac34c11a5fc19bcce109890d7a8283efaa5edabe..b474697d3dfdba0a90488590ffd68f7cc2a81ea0 100644
--- a/drivers/gpu/drm/panthor/panthor_perfcnt.c
+++ b/drivers/gpu/drm/panthor/panthor_perfcnt.c
@@ -74,6 +74,16 @@ struct perfcnt_counters {
 	u32 mmu_l2_enable;
 };
 
+/* Header written to the first 16 bytes of each sample by the hardware */
+struct perfcnt_slot_header {
+	u64 timestamp;
+	/* Bitmask of counter groups that were enabled in this sample. If bit N
+	 * is set, then counters N*4 to (N+1)*4 - 1 were written. Bit 0 references
+	 * the header, rather than an actual group of counters. */
+	u32 enabled_groups;
+	u32 _reserved;
+};
+
 static int panthor_perfcnt_enable_counters(struct panthor_device *ptdev,
 					   struct perfcnt_counters *counters)
 {
@@ -145,6 +155,67 @@ static void perfcnt_copy_sample(struct panthor_device *ptdev,
 			    perfcnt->sample_size);
 }
 
+/* Adds enabled counter values from every sample slot between first_idx and
+ * last_idx-1, writing the sum to last_idx.
+ *
+ * This is needed because the hardware will automatically trigger samples in
+ * some situations, particularly when a HW unit is powered up or down. We hide
+ * this detail from userspace by accumulating counters from both the manual and
+ * HW-triggered samples into a single sample that counts all events since the
+ * previous manual sample.
+ */
+static void accumulate_samples(struct panthor_device *ptdev, u32 first_idx, u32 last_idx)
+{
+	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+	unsigned int sample_idx;
+	unsigned int block_offset;
+	unsigned int i;
+
+	if (last_idx != first_idx) {
+		drm_dbg(&ptdev->base, "Accumulating %u hw-triggered samples\n", last_idx - first_idx);
+	}
+
+	void *last_slot = perfcnt->bo->kmap +
+		((last_idx & (ptdev->perfcnt->ringslots - 1)) * perfcnt->sample_size);
+
+	unsigned int group_size = sizeof(u32) * 4;
+	unsigned int group_count = SAMPLE_BLOCK_SIZE / group_size;
+
+	/* Disabled counters are not written by the hardware, so we need to zero
+	 * them before accumulating previous samples where the counter may have
+	 * been enabled */
+	for (block_offset = 0; block_offset < perfcnt->sample_size; block_offset += SAMPLE_BLOCK_SIZE) {
+		struct perfcnt_slot_header *hdr = last_slot + block_offset;
+		unsigned long disabled_groups = ~hdr->enabled_groups;
+
+		/* Skip the first group (the header) */
+		unsigned int group = 1;
+		for_each_set_bit_from(group, &disabled_groups, group_count) {
+			unsigned int offset = block_offset + group_size * group;
+			memset(last_slot + offset, 0, group_size);
+		}
+	}
+
+	for (sample_idx = first_idx; sample_idx < last_idx; sample_idx++) {
+		void *slot = perfcnt->bo->kmap +
+			((sample_idx & (ptdev->perfcnt->ringslots - 1))
+			 * perfcnt->sample_size);
+
+		for (block_offset = 0; block_offset < perfcnt->sample_size; block_offset += SAMPLE_BLOCK_SIZE) {
+			u32 *block = slot + block_offset;
+			u32 *last_block = last_slot + block_offset;
+			struct perfcnt_slot_header *hdr = (struct perfcnt_slot_header *)block;
+			unsigned long enabled_groups = hdr->enabled_groups;
+
+			/* Skip the first group (the header) */
+			unsigned int group = 1;
+			for_each_set_bit_from(group, &enabled_groups, group_count) {
+				for (i = 0; i < 4; i++)
+					last_block[group * 4 + i] += block[group * 4 + i];
+			}
+		}
+	}
+}
+
 static void clear_slot_headers(struct panthor_device *ptdev, u32 ext_idx, u32 ins_idx)
 {
 	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
@@ -168,17 +239,16 @@ static void clear_slot_headers(struct panthor_device *ptdev, u32 ext_idx, u32 in
 	}
 }
 
-static void clean_dumper_list(struct panthor_device *ptdev, unsigned int status)
+static void clean_dumper_list(struct panthor_device *ptdev, unsigned int status, u32 sample_index)
 {
 	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
-	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
 	struct panthor_perfcnt_dumper *dumper, *dumper_tmp;
 
 	mutex_lock(&perfcnt->lock);
 	list_for_each_entry_safe(dumper, dumper_tmp, &perfcnt->dumper_list, list) {
 		if (status == PERFCNT_STATUS_SUCCEEDED)
 			perfcnt_copy_sample(ptdev, perfcnt, dumper->user_bo,
-					    glb_iface->output->perfcnt_insert - 1);
+					    sample_index);
 		list_del(&dumper->list);
 		INIT_LIST_HEAD(&dumper->list);
 		dumper->last_status = status;
@@ -207,7 +277,7 @@ static void perfcnt_process_sample(struct work_struct *work)
 
 	if (glb_iface->output->perfcnt_status & GLB_PERFCNT_STATUS_FAILED) {
 		drm_err(&ptdev->base, "Perfcounter sampling failed\n");
-		clean_dumper_list(ptdev, PERFCNT_STATUS_FAILED);
+		clean_dumper_list(ptdev, PERFCNT_STATUS_FAILED, 0);
 		goto worker_exit;
 	}
 
@@ -218,26 +288,28 @@ static void perfcnt_process_sample(struct work_struct *work)
 		ret = panthor_fw_glb_wait_acks(ptdev, flipped, &acked, 100);
 		if (ret)
 			drm_err(&ptdev->base, "Resetting Overflow flags failed\n");
-		clean_dumper_list(ptdev, PERFCNT_STATUS_OVERFLOW);
+		clean_dumper_list(ptdev, PERFCNT_STATUS_OVERFLOW, 0);
 		goto clear_inc_idx;
 	}
 
 	if (glb_iface->output->perfcnt_status & PERFCNT_OP_AFFECTED)
 		drm_warn(&ptdev->base, "Perfcnt sample operation might have been impacted by a power transition or protected session exec\n");
 
-	clean_dumper_list(ptdev, PERFCNT_STATUS_SUCCEEDED);
+	u32 ext_idx = READ_ONCE(glb_iface->input->perfcnt_extract);
+	u32 ins_idx = READ_ONCE(glb_iface->output->perfcnt_insert);
+
+	accumulate_samples(ptdev, ext_idx, ins_idx - 1);
+	clean_dumper_list(ptdev, PERFCNT_STATUS_SUCCEEDED, ins_idx - 1);
 
 clear_inc_idx:
-	clear_slot_headers(ptdev, glb_iface->input->perfcnt_extract,
-			   glb_iface->output->perfcnt_insert);
+	clear_slot_headers(ptdev, ext_idx, ins_idx);
 
 	/*
 	 * TRM recommends increasing the extract pointer by one after every sample
 	 * operation, but because sample requests are processed sequentially and we
-	 * discard samples triggered by the HW automatically, it's best if we simply
-	 * set it to the next insert slot index.
+	 * accumulate samples triggered by the HW automatically into the manual
+	 * sample, it's best if we simply set it to the next insert slot index.
 	 */
-	WRITE_ONCE(glb_iface->input->perfcnt_extract,
-		   READ_ONCE(glb_iface->output->perfcnt_insert));
+	WRITE_ONCE(glb_iface->input->perfcnt_extract, ins_idx);
 
 worker_exit:
 	wake_up_all(&perfcnt->wq);
 }
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index 3294240476e702669a8812186c5bee2a88521be0..d513704c134ae3b7f0b92ddd6a2b77b164857efd 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -1036,6 +1036,14 @@ struct drm_panthor_perfcnt_config {
 
 /**
  * struct drm_panthor_perfcnt_dump - Arguments passed to DRM_IOCTL_PANTHOR_PERFCNT_DUMP
+ *
+ * Counter dumps match the sample layout/semantics from the hardware, except for
+ * the behavior of the enable masks in the headers. The enable masks reflect
+ * which counter groups were enabled at the end of the sample period. This does
+ * not necessarily mean they were in that state for the entire period. Disabled
+ * counter values accurately reflect the count of events since the last dump,
+ * which may be nonzero for disabled counters if they were enabled for part
+ * of the sample period.
 */
 struct drm_panthor_perfcnt_dump {
 	/** @handle: Handle of the BO to write perfcnt dump into */