drm/msm: add perf logging debugfs

Signed-off-by: Rob Clark <robdclark@gmail.com>
This commit is contained in:
Rob Clark 2014-05-30 14:49:43 -04:00
parent a7d3c9509b
commit 70c70f091b
7 changed files with 436 additions and 5 deletions

View File

@ -34,6 +34,7 @@ msm-y := \
msm_gem_submit.o \
msm_gpu.o \
msm_iommu.o \
msm_perf.o \
msm_rd.o \
msm_ringbuffer.o

View File

@ -207,11 +207,11 @@ static int a3xx_hw_init(struct msm_gpu *gpu)
/* Turn on performance counters: */
gpu_write(gpu, REG_A3XX_RBBM_PERFCTR_CTL, 0x01);
/* Set SP perfcounter 7 to count SP_FS_FULL_ALU_INSTRUCTIONS
* we will use this to augment our hang detection:
*/
gpu_write(gpu, REG_A3XX_SP_PERFCOUNTER7_SELECT,
SP_FS_FULL_ALU_INSTRUCTIONS);
/* Enable the perfcntrs that we use.. */
for (i = 0; i < gpu->num_perfcntrs; i++) {
const struct msm_gpu_perfcntr *perfcntr = &gpu->perfcntrs[i];
gpu_write(gpu, perfcntr->select_reg, perfcntr->select_val);
}
gpu_write(gpu, REG_A3XX_RBBM_INT_0_MASK, A3XX_INT0_MASK);
@ -465,6 +465,13 @@ static const struct adreno_gpu_funcs funcs = {
},
};
/* Performance counters exposed for a3xx.  a3xx_hw_init() writes select_val
 * into select_reg for each entry; the values are later read back from
 * sample_reg and reported under the given name.
 */
static const struct msm_gpu_perfcntr perfcntrs[] = {
	{
		.select_reg = REG_A3XX_SP_PERFCOUNTER6_SELECT,
		.sample_reg = REG_A3XX_RBBM_PERFCTR_SP_6_LO,
		.select_val = SP_ALU_ACTIVE_CYCLES,
		.name       = "ALUACTIVE",
	},
	{
		.select_reg = REG_A3XX_SP_PERFCOUNTER7_SELECT,
		.sample_reg = REG_A3XX_RBBM_PERFCTR_SP_7_LO,
		.select_val = SP_FS_FULL_ALU_INSTRUCTIONS,
		.name       = "ALUFULL",
	},
};
struct msm_gpu *a3xx_gpu_init(struct drm_device *dev)
{
struct a3xx_gpu *a3xx_gpu = NULL;
@ -504,6 +511,9 @@ struct msm_gpu *a3xx_gpu_init(struct drm_device *dev)
DBG("fast_rate=%u, slow_rate=%u, bus_freq=%u",
gpu->fast_rate, gpu->slow_rate, gpu->bus_freq);
gpu->perfcntrs = perfcntrs;
gpu->num_perfcntrs = ARRAY_SIZE(perfcntrs);
ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, config->rev);
if (ret)
goto fail;

View File

@ -548,6 +548,12 @@ static int late_init_minor(struct drm_minor *minor)
return ret;
}
ret = msm_perf_debugfs_init(minor);
if (ret) {
dev_err(minor->dev->dev, "could not install perf debugfs\n");
return ret;
}
return 0;
}
@ -588,6 +594,7 @@ static void msm_debugfs_cleanup(struct drm_minor *minor)
if (!minor->dev->dev_private)
return;
msm_rd_debugfs_cleanup(minor);
msm_perf_debugfs_cleanup(minor);
}
#endif

View File

@ -56,6 +56,7 @@ struct msm_kms;
struct msm_gpu;
struct msm_mmu;
struct msm_rd_state;
struct msm_perf_state;
struct msm_gem_submit;
#define NUM_DOMAINS 2 /* one for KMS, then one per gpu core (?) */
@ -85,6 +86,7 @@ struct msm_drm_private {
wait_queue_head_t fence_event;
struct msm_rd_state *rd;
struct msm_perf_state *perf;
/* list of GEM objects: */
struct list_head inactive_list;
@ -212,6 +214,8 @@ int msm_debugfs_late_init(struct drm_device *dev);
int msm_rd_debugfs_init(struct drm_minor *minor);
void msm_rd_debugfs_cleanup(struct drm_minor *minor);
void msm_rd_dump_submit(struct msm_gem_submit *submit);
int msm_perf_debugfs_init(struct drm_minor *minor);
void msm_perf_debugfs_cleanup(struct drm_minor *minor);
#else
static inline int msm_debugfs_late_init(struct drm_device *dev) { return 0; }
static inline void msm_rd_dump_submit(struct msm_gem_submit *submit) {}

View File

@ -319,6 +319,101 @@ static void hangcheck_handler(unsigned long data)
queue_work(priv->wq, &gpu->retire_work);
}
/*
* Performance Counters:
*/
/* Sample every hw perf counter and report the delta since the last sample.
 *
 * Must be called with gpu->perf_lock held.
 *
 * All gpu->num_perfcntrs counters are read and remembered in
 * gpu->last_cntrs, but only the first min(ncntrs, gpu->num_perfcntrs)
 * deltas are written to @cntrs (which may be NULL when @ncntrs is 0).
 *
 * Returns the number of deltas written to @cntrs.
 */
static int update_hw_cntrs(struct msm_gpu *gpu, uint32_t ncntrs, uint32_t *cntrs)
{
	const uint32_t nreport = min(ncntrs, gpu->num_perfcntrs);
	uint32_t idx;

	for (idx = 0; idx < gpu->num_perfcntrs; idx++) {
		uint32_t now = gpu_read(gpu, gpu->perfcntrs[idx].sample_reg);

		/* unsigned subtraction, so a counter wrap still yields the delta */
		if (idx < nreport)
			cntrs[idx] = now - gpu->last_cntrs[idx];

		gpu->last_cntrs[idx] = now;
	}

	return nreport;
}
/* Account the time elapsed since the previous sample point into the sw
 * counters: always into totaltime, and into activetime too if the gpu was
 * active at the previous sample point.  Then record the current
 * active/idle state and timestamp as the new sample point.
 *
 * Does nothing unless perf counting is currently active.
 */
static void update_sw_cntrs(struct msm_gpu *gpu)
{
	unsigned long irqflags;

	spin_lock_irqsave(&gpu->perf_lock, irqflags);

	if (gpu->perfcntr_active) {
		ktime_t now = ktime_get();
		uint32_t delta_us =
			ktime_to_us(ktime_sub(now, gpu->last_sample.time));

		gpu->totaltime += delta_us;
		if (gpu->last_sample.active)
			gpu->activetime += delta_us;

		gpu->last_sample.active = msm_gpu_active(gpu);
		gpu->last_sample.time = now;
	}

	spin_unlock_irqrestore(&gpu->perf_lock, irqflags);
}
/* Begin a perf counting session: take the initial sample point, zero the
 * sw counters, mark counting active and snapshot the hw counters so the
 * first msm_gpu_perfcntr_sample() reports deltas from this moment.
 */
void msm_gpu_perfcntr_start(struct msm_gpu *gpu)
{
	unsigned long irqflags;

	spin_lock_irqsave(&gpu->perf_lock, irqflags);

	/* we could dynamically enable/disable perfcntr registers too.. */
	gpu->last_sample.active = msm_gpu_active(gpu);
	gpu->last_sample.time = ktime_get();

	gpu->totaltime = 0;
	gpu->activetime = 0;
	gpu->perfcntr_active = true;

	/* no output requested: just latch current hw values as the baseline */
	update_hw_cntrs(gpu, 0, NULL);

	spin_unlock_irqrestore(&gpu->perf_lock, irqflags);
}
/* End a perf counting session.
 *
 * perfcntr_active is read and written under perf_lock everywhere else
 * (start, sample, update_sw_cntrs), so clear it under the same lock.  Once
 * this returns, no sampler is still in the middle of an update that
 * observed the flag as set.
 */
void msm_gpu_perfcntr_stop(struct msm_gpu *gpu)
{
	unsigned long flags;

	spin_lock_irqsave(&gpu->perf_lock, flags);
	gpu->perfcntr_active = false;
	spin_unlock_irqrestore(&gpu->perf_lock, flags);
}
/* Fetch and reset the accumulated sw counters and sample the hw counters.
 *
 * @activetime/@totaltime receive the sw counters accumulated since the
 * previous sample (or since start); both are reset to zero.  Up to @ncntrs
 * hw counter deltas are stored in @cntrs.
 *
 * Returns the number of hw counters written to @cntrs, or -EINVAL if perf
 * counting is not active (in which case no output is touched).
 */
int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
{
	unsigned long irqflags;
	int nsampled = -EINVAL;

	spin_lock_irqsave(&gpu->perf_lock, irqflags);

	if (gpu->perfcntr_active) {
		*activetime = gpu->activetime;
		*totaltime = gpu->totaltime;

		gpu->totaltime = 0;
		gpu->activetime = 0;

		nsampled = update_hw_cntrs(gpu, ncntrs, cntrs);
	}

	spin_unlock_irqrestore(&gpu->perf_lock, irqflags);

	return nsampled;
}
/*
* Cmdstream submission/retirement:
*/
@ -361,6 +456,7 @@ void msm_gpu_retire(struct msm_gpu *gpu)
{
struct msm_drm_private *priv = gpu->dev->dev_private;
queue_work(priv->wq, &gpu->retire_work);
update_sw_cntrs(gpu);
}
/* add bo's to gpu's ring, and kick gpu: */
@ -381,6 +477,8 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
gpu->submitted_fence = submit->fence;
update_sw_cntrs(gpu);
ret = gpu->funcs->submit(gpu, submit, ctx);
priv->lastctx = ctx;
@ -433,6 +531,9 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
struct iommu_domain *iommu;
int i, ret;
if (WARN_ON(gpu->num_perfcntrs > ARRAY_SIZE(gpu->last_cntrs)))
gpu->num_perfcntrs = ARRAY_SIZE(gpu->last_cntrs);
gpu->dev = drm;
gpu->funcs = funcs;
gpu->name = name;
@ -448,6 +549,8 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
setup_timer(&gpu->hangcheck_timer, hangcheck_handler,
(unsigned long)gpu);
spin_lock_init(&gpu->perf_lock);
BUG_ON(ARRAY_SIZE(clk_names) != ARRAY_SIZE(gpu->grp_clks));
/* Map registers: */

View File

@ -25,6 +25,7 @@
#include "msm_ringbuffer.h"
struct msm_gem_submit;
struct msm_gpu_perfcntr;
/* So far, with hardware that I've seen to date, we can have:
* + zero, one, or two z180 2d cores
@ -64,6 +65,18 @@ struct msm_gpu {
struct drm_device *dev;
const struct msm_gpu_funcs *funcs;
/* performance counters (hw & sw): */
spinlock_t perf_lock;
bool perfcntr_active;
struct {
bool active;
ktime_t time;
} last_sample;
uint32_t totaltime, activetime; /* sw counters */
uint32_t last_cntrs[5]; /* hw counters */
const struct msm_gpu_perfcntr *perfcntrs;
uint32_t num_perfcntrs;
struct msm_ringbuffer *rb;
uint32_t rb_iova;
@ -113,6 +126,19 @@ static inline bool msm_gpu_active(struct msm_gpu *gpu)
return gpu->submitted_fence > gpu->funcs->last_fence(gpu);
}
/* Perf-Counters:
 * Describes one hw performance counter.  The select_reg and select_val are
 * just there for the benefit of the child class that actually enables the
 * perf counter (it writes select_val to select_reg).. but msm_gpu base
 * class will handle sampling/displaying the counters.
 *
 * NOTE: tables of these may be initialized positionally, so the member
 * order below must not change.
 */
struct msm_gpu_perfcntr {
/* register that selects which event this counter counts */
uint32_t select_reg;
/* register the current counter value is read from when sampling */
uint32_t sample_reg;
/* value written to select_reg to enable the counter */
uint32_t select_val;
/* column label printed in the header line of the perf output */
const char *name;
};
static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data)
{
msm_writel(data, gpu->mmio + (reg << 2));
@ -126,6 +152,11 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
int msm_gpu_pm_suspend(struct msm_gpu *gpu);
int msm_gpu_pm_resume(struct msm_gpu *gpu);
void msm_gpu_perfcntr_start(struct msm_gpu *gpu);
void msm_gpu_perfcntr_stop(struct msm_gpu *gpu);
int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
void msm_gpu_retire(struct msm_gpu *gpu);
int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
struct msm_file_private *ctx);

View File

@ -0,0 +1,275 @@
/*
* Copyright (C) 2013 Red Hat
* Author: Rob Clark <robdclark@gmail.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*/
/* For profiling, userspace can:
*
* tail -f /sys/kernel/debug/dri/<minor>/perf
*
* This will enable performance counters/profiling to track the busy time
* and any gpu specific performance counters that are supported.
*/
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include "msm_drv.h"
#include "msm_gpu.h"
/* State behind the "perf" debugfs file (one instance, hung off
 * msm_drm_private::perf).
 */
struct msm_perf_state {
	struct drm_device *dev;

	/* true while the file is held open; only one opener is allowed */
	bool open;

	/* number of lines generated so far; drives the periodic header line */
	int cnt;

	/* serializes readers of buf/buftot/bufpos */
	struct mutex read_lock;

	/* the most recently formatted output line */
	char buf[256];
	/* number of valid bytes in buf */
	int buftot;
	/* number of bytes of buf already handed to userspace */
	int bufpos;

	/* jiffies value at which the next sample is due */
	unsigned long next_jiffies;

	/* debugfs entry and the drm bookkeeping node that tracks it */
	struct dentry *ent;
	struct drm_info_node *node;
};
/* interval between samples, in jiffies */
#define SAMPLE_TIME (HZ/4)

/* Sleep (interruptibly) until perf->next_jiffies, then advance the deadline
 * by one SAMPLE_TIME.  If the deadline has already passed there is no sleep.
 *
 * Returns 0 once the sample is due, or -ERESTARTSYS if the sleep ended
 * early (in which case the deadline is left unchanged).
 */
static int wait_sample(struct msm_perf_state *perf)
{
	unsigned long now = jiffies;

	if (time_after(perf->next_jiffies, now)) {
		unsigned long delay = perf->next_jiffies - now;
		int left = schedule_timeout_interruptible(delay);

		/* time left over means we were woken before the timeout */
		if (left > 0)
			return -ERESTARTSYS;
	}

	perf->next_jiffies += SAMPLE_TIME;

	return 0;
}
/* Format the next output line into perf->buf.
 *
 * Every 32nd line (starting with the first) is a header naming the columns;
 * all other lines wait for the next sample time and then report the busy
 * percentage followed by one value per hw perf counter.
 *
 * scnprintf() is used rather than snprintf() because it returns the number
 * of bytes actually stored: if the line were ever too long for perf->buf,
 * snprintf()'s would-be length would push ptr past the buffer, turn rem
 * negative and leave buftot larger than the buffer.  With scnprintf() the
 * line is simply truncated.
 *
 * Returns 0 on success (bufpos reset, buftot set to the line length), or a
 * negative errno from wait_sample()/msm_gpu_perfcntr_sample().
 */
static int refill_buf(struct msm_perf_state *perf)
{
	struct msm_drm_private *priv = perf->dev->dev_private;
	struct msm_gpu *gpu = priv->gpu;
	char *ptr = perf->buf;
	int rem = sizeof(perf->buf);
	int i, n;

	if ((perf->cnt++ % 32) == 0) {
		/* Header line: */
		n = scnprintf(ptr, rem, "%%BUSY");
		ptr += n;
		rem -= n;

		for (i = 0; i < gpu->num_perfcntrs; i++) {
			const struct msm_gpu_perfcntr *perfcntr = &gpu->perfcntrs[i];

			n = scnprintf(ptr, rem, "\t%s", perfcntr->name);
			ptr += n;
			rem -= n;
		}
	} else {
		/* Sample line: */
		uint32_t activetime = 0, totaltime = 0;
		uint32_t cntrs[5];
		uint32_t val;
		int ret;

		/* sleep until next sample time: */
		ret = wait_sample(perf);
		if (ret)
			return ret;

		ret = msm_gpu_perfcntr_sample(gpu, &activetime, &totaltime,
				ARRAY_SIZE(cntrs), cntrs);
		if (ret < 0)
			return ret;

		/* busy time in tenths of a percent */
		val = totaltime ? 1000 * activetime / totaltime : 0;
		n = scnprintf(ptr, rem, "%3d.%d%%", val / 10, val % 10);
		ptr += n;
		rem -= n;

		for (i = 0; i < ret; i++) {
			/* cycle counters (I think).. convert to MHz.. */
			val = cntrs[i] / 10000;
			n = scnprintf(ptr, rem, "\t%5d.%02d",
					val / 100, val % 100);
			ptr += n;
			rem -= n;
		}
	}

	n = scnprintf(ptr, rem, "\n");
	ptr += n;
	rem -= n;

	perf->bufpos = 0;
	perf->buftot = ptr - perf->buf;

	return 0;
}
/* read() handler for the perf debugfs file.
 *
 * Hands out the not-yet-consumed part of the current line in perf->buf,
 * first generating a new line via refill_buf() (which may sleep until the
 * next sample time) when the previous one has been fully consumed.
 * Serialized against other readers by perf->read_lock.
 *
 * Returns the number of bytes copied to @buf, the negative errno from
 * refill_buf(), or -EFAULT if @buf could not be written.
 */
static ssize_t perf_read(struct file *file, char __user *buf,
		size_t sz, loff_t *ppos)
{
	struct msm_perf_state *perf = file->private_data;
	int n = 0, ret;

	mutex_lock(&perf->read_lock);

	if (perf->bufpos >= perf->buftot) {
		ret = refill_buf(perf);
		if (ret)
			goto out;
	}

	/* compare as size_t so a huge sz is never narrowed to a negative int */
	n = min_t(size_t, sz, perf->buftot - perf->bufpos);

	/* copy_to_user() returns the number of bytes NOT copied, which must
	 * not be handed back to the caller as if it were a byte count:
	 */
	if (copy_to_user(buf, &perf->buf[perf->bufpos], n)) {
		ret = -EFAULT;
		goto out;
	}

	ret = 0;
	perf->bufpos += n;
	*ppos += n;

out:
	mutex_unlock(&perf->read_lock);
	if (ret)
		return ret;
	return n;
}
/* open() handler for the perf debugfs file.
 *
 * Only a single opener is allowed at a time, and a gpu must be present;
 * otherwise -EBUSY is returned.  On success the output state is reset, perf
 * counting is started and the first sample is scheduled one SAMPLE_TIME
 * from now.
 */
static int perf_open(struct inode *inode, struct file *file)
{
	struct msm_perf_state *perf = inode->i_private;
	struct drm_device *dev = perf->dev;
	struct msm_drm_private *priv = dev->dev_private;
	struct msm_gpu *gpu = priv->gpu;
	int ret;

	mutex_lock(&dev->struct_mutex);

	if (!perf->open && gpu) {
		file->private_data = perf;

		perf->open = true;
		perf->cnt = 0;
		perf->buftot = 0;
		perf->bufpos = 0;

		msm_gpu_perfcntr_start(gpu);
		perf->next_jiffies = jiffies + SAMPLE_TIME;

		ret = 0;
	} else {
		ret = -EBUSY;
	}

	mutex_unlock(&dev->struct_mutex);

	return ret;
}
/* release() handler for the perf debugfs file: stop perf counting and allow
 * the file to be opened again.  Always returns 0.
 */
static int perf_release(struct inode *inode, struct file *file)
{
	struct msm_perf_state *perf = inode->i_private;
	struct msm_drm_private *priv = perf->dev->dev_private;
	struct msm_gpu *gpu = priv->gpu;

	msm_gpu_perfcntr_stop(gpu);
	perf->open = false;

	return 0;
}
/* File operations for the perf debugfs file: a read-only, non-seekable
 * stream of text lines.
 */
static const struct file_operations perf_debugfs_fops = {
	.owner   = THIS_MODULE,
	.open    = perf_open,
	.release = perf_release,
	.read    = perf_read,
	.llseek  = no_llseek,
};
/* Create the "perf" debugfs file for @minor and register it on the minor's
 * debugfs list.
 *
 * The perf state is shared per device, so this is a no-op (returning 0) if
 * it has already been set up via another minor.
 *
 * Returns 0 on success or -ENOMEM on failure, in which case everything that
 * was set up here has been torn down again.
 */
int msm_perf_debugfs_init(struct drm_minor *minor)
{
	struct msm_drm_private *priv = minor->dev->dev_private;
	struct msm_perf_state *perf;

	/* only create on first minor: */
	if (priv->perf)
		return 0;

	perf = kzalloc(sizeof(*perf), GFP_KERNEL);
	if (!perf)
		return -ENOMEM;

	perf->dev = minor->dev;

	mutex_init(&perf->read_lock);
	priv->perf = perf;

	perf->node = kzalloc(sizeof(*perf->node), GFP_KERNEL);
	if (!perf->node)
		goto fail;

	/* The cleanup path does list_del() on the node whenever it exists.
	 * Self-link it now so that is harmless if we bail out below before
	 * the node has been added to the minor's list:
	 */
	INIT_LIST_HEAD(&perf->node->list);

	perf->ent = debugfs_create_file("perf", S_IFREG | S_IRUGO,
			minor->debugfs_root, perf, &perf_debugfs_fops);
	if (!perf->ent) {
		DRM_ERROR("Cannot create /sys/kernel/debug/dri/%s/perf\n",
				minor->debugfs_root->d_name.name);
		goto fail;
	}

	perf->node->minor = minor;
	perf->node->dent = perf->ent;
	perf->node->info_ent = NULL;

	mutex_lock(&minor->debugfs_lock);
	list_add(&perf->node->list, &minor->debugfs_list);
	mutex_unlock(&minor->debugfs_lock);

	return 0;

fail:
	msm_perf_debugfs_cleanup(minor);
	/* a proper errno rather than a bare -1 (which would read as -EPERM) */
	return -ENOMEM;
}
void msm_perf_debugfs_cleanup(struct drm_minor *minor)
{
struct msm_drm_private *priv = minor->dev->dev_private;
struct msm_perf_state *perf = priv->perf;
if (!perf)
return;
priv->perf = NULL;
debugfs_remove(perf->ent);
if (perf->node) {
mutex_lock(&minor->debugfs_lock);
list_del(&perf->node->list);
mutex_unlock(&minor->debugfs_lock);
kfree(perf->node);
}
mutex_destroy(&perf->read_lock);
kfree(perf);
}
#endif