blktrace userspace tools

  • d0ca268 Initial commit of blktrace

Initial commit of blktrace userspace tools 包含下列几个文件:

    ├── blkparse.c
    ├── blktrace.c
    ├── blktrace.h
    ├── Makefile
    └── README

README

    % blktrace <dev>
    --- run task to generate load to be traced ---
    <SIGINT to kill>
    --- Generates:
        <dev>_dat.[0..ncpus] : Contains binary pdu data
        <dev>_out.[0..ncpus] : Contains binary trace data

    % blkparse <dev> <ncpus> 
    --- Generates:
        <dev>_log.[0..ncpus] : Contains formatted trace data
        Merged formatted trace data to stdout

Trace Categories

    /* Bit offset at which BLK_TC_ACT() places a category mask inside an action word */
    #define BLK_TC_SHIFT        (16)

    /*
     * Trace categories. Each category is a single bit; the mask must fit in
     * 16 bits (see BLK_TC_END).
     */
    enum {
            BLK_TC_READ     = 1 << 0,       /* reads */
            BLK_TC_WRITE    = 1 << 1,       /* writes */
            BLK_TC_BARRIER  = 1 << 2,       /* barrier */
            BLK_TC_SYNC     = 1 << 3,       /* sync */
            BLK_TC_QUEUE    = 1 << 4,       /* queueing/merging */
            BLK_TC_REQUEUE  = 1 << 5,       /* requeueing */
            BLK_TC_ISSUE    = 1 << 6,       /* issue */
            BLK_TC_COMPLETE = 1 << 7,       /* completions */
            BLK_TC_FS       = 1 << 8,       /* fs requests */
            BLK_TC_PC       = 1 << 9,       /* pc requests */

            BLK_TC_END      = 1 << 15,      /* only 16-bits, reminder */
    };

Trace Actions

    /* Shift a BLK_TC_* category mask into the upper part of an action word */
    #define BLK_TC_ACT(act)     ((act) << BLK_TC_SHIFT)

    /*
     * Basic action identifiers, numbered sequentially from 1. These are the
     * values found in the low 16 bits of an action word.
     */
    enum {
        __BLK_TA_QUEUE = 1,     /* queued */
        __BLK_TA_BACKMERGE,     /* back merged to existing rq */
        __BLK_TA_FRONTMERGE,    /* front merge to existing rq */
        __BLK_TA_GETRQ,         /* allocated new request */
        __BLK_TA_SLEEPRQ,       /* sleeping on rq allocation */
        __BLK_TA_REQUEUE,       /* request requeued */
        __BLK_TA_ISSUE,         /* sent to driver */
        __BLK_TA_COMPLETE,      /* completed by driver */
    };

    /*
     * Trace actions in full: the basic action identifier OR'ed with the
     * category it belongs to. Additionally, read or write is masked in.
     *
     * NOTE(review): BLK_TA_REQUEUE is tagged with BLK_TC_QUEUE even though a
     * dedicated BLK_TC_REQUEUE category is defined - confirm whether this is
     * intentional before relying on category filtering for requeues.
     */
    #define BLK_TA_QUEUE        (__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_BACKMERGE    (__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_FRONTMERGE   (__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_GETRQ        (__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_SLEEPRQ      (__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_REQUEUE      (__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_QUEUE))
    #define BLK_TA_ISSUE        (__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE))
    #define BLK_TA_COMPLETE     (__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE))

struct blk_io_trace - 从 relayfs 获取的一条 message 开始部分对应一个 struct blk_io_trace 记录

    /* Upper three bytes spell 'e' 'a' 't' in ASCII; the low byte is left zero */
    #define BLK_IO_TRACE_MAGIC  (0x65617400)
    /* True when the upper 24 bits of t->magic match; the low byte is ignored */
    #define CHECK_MAGIC(t)      (((t)->magic & 0xffffff00) == BLK_IO_TRACE_MAGIC)
    /* NOTE(review): presumably compared against the low byte of magic - confirm */
    #define SUPPORTED_VERSION   (0x02)

    /*
     * Fixed-size header found at the start of every trace message.
     */
    struct blk_io_trace {
        __u32 magic;        /* validated with CHECK_MAGIC() */
        __u32 sequence;     /* presumably a per-event sequence number - confirm */
        __u64 time;         /* event timestamp; unit not shown here */
        __u64 sector;       /* sector the event refers to */
        __u32 bytes;        /* byte count, fed to the account_*() helpers */
        __u32 action;       /* low 16 bits: __BLK_TA_* id; upper bits: BLK_TC_* mask */
        __u32 pid;          /* presumably the pid of the issuing task - confirm */
        __u16 error;        /* error value; semantics not shown here */
        __u16 pdu_len;      /* presumably length of pdu data following the header - confirm */
    };

IOCTL

    /*
     * Argument block passed to the BLKSTARTTRACE ioctl.
     */
    struct blk_user_trace_setup {
        char name[32];      /* NOTE(review): who fills this in is not shown here */
        __u16 act_mask;     /* presumably a mask of BLK_TC_* categories - confirm */
        __u32 buf_size;     /* blktrace sets this to BUF_SIZE */
        __u32 buf_nr;       /* blktrace sets this to BUF_NR */
    };

    /* Start tracing; takes a struct blk_user_trace_setup (read/write) */
    #define BLKSTARTTRACE   _IOWR(0x12,115,struct blk_user_trace_setup)
    /* Stop tracing; carries no argument */
    #define BLKSTOPTRACE    _IO(0x12,116)

blktrace - block queue tracing application

Step 1. start_trace

    /* Buffer geometry requested from the kernel when tracing is started */
    #define BUF_SIZE    (128 *1024)
    #define BUF_NR      (4)

    struct blk_user_trace_setup buts;
    /*
     * memset() takes (dest, value, length). Zero the whole request so the
     * fields not set below (name, act_mask) are not passed uninitialised.
     */
    memset(&buts, 0, sizeof(buts));
    buts.buf_size = BUF_SIZE;
    buts.buf_nr = BUF_NR;
    ioctl(devfd, BLKSTARTTRACE, &buts);

Step 2. extract 从 relayfs 读取 messages

为每个 online CPU 创建一个 thread 执行 extract()

    Generates:
        <dev>_dat.[0..ncpus] : Contains binary pdu data
        <dev>_out.[0..ncpus] : Contains binary trace data

从 relayfs 获取的一条 message 开始部分对应一个 struct blk_io_trace 记录

Step 3. stop_trace

ioctl(devfd, BLKSTOPTRACE) 停止 blktrace

Step 4. show_stats

打印总结信息

    CPU<N>: <nr of events processed for CPU_N> events
    Total: <nr of events processed for all CPUs> events

blkparse - 将 blktrace 获取的 messages 根据 blk_io_trace header 信息进行分类统计

    /*
     * Log one trace record on the "pc" path.
     *
     * The basic action id is taken from the low 16 bits of t->action and
     * mapped to a one-letter tag that is handed to the matching log helper.
     * An unrecognised action is reported on stderr and is not counted;
     * every recognised record increments the global events counter.
     */
    void dump_trace_pc(struct blk_io_trace *t)
    {
        /* strip the category bits that BLK_TC_ACT() placed above bit 15 */
        switch (t->action & 0xffff) {
            case __BLK_TA_QUEUE:
                log_generic(t, 'Q');
                break;
            case __BLK_TA_GETRQ:
                log_generic(t, 'G');
                break;
            case __BLK_TA_SLEEPRQ:
                log_generic(t, 'S');
                break;
            case __BLK_TA_REQUEUE:
                log_generic(t, 'R');
                break;
            case __BLK_TA_ISSUE:
                log_pc(t, 'D');
                break;
            case __BLK_TA_COMPLETE:
                log_pc(t, 'C');
                break;
            default:
                /* terminate the diagnostic with a real newline */
                fprintf(stderr, "Bad pc action %x\n", t->action);
                return;
        }

        events++;
    }

    /*
     * Log and account one trace record on the "fs" path.
     *
     * The basic action id is taken from the low 16 bits of t->action.
     * Queue, merge and completion events are additionally passed to the
     * account_*() helpers together with the write flag and byte count.
     * An unrecognised action is reported on stderr and is not counted;
     * every recognised record increments the global events counter.
     */
    void dump_trace_fs(struct blk_io_trace *t)
    {
        /* non-zero when the write category bit is set in the action word */
        int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);

        /* strip the category bits that BLK_TC_ACT() placed above bit 15 */
        switch (t->action & 0xffff) {
            case __BLK_TA_QUEUE:
                account_q(w, t->bytes);
                log_queue(t, 'Q');
                break;
            case __BLK_TA_BACKMERGE:
                account_m(w, t->bytes);
                log_merge(t, 'M');
                break;
            case __BLK_TA_FRONTMERGE:
                account_m(w, t->bytes);
                log_merge(t, 'F');
                break;
            case __BLK_TA_GETRQ:
                log_generic(t, 'G');
                break;
            case __BLK_TA_SLEEPRQ:
                log_generic(t, 'S');
                break;
            case __BLK_TA_REQUEUE:
                log_queue(t, 'R');
                break;
            case __BLK_TA_ISSUE:
                log_issue(t, 'D');
                break;
            case __BLK_TA_COMPLETE:
                account_c(w, t->bytes);
                log_complete(t, 'C');
                break;
            default:
                /* terminate the diagnostic with a real newline */
                fprintf(stderr, "Bad fs action %x\n", t->action);
                return;
        }

        events++;
    }