Commit 9b41b2ae authored by Bas Zoetekouw's avatar Bas Zoetekouw Committed by Dmitry Smirnov

Imported Debian snapshot 0~git-20061221162513-1

This diff is collapsed.
CC = gcc
CFLAGS = -Wall -O2 -g -W
PROGS = blkparse blktrace verify_blkparse blkrawverify
LIBS = -lpthread
SCRIPTS = btrace
all: depend $(PROGS) $(SCRIPTS)
$(MAKE) -C btt
%.o: %.c
$(CC) -o $*.o -c $(ALL_CFLAGS) $<
blkparse: blkparse.o blkparse_fmt.o rbtree.o act_mask.o
$(CC) $(ALL_CFLAGS) -o $@ $(filter %.o,$^)
blktrace: blktrace.o act_mask.o $(LIBS)
$(CC) $(ALL_CFLAGS) -o $@ $(filter %.o,$^) $(LIBS)
verify_blkparse: verify_blkparse.o
$(CC) $(ALL_CFLAGS) -o $@ $(filter %.o,$^)
blkrawverify: blkrawverify.o
$(CC) $(ALL_CFLAGS) -o $@ $(filter %.o,$^)
$(MAKE) -C doc all
$(MAKE) -C doc clean
@$(CC) -MM $(ALL_CFLAGS) *.c 1> .depend
INSTALL = install
prefix = /usr/local
bindir = $(prefix)/bin
RPMBUILD = rpmbuild
TAR = tar
export prefix INSTALL TAR
dist: btrace.spec
git-tar-tree HEAD btrace-1.0 > btrace-1.0.tar
@mkdir -p btrace-1.0
@cp btrace.spec btrace-1.0
$(TAR) rf btrace-1.0.tar btrace-1.0/btrace.spec
@rm -rf btrace-1.0
@bzip2 btrace-1.0.tar
rpm: dist
$(RPMBUILD) -ta btrace-1.0.tar.bz2
clean: docsclean
-rm -f *.o $(PROGS) .depend btrace-1.0.tar.bz2
$(MAKE) -C btt clean
install: $(PROGS) $(SCRIPTS)
$(INSTALL) -m755 -d $(DESTDIR)$(bindir)
$(INSTALL) btt/btt $(DESTDIR)$(bindir)
ifneq ($(wildcard .depend),)
include .depend
Block IO Tracing
Written by Jens Axboe <> (initial version and kernel support),
Alan D. Brunelle (threading and splitup into two seperate programs),
Nathan Scott <> (bug fixes, process names, multiple devices)
Also thanks to Tom Zanussi <> for good input and
blktrace was integrated into the mainline kernel between 2.6.16 and 2.6.17-rc1.
The target trace needs to run on a kernel at least that new.
If you don't have git, you can get hourly snapshots from:
The snapshots include the full git object database as well. has
excessively long mirror times, so if you have git installed, you can pull
the master tree from:
For browsing the repo over http and viewing history etc, you can direct
your browser to:
$ blktrace -d <dev> [ -r debug_path ] [ -o output ] [ -k ] [ -w time ]
[ -a action ] [ -A action mask ]
-d Use specified device. May also be given last after options.
-r Path to mounted debugfs, defaults to /debug.
-o File(s) to send output to.
-D Directory to prepend to output file names.
-k Kill running trace.
-w Stop after defined time, in seconds.
-a Only trace specific actions (use more -a options to add actions).
Available actions are:
-A Give the trace mask directly as a number.
-b Sub buffer size in KiB.
-n Number of sub buffers.
-l Run in network listen mode (blktrace server)
-h Run in network client mode, connecting to the given host
-p Network port to use (default 8462)
-s Disable network client use of sendfile() to transfer data
-V Print program version info.
$ blkparse -i <input> [ -o <output> ] [ -b rb_batch ] [ -s ] [ -t ] [ -q ]
[ -w start:stop ] [ -f output format ] [ -F format spec ]
[ -d <binary> ]
-i Input file containing trace data, or '-' for stdin.
-D Directory to prepend to input file names.
-o Output file. If not given, output is stdout.
-b stdin read batching.
-s Show per-program io statistics.
-h Hash processes by name, not pid.
-t Track individual ios. Will tell you the time a request took to
get queued, to get dispatched, and to get completed.
-q Quiet. Don't display any stats at the end of the trace.
-w Only parse data between the given time interval in seconds. If
'start' isn't given, blkparse defaults the start time to 0.
-d Dump sorted data in binary format
-f Output format. Customize the output format. The format field
identifiers are:
%a - Action
%c - CPU ID
%C - Task command name
%d - Direction (r/w)
%D - Device number
%e - Error number
%M - Major
%m - Minor
%N - Number of bytes
%n - Number of sectors
%p - PID
%P - PDU
%s - Sequence number
%S - Sector number
%t - Time (wallclock - nanoseconds)
%T - Time (wallclock - seconds)
%u - Time (processing - microseconds)
%U - Unplug depth
-F Format specification. The individual specifiers are:
A - Remap
B - Back merge
C - Complete
D - Issue
F - Front merge
G - Get request
I - Insert
M - Both front and back merge
P - Plug
Q - Queue
R - Requeue
S - Sleep requests
T - Unplug timer
U - Unplug IO
W - Bounce
X - Split
-v More verbose for marginal errors.
-V Print program version info.
$ verify_blkparse filename
Verifies an output file from blkparse. All it does is check if
the events in the file are correctly time ordered. If an entry
is found that isn't ordered, it's dumped to stdout.
$ blkrawverify <dev> [<dev>...]
The blkrawverify utility can be used to verify data retrieved
via blktrace. It will check for valid event formats, forward
progressing sequence numbers and time stamps, also does reasonable
checks for other potential issues within invidividual events.
Errors found will be tracked in <dev>.verify.out.
If you want to do live tracing, you can pipe the data between blktrace
and blkparse:
% blktrace -d <device> -o - | blkparse -i -
This has a small risk of displaying some traces a little out of sync, since
it will do batch sorts of input events. Similarly, you can do traces over
the network. The network 'server' must run:
% blktrace -l
to listen to incoming blktrace connections, while the client should use
% blktrace -d /dev/sda -h <server hostname>
to connect and transfer data over the network.
A users guide is distributed with the source. It is in latex, a
'make docs' will build a PDF in doc/. You need tetex and latex installed
to build the document.
vger hosts a mailing list dedicated to btrace discussion and development.
The list is called, subscribe by sending
a mail to with 'subscribe linux-btrace' in
the mail body.
2006-09-05, Jens Axboe <>
#include <strings.h>
#include "blktrace.h"
#define DECLARE_MASK_MAP(mask) { BLK_TC_##mask, #mask, "BLK_TC_"#mask }
#define COMPARE_MASK_MAP(mmp, str) \
(!strcasecmp((mmp)->short_form, (str)) || \
!strcasecmp((mmp)->long_form, (str)))
struct mask_map {
int mask;
char *short_form;
char *long_form;
static struct mask_map mask_maps[] = {
int find_mask_map(char *string)
unsigned int i;
for (i = 0; i < sizeof(mask_maps)/sizeof(mask_maps[0]); i++)
if (COMPARE_MASK_MAP(&mask_maps[i], string))
return mask_maps[i].mask;
return -1;
int valid_act_opt(int x)
return (1 <= x) && (x < (1 << BLK_TC_SHIFT));
#ifndef BARRIER_H
#define BARRIER_H
#if defined(__ia64__)
#define store_barrier() asm volatile ("mf" ::: "memory")
#elif defined(__x86_64__)
#define store_barrier() asm volatile("sfence" ::: "memory")
#elif defined(__i386__)
#define store_barrier() asm volatile ("": : :"memory")
#elif defined(__ppc__) || defined(__powerpc__)
#define store_barrier() asm volatile ("eieio" : : : "memory")
#elif defined(__s390__) || defined(__s390x__)
#define store_barrier() asm volatile ("bcr 15,0" : : : "memory")
#error Define store_barrier() for your CPU
This diff is collapsed.
* This file contains format parsing code for blkparse, allowing you to
* customize the individual action format and generel output format.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <ctype.h>
#include <time.h>
#include "blktrace.h"
#define HEADER "%D %2c %8s %5T.%9t %5p %2a %3d "
static char *override_format[256];
static inline int valid_spec(int spec)
return strchr(VALID_SPECS, spec) != NULL;
void set_all_format_specs(char *option)
char *p;
for (p = VALID_SPECS; *p; p++)
if (override_format[(int)(*p)] == NULL)
override_format[(int)(*p)] = strdup(option);
int add_format_spec(char *option)
int spec = optarg[0];
if (!valid_spec(spec)) {
fprintf(stderr,"Bad format specifier %c\n", spec);
return 1;
if (optarg[1] != ',') {
fprintf(stderr,"Bad format specifier - need ',' %s\n", option);
return 1;
option += 2;
if (*option == '\0') {
fprintf(stderr,"Bad format specifier - need fmt %s\n", option);
return 1;
* Set both merges (front and back)
if (spec == 'M') {
override_format['B'] = strdup(option);
override_format['M'] = strdup(option);
} else
override_format[spec] = strdup(option);
return 0;
static inline void fill_rwbs(char *rwbs, struct blk_io_trace *t)
int w = t->action & BLK_TC_ACT(BLK_TC_WRITE);
int a = t->action & BLK_TC_ACT(BLK_TC_AHEAD);
int b = t->action & BLK_TC_ACT(BLK_TC_BARRIER);
int s = t->action & BLK_TC_ACT(BLK_TC_SYNC);
int m = t->action & BLK_TC_ACT(BLK_TC_META);
int i = 0;
if (w)
rwbs[i++] = 'W';
rwbs[i++] = 'R';
if (a)
rwbs[i++] = 'A';
if (b)
rwbs[i++] = 'B';
if (s)
rwbs[i++] = 'S';
if (m)
rwbs[i++] = 'M';
rwbs[i] = '\0';
static const char *
print_time(unsigned long long timestamp)
static char timebuf[128];
struct tm *tm;
time_t sec;
unsigned long nsec;
sec = abs_start_time.tv_sec + SECONDS(timestamp);
nsec = abs_start_time.tv_nsec + NANO_SECONDS(timestamp);
if (nsec >= 1000000000) {
nsec -= 1000000000;
sec += 1;
tm = localtime(&sec);
snprintf(timebuf, sizeof(timebuf),
nsec / 1000);
return timebuf;
static inline int pdu_rest_is_zero(unsigned char *pdu, int len)
static char zero[4096];
return !memcmp(pdu, zero, len);
static char *dump_pdu(unsigned char *pdu_buf, int pdu_len)
static char p[4096];
int i, len;
if (!pdu_buf || !pdu_len)
return NULL;
for (len = 0, i = 0; i < pdu_len; i++) {
if (i)
len += sprintf(p + len, " ");
len += sprintf(p + len, "%02x", pdu_buf[i]);
* usually dump for cdb dumps where we can see lots of
* zeroes, stop when the rest is just zeroes and indicate
* so with a .. appended
if (!pdu_buf[i] && pdu_rest_is_zero(pdu_buf + i, pdu_len - i)) {
sprintf(p + len, " ..");
return p;
#define pdu_start(t) (((void *) (t) + sizeof(struct blk_io_trace)))
static unsigned int get_pdu_int(struct blk_io_trace *t)
__u64 *val = pdu_start(t);
return be64_to_cpu(*val);
static void get_pdu_remap(struct blk_io_trace *t, struct blk_io_trace_remap *r)
struct blk_io_trace_remap *__r = pdu_start(t);
__u64 sector = __r->sector;
r->device = be32_to_cpu(__r->device);
r->device_from = be32_to_cpu(__r->device_from);
r->sector = be64_to_cpu(sector);
static void print_field(char *act, struct per_cpu_info *pci,
struct blk_io_trace *t, unsigned long long elapsed,
int pdu_len, unsigned char *pdu_buf, char field,
int minus, int has_w, int width)
char format[64];
if (has_w) {
if (minus)
sprintf(format, "%%-%d", width);
sprintf(format, "%%%d", width);
} else
sprintf(format, "%%");
switch (field) {
case 'a':
fprintf(ofp, strcat(format, "s"), act);
case 'c':
fprintf(ofp, strcat(format, "d"), pci->cpu);
case 'C': {
char *name = find_process_name(t->pid);
fprintf(ofp, strcat(format, "s"), name);
case 'd': {
char rwbs[6];
fill_rwbs(rwbs, t);
fprintf(ofp, strcat(format, "s"), rwbs);
case 'D': /* format width ignored */
fprintf(ofp,"%3d,%-3d", MAJOR(t->device), MINOR(t->device));
case 'e':
fprintf(ofp, strcat(format, "d"), t->error);
case 'M':
fprintf(ofp, strcat(format, "d"), MAJOR(t->device));
case 'm':
fprintf(ofp, strcat(format, "d"), MINOR(t->device));
case 'n':
fprintf(ofp, strcat(format, "u"), t_sec(t));
case 'N':
fprintf(ofp, strcat(format, "u"), t->bytes);
case 'p':
fprintf(ofp, strcat(format, "u"), t->pid);
case 'P': { /* format width ignored */
char *p = dump_pdu(pdu_buf, pdu_len);
if (p)
fprintf(ofp, "%s", p);
case 's':
fprintf(ofp, strcat(format, "ld"), t->sequence);
case 'S':
fprintf(ofp, strcat(format, "lu"), t->sector);
case 't':
sprintf(format, "%%0%dlu", has_w ? width : 9);
fprintf(ofp, format, NANO_SECONDS(t->time));
case 'T':
fprintf(ofp, strcat(format, "d"), SECONDS(t->time));
case 'u':
if (elapsed == -1ULL) {
fprintf(stderr, "Expecting elapsed value\n");
fprintf(ofp, strcat(format, "llu"), elapsed / 1000);
case 'U':
fprintf(ofp, strcat(format, "u"), get_pdu_int(t));
case 'z':
fprintf(ofp, strcat(format, "s"), print_time(t->time));
fprintf(ofp,strcat(format, "c"), field);
static char *parse_field(char *act, struct per_cpu_info *pci,
struct blk_io_trace *t, unsigned long long elapsed,
int pdu_len, unsigned char *pdu_buf,
char *master_format)
int minus = 0;
int has_w = 0;
int width = 0;
char *p = master_format;
if (*p == '-') {
minus = 1;
if (isdigit(*p)) {
has_w = 1;
do {
width = (width * 10) + (*p++ - '0');
} while ((*p) && (isdigit(*p)));
if (*p) {
print_field(act, pci, t, elapsed, pdu_len, pdu_buf, *p++,
minus, has_w, width);
return p;
static void process_default(char *act, struct per_cpu_info *pci,
struct blk_io_trace *t, unsigned long long elapsed,
int pdu_len, unsigned char *pdu_buf)
struct blk_io_trace_remap r = { .device = 0, };
char rwbs[6];
char *name;
fill_rwbs(rwbs, t);
* The header is always the same
if (act[0] == 'A') { /* Remap */
get_pdu_remap(t, &r);
t->device = r.device_from;
fprintf(ofp, "%3d,%-3d %2d %8d %5d.%09lu %5u %2s %3s ",
MAJOR(t->device), MINOR(t->device), pci->cpu, t->sequence,
(int) SECONDS(t->time), (unsigned long) NANO_SECONDS(t->time),
t->pid, act, rwbs);
name = find_process_name(t->pid);
switch (act[0]) {
case 'R': /* Requeue */
case 'C': /* Complete */
if (t->action & BLK_TC_ACT(BLK_TC_PC)) {
char *p = dump_pdu(pdu_buf, pdu_len);
if (p)
fprintf(ofp, "(%s) ", p);
fprintf(ofp, "[%d]\n", t->error);
} else {
if (elapsed != -1ULL) {
fprintf(ofp, "%llu + %u (%8llu) [%d]\n",
(unsigned long long) t->sector,
t_sec(t), elapsed, t->error);
} else {
fprintf(ofp, "%llu + %u [%d]\n",
(unsigned long long) t->sector,
t_sec(t), t->error);
case 'D': /* Issue */
case 'I': /* Insert */
case 'Q': /* Queue */
case 'W': /* Bounce */
if (t->action & BLK_TC_ACT(BLK_TC_PC)) {
char *p;
fprintf(ofp, "%u ", t->bytes);
p = dump_pdu(pdu_buf, pdu_len);
if (p)
fprintf(ofp, "(%s) ", p);
fprintf(ofp, "[%s]\n", name);
} else {
if (elapsed != -1ULL) {
fprintf(ofp, "%llu + %u (%8llu) [%s]\n",
(unsigned long long) t->sector,
t_sec(t), elapsed, name);
} else {
fprintf(ofp, "%llu + %u [%s]\n",
(unsigned long long) t->sector,
t_sec(t), name);
case 'B': /* Back merge */
case 'F': /* Front merge */
case 'M': /* Front or back merge */
case 'G': /* Get request */
case 'S': /* Sleep request */
fprintf(ofp, "%llu + %u [%s]\n", (unsigned long long) t->sector,
t_sec(t), name);
case 'P': /* Plug */
fprintf(ofp, "[%s]\n", name);
case 'U': /* Unplug IO */
case 'T': /* Unplug timer */
fprintf(ofp, "[%s] %u\n", name, get_pdu_int(t));
case 'A': /* remap */
fprintf(ofp, "%llu + %u <- (%d,%d) %llu\n",
(unsigned long long) t->sector, t_sec(t),
MAJOR(r.device), MINOR(r.device),
(unsigned long long) r.sector);
case 'X': /* Split */
fprintf(ofp, "%llu / %u [%s]\n", (unsigned long long) t->sector,
get_pdu_int(t), name);
fprintf(stderr, "Unknown action %c\n", act[0]);
void process_fmt(char *act, struct per_cpu_info *pci, struct blk_io_trace *t,
unsigned long long elapsed, int pdu_len,
unsigned char *pdu_buf)
char *p = override_format[(int) *act];
if (!p) {
process_default(act, pci, t, elapsed, pdu_len, pdu_buf);
while (*p) {
switch (*p) {
case '%': /* Field specifier */
if (*p == '%')
fprintf(ofp, "%c", *p++);
else if (!*p)
fprintf(ofp, "%c", '%');
p = parse_field(act, pci, t, elapsed,
pdu_len, pdu_buf,