Superpatterns Pat Patterson on the Cloud, Identity and Single Malt Scotch

4 May 2010 · 76 comments

A Simple Block Driver for Linux Kernel 2.6.31

Programming Amazon Web Services

Linux Device Drivers, 3rd Edition

My current work involves writing my first Linux block device driver. Going to the web to find a sample, I discovered Jonathan Corbet's Simple Block Driver article with its associated block driver example code. It's a nice succinct implementation of a ramdisk - pretty much the simplest working block device. There's only one problem, though: the article was written in 2003, when kernel 2.6.0 was the new kid on the block. Trying to build it on openSUSE 11.2 with kernel 2.6.31 just produced a slew of compile errors. A bit of research revealed that there were major changes to the kernel block device interface in 2.6.31, so I would have to port the example to get it working.

About a day and a half of poring through the kernel source and the excellent LDD3 (hardcopy) later, I had a running simple block driver for kernel 2.6.31. I've also tested it successfully on SUSE 11 SP1 Beta, which uses kernel 2.6.32. Here's the code, followed by instructions for getting it working.

sbd.c

/*
 * A sample, extra-simple block driver. Updated for kernel 2.6.31.
 *
 * (C) 2003 Eklektix, Inc.
 * (C) 2010 Pat Patterson <pat at superpat dot com>
 * Redistributable under the terms of the GNU GPL.
 */

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>

#include <linux/kernel.h> /* printk() */
#include <linux/fs.h>     /* everything... */
#include <linux/errno.h>  /* error codes */
#include <linux/types.h>  /* size_t */
#include <linux/vmalloc.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>

MODULE_LICENSE("Dual BSD/GPL");
/* Driver version string; nothing else in this file reads it. */
static char *Version = "1.4";

/*
 * Module parameters. Each is registered with module_param() using a
 * permissions argument of 0.
 */
/*
 * Block major number to request. The value is passed to register_blkdev()
 * in sbd_init() and the function's return value is stored back here.
 */
static int major_num = 0;
module_param(major_num, int, 0);
/*
 * Logical block size in bytes, handed to blk_queue_logical_block_size()
 * and used to size the backing store.
 */
static int logical_block_size = 512;
module_param(logical_block_size, int, 0);
/*
 * How big the drive is, counted in logical blocks: the backing store is
 * allocated as nsectors * logical_block_size bytes.
 */
static int nsectors = 1024;
module_param(nsectors, int, 0);

/*
 * We can tweak our hardware sector size, but the kernel talks to us
 * in terms of small sectors, always.
 */
#define KERNEL_SECTOR_SIZE 512

/*
 * Our request queue.
 */
static struct request_queue *Queue;

/*
 * The internal representation of our device.
 */
/* The driver supports exactly one device, the file-scope instance `Device`. */
static struct sbd_device {
	unsigned long size;	/* size of the backing store in bytes (nsectors * logical_block_size) */
	spinlock_t lock;	/* lock handed to blk_init_queue() for the request queue */
	u8 *data;		/* vmalloc()'ed backing store that holds the disk contents */
	struct gendisk *gd;	/* gendisk allocated with alloc_disk() and published with add_disk() */
} Device;

/*
 * Handle an I/O request: copy data between the ramdisk and a buffer.
 *
 * dev:    device whose backing store (dev->data, dev->size bytes) is used
 * sector: starting position in KERNEL_SECTOR_SIZE (512-byte) sectors
 * nsect:  number of KERNEL_SECTOR_SIZE sectors to copy
 * buffer: kernel buffer holding at least nsect * KERNEL_SECTOR_SIZE bytes
 * write:  non-zero copies buffer -> ramdisk, zero copies ramdisk -> buffer
 *
 * The kernel always talks to us in 512-byte sectors, whatever logical
 * block size the device advertises, so byte offsets and lengths are
 * derived from KERNEL_SECTOR_SIZE and not from logical_block_size.
 * Scaling by logical_block_size would copy more bytes than the request
 * buffer holds whenever the logical block size is larger than 512.
 *
 * A transfer that would run past the end of the device is logged and
 * dropped; nothing is copied and no error is reported to the caller.
 */
static void sbd_transfer(struct sbd_device *dev, sector_t sector,
		unsigned long nsect, char *buffer, int write) {
	unsigned long dev_sectors = dev->size / KERNEL_SECTOR_SIZE;
	unsigned long offset;
	unsigned long nbytes;

	/*
	 * Range-check in sector units, phrased so that neither side of the
	 * comparison can wrap around.
	 */
	if (sector > dev_sectors || nsect > dev_sectors - sector) {
		printk (KERN_NOTICE "sbd: Beyond-end %s (sector %llu, %lu sectors)\n",
				write ? "write" : "read",
				(unsigned long long)sector, nsect);
		return;
	}

	/* Both products are bounded by dev->size after the check above. */
	offset = sector * KERNEL_SECTOR_SIZE;
	nbytes = nsect * KERNEL_SECTOR_SIZE;

	if (write)
		memcpy(dev->data + offset, buffer, nbytes);
	else
		memcpy(buffer, dev->data + offset, nbytes);
}

/*
 * Request function registered with blk_init_queue(): drain the queue,
 * servicing each request one chunk at a time.
 *
 * q: the request queue to drain.
 *
 * Each pass transfers the current chunk of the request
 * (blk_rq_cur_sectors() sectors starting at blk_rq_pos()) and completes
 * it with __blk_end_request_cur(). A zero return from that call is
 * treated as "request finished" and the next request is fetched;
 * otherwise the loop runs again on the same request for its next chunk.
 *
 * Note: req->buffer no longer exists as of kernel 3.15; on those kernels
 * bio_data(req->bio) has to be passed instead.
 */
static void sbd_request(struct request_queue *q) {
	struct request *req;

	req = blk_fetch_request(q);
	while (req != NULL) {
		// blk_fs_request() was removed in 2.6.36 - many thanks to
		// Christian Paro for the heads up and fix...
		//if (!blk_fs_request(req)) {
		if (req->cmd_type != REQ_TYPE_FS) {
			printk (KERN_NOTICE "Skip non-CMD request\n");
			__blk_end_request_all(req, -EIO);
			/*
			 * The request above is now fully completed, so move
			 * on to the next one. Looping without fetching would
			 * end the same, already finished request again and
			 * again.
			 */
			req = blk_fetch_request(q);
			continue;
		}
		sbd_transfer(&Device, blk_rq_pos(req), blk_rq_cur_sectors(req),
				req->buffer, rq_data_dir(req));
		if ( ! __blk_end_request_cur(req, 0) ) {
			req = blk_fetch_request(q);
		}
	}
}

/*
 * The HDIO_GETGEO ioctl is handled in blkdev_ioctl(), which
 * calls this. We need to implement getgeo, since we can't
 * use tools such as fdisk to partition the drive otherwise.
 *
 * block_device: the device being queried (unused; there is only one disk)
 * geo:          geometry structure to fill in
 *
 * Always returns 0.
 *
 * We have no real geometry, of course, so make something up: 4 heads and
 * 16 sectors per track, i.e. 64 sectors per cylinder. Device.size is in
 * bytes, so it is converted to 512-byte sectors before the cylinder
 * count is derived; the default 1024-sector disk reports 16 cylinders.
 */
int sbd_getgeo(struct block_device * block_device, struct hd_geometry * geo) {
	const unsigned long heads = 4;
	const unsigned long sectors_per_track = 16;
	unsigned long total_sectors = Device.size / KERNEL_SECTOR_SIZE;
	unsigned long cylinders = total_sectors / (heads * sectors_per_track);

	/* geo->cylinders is a 16-bit field; saturate rather than wrap. */
	if (cylinders > 0xffff)
		cylinders = 0xffff;

	geo->cylinders = cylinders;
	geo->heads = heads;
	geo->sectors = sectors_per_track;
	geo->start = 0;
	return 0;
}

/*
 * The device operations structure.
 */
/*
 * Installed as Device.gd->fops in sbd_init(). Only the owner and the
 * getgeo hook are set; every other operation is left at its
 * zero-initialised default.
 */
static struct block_device_operations sbd_ops = {
		.owner  = THIS_MODULE,	/* this module owns the operations table */
		.getgeo = sbd_getgeo	/* supplies the made-up disk geometry */
};

/*
 * Module entry point: allocate the ramdisk, create its request queue,
 * register the block major and publish the disk as "sbd0".
 *
 * Returns 0 on success or a negative errno. On failure everything that
 * was acquired up to that point is released again, in reverse order.
 */
static int __init sbd_init(void) {
	int ret;

	/*
	 * Reject parameters the driver cannot work with: a non-positive
	 * sector count, a logical block size that is smaller than a kernel
	 * sector or not a power of two, or a total size that does not fit
	 * in an unsigned long.
	 */
	if (nsectors <= 0 || logical_block_size < KERNEL_SECTOR_SIZE ||
			(logical_block_size & (logical_block_size - 1)) ||
			(unsigned long)nsectors > ULONG_MAX / logical_block_size) {
		printk(KERN_WARNING "sbd: invalid nsectors/logical_block_size\n");
		return -EINVAL;
	}
	/*
	 * Set up our internal device. The multiplication is done in
	 * unsigned long so that large ramdisks do not overflow an int.
	 */
	Device.size = (unsigned long)nsectors * logical_block_size;
	spin_lock_init(&Device.lock);
	Device.data = vmalloc(Device.size);
	if (Device.data == NULL)
		return -ENOMEM;
	/* Do not expose stale kernel memory through reads of the disk. */
	memset(Device.data, 0, Device.size);
	/*
	 * Get a request queue.
	 */
	Queue = blk_init_queue(sbd_request, &Device.lock);
	if (Queue == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	blk_queue_logical_block_size(Queue, logical_block_size);
	/*
	 * Get registered. register_blkdev() returns the allocated major only
	 * when asked for a dynamic one (major_num == 0); for a caller-chosen
	 * major it returns 0 on success, so major_num must be left alone.
	 */
	ret = register_blkdev(major_num, "sbd");
	if (ret < 0) {
		printk(KERN_WARNING "sbd: unable to get major number\n");
		goto out_queue;
	}
	if (major_num == 0)
		major_num = ret;
	/*
	 * And the gendisk structure.
	 */
	Device.gd = alloc_disk(16);
	if (!Device.gd) {
		ret = -ENOMEM;
		goto out_unregister;
	}
	Device.gd->major = major_num;
	Device.gd->first_minor = 0;
	Device.gd->fops = &sbd_ops;
	Device.gd->private_data = &Device;
	strcpy(Device.gd->disk_name, "sbd0");
	/* Capacity is expressed in 512-byte kernel sectors, not logical blocks. */
	set_capacity(Device.gd,
			(sector_t)nsectors * (logical_block_size / KERNEL_SECTOR_SIZE));
	Device.gd->queue = Queue;
	add_disk(Device.gd);

	return 0;

out_unregister:
	unregister_blkdev(major_num, "sbd");
out_queue:
	blk_cleanup_queue(Queue);
out:
	vfree(Device.data);
	return ret;
}

/*
 * Module exit point: release what sbd_init() set up. The disk is taken
 * down first, then the major number and request queue are released, and
 * the backing store is freed last.
 */
static void __exit sbd_exit(void)
{
	del_gendisk(Device.gd);			/* pairs with add_disk() */
	put_disk(Device.gd);			/* pairs with alloc_disk() */
	unregister_blkdev(major_num, "sbd");	/* pairs with register_blkdev() */
	blk_cleanup_queue(Queue);		/* pairs with blk_init_queue() */
	vfree(Device.data);			/* pairs with vmalloc() */
}

module_init(sbd_init);
module_exit(sbd_exit);

Makefile

# Out-of-tree build of the sbd module against the running kernel.
obj-m := sbd.o
# Build tree of the running kernel.
KDIR := /lib/modules/$(shell uname -r)/build
PWD := $(shell pwd)

.PHONY: default clean

# M= names the external module directory. It replaces the legacy SUBDIRS=
# spelling and matches the manual build command used in the article.
default:
	$(MAKE) -C $(KDIR) M=$(PWD) modules

# Remove the files generated by the module build.
clean:
	$(MAKE) -C $(KDIR) M=$(PWD) clean

There are two main areas of change compared with Jonathan's original:

  • sbd_request() uses the blk_fetch_request(), blk_rq_pos(), blk_rq_cur_sectors() and __blk_end_request_cur() functions rather than elv_next_request(), req->sector, req->current_nr_sectors and end_request() respectively. The structure of the loop also changes so we handle each sector from the request individually. One outstanding task for me is to investigate whether req->buffer holds all of the data for the entire request, so I can handle it all in one shot, rather than sector-by-sector. My first attempt resulted in the (virtual) machine hanging when I installed the driver, so I clearly need to do some more work in this area!
  • The driver implements the getgeo operation (in sbd_getgeo), rather than ioctl, since blkdev_ioctl now handles HDIO_GETGEO by calling the driver's getgeo function. This is a nice simplification since it moves a copy_to_user call out of each driver and into the kernel.

Before building, ensure you have the kernel source, headers, gcc, make etc - if you've read this far, you likely have all this and/or know how to get it, so I won't spell it all out here. You'll also need to go to the kernel source directory and do the following to prepare your build environment, if you have not already done so:

cd /usr/src/`uname -r`
make oldconfig && make prepare

Now, back in the directory with the sbd source, you can build it:

make -C /lib/modules/`uname -r`/build M=`pwd` modules

You'll see a warning about 'Version' being defined, but not used, but don't worry about that :-). Now we can load the module, partition the ramdisk, make a filesystem, mount it, and create a file:

opensuse:/home/pat/sbd # insmod sbd.ko
opensuse:/home/pat/sbd # fdisk /dev/sbd0
Device contains neither a valid DOS partition table, nor Sun, SGI or OSF disklabel
Building a new DOS disklabel with disk identifier 0x5f93978c.
Changes will remain in memory only, until you decide to write them.
After that, of course, the previous content won't be recoverable.

Warning: invalid flag 0x0000 of partition table 4 will be corrected by w(rite)

Command (m for help): n
Command action
   e   extended
   p   primary partition (1-4)
p
Partition number (1-4): 1
First cylinder (1-16, default 1):
Using default value 1
Last cylinder, +cylinders or +size{K,M,G} (1-16, default 16):
Using default value 16

Command (m for help): w
The partition table has been altered!

Calling ioctl() to re-read partition table.
Syncing disks.
opensuse:/home/pat/sbd # mkfs /dev/sbd0p1
mke2fs 1.41.9 (22-Aug-2009)
Filesystem label=
OS type: Linux
Block size=1024 (log=0)
Fragment size=1024 (log=0)
64 inodes, 504 blocks
25 blocks (4.96%) reserved for the super user
First data block=1
Maximum filesystem blocks=524288
1 block group
8192 blocks per group, 8192 fragments per group
64 inodes per group

Writing inode tables: done
Writing superblocks and filesystem accounting information: done

This filesystem will be automatically checked every 24 mounts or
180 days, whichever comes first.  Use tune2fs -c or -i to override.
opensuse:/home/pat/sbd # mount /dev/sbd0p1 /mnt
opensuse:/home/pat/sbd # echo Hi > /mnt/file1
opensuse:/home/pat/sbd # cat /mnt/file1
Hi
opensuse:/home/pat/sbd # ls -l /mnt
total 13
-rw-r--r-- 1 root root     3 2010-04-29 07:04 file1
drwx------ 2 root root 12288 2010-04-29 07:04 lost+found
opensuse:/home/pat/sbd # umount /mnt
opensuse:/home/pat/sbd # rmmod sbd

Hopefully this all works for you, and is as useful for you as it has been for me. Many thanks to Jonathan for the original version and the excellent LDD3. One final piece of housekeeping - although the comment at the top of sbd.c mentions only GPL, the MODULE_LICENSE macro specifies "Dual BSD/GPL". I am interpreting the original code as being under the dual GPL/BSD license and this version is similarly dual licensed.

UPDATE (Feb 5 2011) See the comment by Michele regarding changes to logical_block_size!

UPDATE (Apr 23 2015) See the comment by Sarge regarding changes for kernel 3.15-rc2 and later

Comments (76) Trackbacks (8)
  1. Nick – I think you’re right on the possibility of spinning, but you wouldn’t want to do __blk_end_request_all() then __blk_end_request_cur(). Since I’m no longer working on this stuff and have no way of testing it, are you able to confirm that the following works?

    static void sbd_request(struct request_queue *q) {
    	struct request *req;
    
    	req = blk_fetch_request(q);
    	while (req != NULL) {
    		// blk_fs_request() was removed in 2.6.36 - many thanks to
    		// Christian Paro for the heads up and fix...
    		//if (!blk_fs_request(req)) {
    		if (req == NULL || (req->cmd_type != REQ_TYPE_FS)) {
    			printk (KERN_NOTICE "Skip non-CMD request\n");
    			__blk_end_request_all(req, -EIO);
    			break;
    		}
    		sbd_transfer(&Device, blk_rq_pos(req), blk_rq_cur_sectors(req),
    				req->buffer, rq_data_dir(req));
    		if ( ! __blk_end_request_cur(req, 0) ) {
    			req = blk_fetch_request(q);
    		}
    	}
    }
    
  2. ayman – no idea – sorry!

  3. Hi ,
    I am new to block device drivers. When I run insmod, the system hangs. If I comment out add_disk(), I can see the driver being inserted. Why is add_disk() causing the hang? I am using Red Hat with kernel 2.6.18.

    Thanks
    Pradeep

  4. Hi Pradeep – the sample code in this blog post is specifically for kernel version 2.6.31 and later. You should use the example at http://lwn.net/Articles/58719/

  5. I’m using 2.6.24. I built this by using the source from http://lwn.net/Articles/58720/. All went well except that fdisk must have changed because I have to use x instead of n to create a new partition (typing n just says to use x). But in the x menu there is no way to create a new partition. Then I checked “man fdisk” but that also does not say how to create the partition.

    Can someone give me additional information on how to create the necessary partition with fdisk from Linux 2.6.24?

  6. Hi Eddy – sorry, I don’t know the ‘x’ command to fdisk. Is there a reason you’re using a four-year old kernel?

  7. Yes, it is simply because I don’t use Linux much and I have a 2.6.24 already loaded. I am trying to get some understanding of, what I call, “driver interface” code … I have always only been at the hardware driver level. If I try to load a later version without any experience I will certainly run into problems which will cost time that I don’t have.

    The x command in fdisk means “extra functionality (experts only)”

  8. Eddy – as I mention in the article, the block driver interface changed significantly in 2.6.31, so any work you’re doing in 2.6.24 will be wasted. Grab a VMware or VirtualBox image of a recent Ubuntu – you’ll have it spun up and working in no time.

  9. Thanks, I didn’t know ready made images were available. This makes life much easier.

  10. excellent article really it helped me alot…….thnx a lot

  11. good work really.

  12. How to understand the block driver

  13. Hi Sushant – it’s a sample driver that implements a RAM disk. What don’t you understand?

  14. Hello,

    Can this driver be used to simulate a swap backing store?

    What I am looking for is a bare-bone example of a block device that can simulate a backing store. I will be later on extend this driver to add my own stuff.

    Thanks.

  15. Hi Hebbo – this is really the inverse of a swap backing store – it’s implementing a block device in RAM. You want to map memory to disk. I don’t think this is very helpful for you.

  16. I tried this code on 2.6.32-38. After insmod, the request function is invoked 12 times for reads, but after running fdisk, sbd0p1 is not created. What might be the issue, and what is the solution?

  17. No idea, Chetan, sorry – I haven’t worked on this stuff in more than four years!

  18. Works great in 3.10.14 kernel. Had to do one fix for very large ramdisk:

    Device.size = (long)nsectors * (long)logical_block_size;

  19. As of kernel version 3.15-rc2 and later, due to commit b4f42e2831ff9b9fa19252265d7c8985d47eefb9 in branch Linux-3.14.y, this is needed in sbd_request:
    Instead of:
    sbd_transfer(&Device, blk_rq_pos(req), blk_rq_cur_sectors(req),
    req->buffer, rq_data_dir(req));

    .. this:
    sbd_transfer(&Device, blk_rq_pos(req), blk_rq_cur_sectors(req),
    bio_data(req->bio), rq_data_dir(req));

  20. with this fix, everything builds and then runs just fine on Ubuntu Utopic, latest kernel for that distribution. Thanks very much for this; great job porting to later kernels.

  21. Just FYI and for completeness: the commit number in linux-stable git repo that removes blk_fs_request and other macros is 33659ebbae262228eef4e0fe990f393d1f0ed941 on August 7, 2010, in 2.6.36-rc1.

  22. Many thanks, Sarge – I’ll add a note in the post!

  23. (trying once more):
    It’s good that you are still in the loop here. I think there is a bug in your sbd_request function.

    You have:
    struct request *req;

    req = blk_fetch_request(q);
    while (req != NULL) {
    // blk_fs_request() was removed in 2.6.36 – many thanks to
    // Christian Paro for the heads up and fix…
    //if (!blk_fs_request(req)) {
    if (req == NULL || (req->cmd_type != REQ_TYPE_FS)) {
    printk (KERN_NOTICE “Skip non-CMD request\n”);
    __blk_end_request_all(req, -EIO);
    continue;
    }
    sbd_transfer(&Device, blk_rq_pos(req), blk_rq_cur_sectors(req),
    bio_data(req->bio), rq_data_dir(req));
    if ( ! __blk_end_request_cur(req, 0) ) {
    req = blk_fetch_request(q);
    }
    }

    in:
    if (req == NULL || (req->cmd_type != REQ_TYPE_FS)) {
    printk (KERN_NOTICE “Skip non-CMD request\n”);
    __blk_end_request_all(req, -EIO);
    continue;
    }

    if req is _not_ NULL, then the request will be ended, but req still has an unmodified non-NULL value, so after the continue it will go to the top of the loop, which will then come right back into this block and perhaps crash the kernel if trying to end an already ended req is not handled right in the kernel. Otherwise, this is an endless loop. Unsure what you want to do here, as this is non-standard code compared with all of the other drivers I have been able to look at. Please advise.

  24. Wow, Sarge – you’re absolutely right. I think a req = blk_fetch_request(q); just before the continue would fix it… What do you think? Unfortunately, I’ve moved on from this work and have no real way to test it…

  25. I don’t know.. it seems to me that the entire request has been ended (call to __blk_end_request_all), so there should be no more sections left to process, and executing “break” instead of “continue” is probably the solution; basically ending request processing for this particular full request. It may not be a good idea to call fetch_request again on an already null or finished request. But, I also see a problem where the top of the loop tests for req != NULL and then immediately in the next if statement, tests for req == NULL. This will never be. So, here, req must be non-NULL.

    In any case, here is my new version. I tried to remove the “__” in front of the blk_end* calls, but that caused multiple kernel panics when the module was loaded, so I don’t know what that is all about, as the kernel source I have (Ubuntu 3.19.0-15-generic) shows identical code for both functions. No time to track that one down, and this works as it is.

    This new version is tested and it works, not disturbing a healthy system. I also managed to remove the compiler warning for the Version number (which I bumped up). All licenses are the same: dual BSD/GPL. I didn’t change or add to the copyrights because I want to remain anonymous.

    (No way to attach files here on this blog as far as I can see, so I’ll just post the text. You’ll need to reformat as you wish, especially if you want to re-post this in the main article.)
    — code starts —

    /*
    * A sample, extra-simple block driver. Updated for kernel 2.6.31.
    *
    * (C) 2003 Eklektix, Inc.
    * (C) 2010 Pat Patterson
    * Redistributable under the terms of the GNU GPL.
    */

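    #include <linux/module.h>
    #include <linux/moduleparam.h>
    #include <linux/init.h>

    #include <linux/kernel.h> /* printk() */
    #include <linux/fs.h> /* everything... */
    #include <linux/errno.h> /* error codes */
    #include <linux/types.h> /* size_t */
    #include <linux/vmalloc.h>
    #include <linux/genhd.h>
    #include <linux/blkdev.h>
    #include <linux/hdreg.h>

    #include <linux/version.h>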

    MODULE_LICENSE(“Dual BSD/GPL”);
    static char *Version __attribute__((unused)) = “1.5”;

    static int major_num = 0;
    module_param(major_num, int, 0);
    static int logical_block_size = 512;
    module_param(logical_block_size, int, 0);
    static int nsectors = 1024; /* How big the drive is */
    module_param(nsectors, int, 0);

    /*
    * We can tweak our hardware sector size, but the kernel talks to us
    * in terms of small sectors, always.
    */
    #define KERNEL_SECTOR_SIZE 512

    /*
    * Our request queue.
    */
    static struct request_queue *Queue;

    /*
    * The internal representation of our device.
    */
    static struct sbd_device {
    unsigned long size;
    spinlock_t lock;
    u8 *data;
    struct gendisk *gd;
    } Device;

    /*
    * Handle an I/O request.
    */
    static void sbd_transfer(struct sbd_device *dev, sector_t sector,
    unsigned long nsect, char *buffer, int write) {
    unsigned long offset = sector * logical_block_size;
    unsigned long nbytes = nsect * logical_block_size;

    if ((offset + nbytes) > dev->size) {
    printk (KERN_NOTICE “sbd: Beyond-end write (%ld %ld)\n”, offset, nbytes);
    return;
    }
    if (write)
    memcpy(dev->data + offset, buffer, nbytes);
    else
    memcpy(buffer, dev->data + offset, nbytes);
    }

    static void sbd_request(struct request_queue *q) {
    struct request *req;

    req = blk_fetch_request(q);
    while (req != NULL) {
    // blk_fs_request() was removed in 2.6.36 – many thanks to
    // Christian Paro for the heads up and fix…
    //if (!blk_fs_request(req)) {
    if (req->cmd_type != REQ_TYPE_FS) {
    printk (KERN_NOTICE “Skip non-CMD request\n”);
    __blk_end_request_all(req, -EIO);
    req = blk_fetch_request(q);
    continue;
    }

    sbd_transfer(&Device, blk_rq_pos(req), blk_rq_cur_sectors(req),
    #if LINUX_VERSION_CODE buffer,
    #else
    bio_data(req->bio),
    #endif
    rq_data_dir(req));
    if ( ! __blk_end_request_cur(req, 0) ) {
    req = blk_fetch_request(q);
    }
    }
    }

    /*
    * The HDIO_GETGEO ioctl is handled in blkdev_ioctl(), which
    * calls this. We need to implement getgeo, since we can’t
    * use tools such as fdisk to partition the drive otherwise.
    */
    int sbd_getgeo(struct block_device * block_device, struct hd_geometry * geo) {
    long size;

    /* We have no real geometry, of course, so make something up. */
    size = Device.size * (logical_block_size / KERNEL_SECTOR_SIZE);
    geo->cylinders = (size & ~0x3f) >> 6;
    geo->heads = 4;
    geo->sectors = 16;
    geo->start = 0;
    return 0;
    }

    /*
    * The device operations structure.
    */
    static struct block_device_operations sbd_ops = {
    .owner = THIS_MODULE,
    .getgeo = sbd_getgeo
    };

    /*
     * Module entry point: allocate the ramdisk, create its request queue,
     * register the block major and publish the disk as "sbd0".
     * Returns 0 on success and -ENOMEM on any failure.
     */
    static int __init sbd_init(void) {
    	/*
    	 * Set up our internal device.
    	 */
    	Device.size = nsectors * logical_block_size;
    	spin_lock_init(&Device.lock);
    	Device.data = vmalloc(Device.size);
    	if (Device.data == NULL)
    		return -ENOMEM;
    	/*
    	 * Get a request queue.
    	 */
    	Queue = blk_init_queue(sbd_request, &Device.lock);
    	if (Queue == NULL)
    		goto out;
    	blk_queue_logical_block_size(Queue, logical_block_size);
    	/*
    	 * Get registered.
    	 */
    	major_num = register_blkdev(major_num, "sbd");
    	if (major_num < 0) {
    		printk(KERN_WARNING "sbd: unable to get major number\n");
    		goto out;
    	}
    	/*
    	 * And the gendisk structure.
    	 */
    	Device.gd = alloc_disk(16);
    	if (!Device.gd)
    		goto out_unregister;
    	Device.gd->major = major_num;
    	Device.gd->first_minor = 0;
    	Device.gd->fops = &sbd_ops;
    	Device.gd->private_data = &Device;
    	strcpy(Device.gd->disk_name, "sbd0");
    	set_capacity(Device.gd, nsectors);
    	Device.gd->queue = Queue;
    	add_disk(Device.gd);

    	return 0;

    out_unregister:
    	unregister_blkdev(major_num, "sbd");
    out:
    	vfree(Device.data);
    	return -ENOMEM;
    }

    static void __exit sbd_exit(void)
    {
    del_gendisk(Device.gd);
    put_disk(Device.gd);
    unregister_blkdev(major_num, “sbd”);
    blk_cleanup_queue(Queue);
    vfree(Device.data);
    }

    module_init(sbd_init);
    module_exit(sbd_exit);

    — code ends —

  26. ok, that messed up the formatting completely. Also, some sort of issue with open and closed angle brackets. All the include statements are the same, but linux/version.h needs to be added.


Leave a comment