Superpatterns Pat Patterson on the Cloud, Identity and Single Malt Scotch

4May/1065

A Simple Block Driver for Linux Kernel 2.6.31

Programming Amazon Web Services

Linux Device Drivers, 3rd Edition

My current work involves writing my first Linux block device driver. Going to the web to find a sample, I discovered Jonathan Corbet's Simple Block Driver article with its associated block driver example code. It's a nice succinct implementation of a ramdisk - pretty much the simplest working block device. There's only one problem, though: the article was written in 2003, when kernel 2.6.0 was the new kid on the block. Trying to build it on openSUSE 11.2 with kernel 2.6.31 just produced a slew of compile errors. A bit of research revealed that there were major changes to the kernel block device interface in 2.6.31, so I would have to port the example to get it working.

About a day and a half of poring through the kernel source and the excellent LDD3 (hardcopy) later, I had a running simple block driver for kernel 2.6.31. I've also tested it successfully on SUSE 11 SP1 Beta, which uses kernel 2.6.32. Here's the code, followed by instructions for getting it working.

sbd.c

/*
 * A sample, extra-simple block driver. Updated for kernel 2.6.31.
 *
 * (C) 2003 Eklektix, Inc.
 * (C) 2010 Pat Patterson <pat at superpat dot com>
 * Redistributable under the terms of the GNU GPL.
 */

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>

#include <linux/kernel.h> /* printk() */
#include <linux/fs.h>     /* everything... */
#include <linux/errno.h>  /* error codes */
#include <linux/types.h>  /* size_t */
#include <linux/vmalloc.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>

MODULE_LICENSE("Dual BSD/GPL");
/* Driver version string; not referenced anywhere else in this file. */
static char *Version = "1.4";

/*
 * Load-time module parameters.
 *
 * major_num:          major number handed to register_blkdev(); the default
 *                     of 0 leaves the choice of major to the kernel.
 * logical_block_size: block size advertised on the request queue, in bytes.
 * nsectors:           number of sectors; used both to size the backing
 *                     store (nsectors * logical_block_size bytes) and as the
 *                     capacity reported for the disk.
 */
static int major_num = 0;
module_param(major_num, int, 0);
static int logical_block_size = 512;
module_param(logical_block_size, int, 0);
static int nsectors = 1024; /* How big the drive is, in sectors */
module_param(nsectors, int, 0);

/*
 * We can tweak our hardware sector size, but the kernel talks to us
 * in terms of small sectors, always.
 */
#define KERNEL_SECTOR_SIZE 512

/*
 * Our request queue, created in sbd_init() and torn down in sbd_exit().
 */
static struct request_queue *Queue;

/*
 * The internal representation of our device.  There is exactly one
 * instance, the file-scope object Device.
 */
static struct sbd_device {
	unsigned long size;	/* size of the backing store, in bytes */
	spinlock_t lock;	/* lock handed to blk_init_queue() for the request queue */
	u8 *data;		/* vmalloc()ed backing store of 'size' bytes */
	struct gendisk *gd;	/* gendisk registered for this device */
} Device;

/*
 * Copy one chunk of an I/O request between the ramdisk and a kernel buffer.
 *
 * dev:    device whose backing store (dev->data, dev->size bytes) is used
 * sector: starting position, in KERNEL_SECTOR_SIZE (512-byte) units
 * nsect:  number of KERNEL_SECTOR_SIZE sectors to copy
 * buffer: source (write) or destination (read) buffer
 * write:  non-zero copies buffer -> device, zero copies device -> buffer
 *
 * The kernel always expresses positions and lengths in 512-byte sectors,
 * whatever logical block size was configured on the queue, so the byte
 * offset and length are derived from KERNEL_SECTOR_SIZE and not from
 * logical_block_size.  A transfer that would run past the end of the
 * backing store is logged and dropped without copying anything.
 */
static void sbd_transfer(struct sbd_device *dev, sector_t sector,
		unsigned long nsect, char *buffer, int write) {
	unsigned long offset = sector * KERNEL_SECTOR_SIZE;
	unsigned long nbytes = nsect * KERNEL_SECTOR_SIZE;

	/* Written so that the check itself cannot wrap around. */
	if (offset > dev->size || nbytes > dev->size - offset) {
		printk (KERN_NOTICE "sbd: Beyond-end write (%lu %lu)\n", offset, nbytes);
		return;
	}
	if (write)
		memcpy(dev->data + offset, buffer, nbytes);
	else
		memcpy(buffer, dev->data + offset, nbytes);
}

/*
 * Request function installed on the queue by blk_init_queue().
 *
 * Drains the queue: each filesystem request is served one chunk at a time
 * through sbd_transfer(); any other kind of request is failed with -EIO.
 *
 * q: the request queue to service.
 */
static void sbd_request(struct request_queue *q) {
	struct request *req;

	req = blk_fetch_request(q);
	while (req != NULL) {
		// blk_fs_request() was removed in 2.6.36, so the command type is
		// tested directly - many thanks to Christian Paro for the heads
		// up and fix.
		if (req->cmd_type != REQ_TYPE_FS) {
			printk (KERN_NOTICE "Skip non-CMD request\n");
			__blk_end_request_all(req, -EIO);
			/*
			 * The request is finished and must not be touched again:
			 * move on to the next one before re-testing the loop
			 * condition, otherwise the loop spins on a dead request.
			 */
			req = blk_fetch_request(q);
			continue;
		}
		sbd_transfer(&Device, blk_rq_pos(req), blk_rq_cur_sectors(req),
				req->buffer, rq_data_dir(req));
		/*
		 * Complete the current chunk.  A false return means the whole
		 * request is done, so fetch the next one; otherwise loop again
		 * on the same request to handle its next chunk.
		 */
		if ( ! __blk_end_request_cur(req, 0) ) {
			req = blk_fetch_request(q);
		}
	}
}

/*
 * The HDIO_GETGEO ioctl is handled in blkdev_ioctl(), which
 * calls this. We need to implement getgeo, since we can't
 * use tools such as fdisk to partition the drive otherwise.
 *
 * block_device: device being queried (unused; there is only one device).
 * geo:          geometry structure to fill in.
 *
 * Always returns 0.
 */
int sbd_getgeo(struct block_device * block_device, struct hd_geometry * geo) {
	long size;

	/*
	 * We have no real geometry, of course, so make something up.
	 * Device.size is in bytes; geometry is counted in 512-byte sectors.
	 */
	size = Device.size / KERNEL_SECTOR_SIZE;
	/* 4 heads * 16 sectors = 64 sectors per cylinder, hence the shift by 6. */
	geo->cylinders = (size & ~0x3f) >> 6;
	geo->heads = 4;
	geo->sectors = 16;
	geo->start = 0;
	return 0;
}

/*
 * Operations table attached to the gendisk in sbd_init().  Only the
 * geometry query is implemented by this driver.
 */
static struct block_device_operations sbd_ops = {
	.owner  = THIS_MODULE,
	.getgeo = sbd_getgeo,
};

/*
 * Module load: allocate the ramdisk, create the request queue, register
 * the block major and publish the disk as "sbd0".
 *
 * Returns 0 on success.  On failure everything acquired so far is released
 * in reverse order and a negative errno is returned: -EINVAL for unusable
 * module parameters, -ENOMEM for allocation failures, or the error reported
 * by register_blkdev().
 */
static int __init sbd_init(void) {
	int ret;

	if (nsectors <= 0 || logical_block_size <= 0) {
		printk(KERN_WARNING "sbd: invalid nsectors or logical_block_size\n");
		return -EINVAL;
	}
	/*
	 * Set up our internal device.  The multiplication is done in
	 * unsigned long so that large parameter values do not overflow int.
	 */
	Device.size = (unsigned long)nsectors * logical_block_size;
	spin_lock_init(&Device.lock);
	Device.data = vmalloc(Device.size);
	if (Device.data == NULL)
		return -ENOMEM;
	/* Start from a zeroed disk rather than exposing stale kernel memory. */
	memset(Device.data, 0, Device.size);
	/*
	 * Get a request queue.
	 */
	Queue = blk_init_queue(sbd_request, &Device.lock);
	if (Queue == NULL) {
		ret = -ENOMEM;
		goto out_free;
	}
	blk_queue_logical_block_size(Queue, logical_block_size);
	/*
	 * Get registered.  register_blkdev() returns the allocated major when
	 * asked for a dynamic one (major_num == 0) but returns 0 on success
	 * when a specific major was requested, so only a positive result may
	 * replace major_num.
	 */
	ret = register_blkdev(major_num, "sbd");
	if (ret < 0) {
		printk(KERN_WARNING "sbd: unable to get major number\n");
		goto out_queue;
	}
	if (ret > 0)
		major_num = ret;
	/*
	 * And the gendisk structure.
	 */
	Device.gd = alloc_disk(16);
	if (!Device.gd) {
		ret = -ENOMEM;
		goto out_unregister;
	}
	Device.gd->major = major_num;
	Device.gd->first_minor = 0;
	Device.gd->fops = &sbd_ops;
	Device.gd->private_data = &Device;
	strcpy(Device.gd->disk_name, "sbd0");
	set_capacity(Device.gd, nsectors);
	Device.gd->queue = Queue;
	add_disk(Device.gd);

	return 0;

out_unregister:
	unregister_blkdev(major_num, "sbd");
out_queue:
	blk_cleanup_queue(Queue);
out_free:
	vfree(Device.data);
	return ret;
}

/*
 * Module unload: withdraw the disk, give back the block major, tear down
 * the request queue and finally release the ramdisk memory.
 */
static void __exit sbd_exit(void)
{
	struct gendisk *disk = Device.gd;

	/* Remove the disk from the system, then drop our reference to it. */
	del_gendisk(disk);
	put_disk(disk);

	unregister_blkdev(major_num, "sbd");
	blk_cleanup_queue(Queue);

	/* Nothing can reach the backing store any more; free it last. */
	vfree(Device.data);
}

/* Register sbd_init()/sbd_exit() as the module's load and unload hooks. */
module_init(sbd_init);
module_exit(sbd_exit);

Makefile

# Out-of-tree kbuild makefile for the sbd module.
obj-m := sbd.o
KDIR := /lib/modules/$(shell uname -r)/build
PWD := $(shell pwd)

.PHONY: default clean

# M= tells kbuild to build the external module in this directory; it is
# the same option used on the manual command line and replaces the
# deprecated SUBDIRS= spelling.
default:
	$(MAKE) -C $(KDIR) M=$(PWD) modules

clean:
	$(MAKE) -C $(KDIR) M=$(PWD) clean

There are two main areas of change compared with Jonathan's original:

  • sbd_request() uses the blk_fetch_request(), blk_rq_pos(), blk_rq_cur_sectors() and __blk_end_request_cur() functions rather than elv_next_request(), req->sector, req->current_nr_sectors and end_request() respectively. The structure of the loop also changes so we handle each sector from the request individually. One outstanding task for me is to investigate whether req->buffer holds all of the data for the entire request, so I can handle it all in one shot, rather than sector-by-sector. My first attempt resulted in the (virtual) machine hanging when I installed the driver, so I clearly need to do some more work in this area!
  • The driver implements the getgeo operation (in sbd_getgeo), rather than ioctl, since blkdev_ioctl now handles HDIO_GETGEO by calling the driver's getgeo function. This is a nice simplification since it moves a copy_to_user call out of each driver and into the kernel.

Before building, ensure you have the kernel source, headers, gcc, make etc - if you've read this far, you likely have all this and/or know how to get it, so I won't spell it all out here. You'll also need to go to the kernel source directory and do the following to prepare your build environment, if you have not already done so:

cd /usr/src/`uname -r`
make oldconfig && make prepare

Now, back in the directory with the sbd source, you can build it:

make -C /lib/modules/`uname -r`/build M=`pwd` modules

You'll see a warning about 'Version' being defined, but not used, but don't worry about that :-). Now we can load the module, partition the ramdisk, make a filesystem, mount it, and create a file:

opensuse:/home/pat/sbd # insmod sbd.ko
opensuse:/home/pat/sbd # fdisk /dev/sbd0
Device contains neither a valid DOS partition table, nor Sun, SGI or OSF disklabel
Building a new DOS disklabel with disk identifier 0x5f93978c.
Changes will remain in memory only, until you decide to write them.
After that, of course, the previous content won't be recoverable.

Warning: invalid flag 0x0000 of partition table 4 will be corrected by w(rite)

Command (m for help): n
Command action
   e   extended
   p   primary partition (1-4)
p
Partition number (1-4): 1
First cylinder (1-16, default 1):
Using default value 1
Last cylinder, +cylinders or +size{K,M,G} (1-16, default 16):
Using default value 16

Command (m for help): w
The partition table has been altered!

Calling ioctl() to re-read partition table.
Syncing disks.
opensuse:/home/pat/sbd # mkfs /dev/sbd0p1
mke2fs 1.41.9 (22-Aug-2009)
Filesystem label=
OS type: Linux
Block size=1024 (log=0)
Fragment size=1024 (log=0)
64 inodes, 504 blocks
25 blocks (4.96%) reserved for the super user
First data block=1
Maximum filesystem blocks=524288
1 block group
8192 blocks per group, 8192 fragments per group
64 inodes per group

Writing inode tables: done
Writing superblocks and filesystem accounting information: done

This filesystem will be automatically checked every 24 mounts or
180 days, whichever comes first.  Use tune2fs -c or -i to override.
opensuse:/home/pat/sbd # mount /dev/sbd0p1 /mnt
opensuse:/home/pat/sbd # echo Hi > /mnt/file1
opensuse:/home/pat/sbd # cat /mnt/file1
Hi
opensuse:/home/pat/sbd # ls -l /mnt
total 13
-rw-r--r-- 1 root root     3 2010-04-29 07:04 file1
drwx------ 2 root root 12288 2010-04-29 07:04 lost+found
opensuse:/home/pat/sbd # umount /mnt
opensuse:/home/pat/sbd # rmmod sbd

Hopefully this all works for you, and is as useful for you as it has been for me. Many thanks to Jonathan for the original version and the excellent LDD3. One final piece of housekeeping - although the comment at the top of sbd.c mentions only GPL, the MODULE_LICENSE macro specifies "Dual BSD/GPL". I am interpreting the original code as being under the dual GPL/BSD license and this version is similarly dual licensed.

UPDATE (Feb 5 2011) See the comment by Michele regarding changes to logical_block_size!

Comments (65) Trackbacks (8)
  1. Hi thanks for the nice explanation.

  2. This is a tremendous help for me while I am trying to work in 2.6.32 kernel and trying to port the driver code in the book (LDD3). BIG THANKS, you saved my time friend.

  3. Thanks, this code was a huge help. One important case it doesn’t handle though is a detach/rmmod while an I/O is in progress.

  4. You’re right, Sidney, but then, it’s really only a simple sample driver :-)

  5. Pat, I’m trying to build a simple char device driver based on LDD3 book.
    But I got a problem in compiling.
    When I call “make”, it produce nothing.
    I have Ubuntu 9.10 (Lucid Lynx) kernel 2.6.32(standard fresh installed).
    Could you give some advices what should I prepare(maybe kernel tree or linux header, etc) first?

    Thanks.

  6. Hi Kyuha,

    Try

    make -C /lib/modules/`uname -r`/build M=`pwd` modules

    Plain ‘make’ isn’t enough with kernel modules.

  7. Should my makefile look like

    obj-m := sbd.o
    KDIR := /lib/modules/$(shell uname -r)/build
    PWD := $(shell pwd)
    default:
    $(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules

  8. Yes – make sure you run the other steps I described, too:

    cd /usr/src/`uname -r`
    make oldconfig && make prepare

  9. Thanks, Pat.

  10. Hi Pat, I got this error when trying to do “make oldconfig && make prepare”

    root@kyu:/usr/src/linux-headers-2.6.32-21# make oldconfig && make prepare
    scripts/kconfig/conf -o arch/x86/Kconfig
    #
    # configuration written to .config
    #
    scripts/kconfig/conf -s arch/x86/Kconfig
    CHK include/linux/version.h
    CHK include/linux/utsrelease.h
    SYMLINK include/asm -> include/asm-x86
    make[1]: *** No rule to make target `kernel/bounds.c’, needed by `kernel/bounds.s’. Stop.
    make: *** [prepare0] Error 2
    root@kyu:/usr/src/linux-headers-2.6.32-21#

    Can you please point what is wrong with my work?

  11. Hi Kyuha – You probably need to install the kernel sources package.

  12. Hi Pat,

    Thanks..that was a great starting point for me…Please suggest any enhancement that I can do on this code where I can go much deeper into device driver modules domain…

    Also, on command line we have M=`pwd` where M=current directory. What this command option informs?

    I am new to device driver programming. As per another member who commented above, it doesn’t handle though is a detach/rmmod while an I/O is in progress. What does it exactly mean? How do we test it?

    Please advice,

    Thanks

  13. Hi Veb – the M=`pwd` tells the Linux build system to build modules in the current directory. Handling detach/rmmod while an I/O is in progress is beyond the scope of a simple example. If you’re looking for more comprehensive examples, then /usr/src/linux/drivers is a good place to start.

  14. The linux documentation itself is not updated. Thanks a lot for the help. It is a good starting point for beginners like me. Now, I need to figure out, how to go ahead from here…

  15. First of all, thanks a lot for this updated version. Linux kernel seems to change at a dreadful speed.
    I had a few issues with the driver, in that it will not work if you change the logical_block_size. The main reason is that in the sbd_transfer function
    unsigned long offset = sector * logical_block_size;
    unsigned long nbytes = nsect * logical_block_size;
    should become:
    unsigned long offset = sector * KERNEL_SECTOR_SIZE;
    unsigned long nbytes = nsect * KERNEL_SECTOR_SIZE;

    Also the following line in sbd_getgeo is fishy:
    size = Device.size * (logical_block_size / KERNEL_SECTOR_SIZE);
    size is probably intended to be in 512 byte sectors, but Device.size is the size in bytes…
    I would change it with
    size = Device.size / KERNEL_SECTOR_SIZE;

    (I haven’t tested this last one.)

    Thanks again for the huge help,

    Michele

  16. Hi Michele – glad you found this useful, and thanks for the advice regarding logical_block_size – I no longer have everything set up to test this, but I’ve put a note at the end of the blog entry directing folks to your comment. Thanks for taking the time :-)

  17. Thanks a lot, I was stuck using APIs from the older version of Linux and didnt even realize until i saw this page that elv_next_request was outdated.. u saved.. my college project..

  18. Hi Akshat – glad you found it useful – thanks for the comment! :-)

  19. It appears that “blk_fs_request” has been removed in the 2.6.36 kernel, causing sbd.c to not compile as written.

    Replacing the line:

    if (!blk_fs_request(req)) {

    …with

    if (req == NULL || (req->cmd_type != REQ_TYPE_FS)) {

    seems to fix this, on the lines of a similar patch I found for the cloop driver in the Knoppix codebase: http://lists.debian.org/debian-knoppix/2010/10/msg00012.html

  20. Thanks, Christian – I just updated the entry appropriately. Glad to see folks are still finding this useful!

  21. hi, pat.

    I’m trying to compile a block device driver and when i do the make it gives me these errors.

    root@ubuntu:/home/sergio12345/Desktop/practica# make
    make -C /lib/modules/2.6.30-020630-generic/build SUBDIRS=/home/sergio12345/Desktop/practica modules
    make[1]: Entering directory `/usr/src/linux-headers-2.6.30-020630-generic’
    CC [M] /home/sergio12345/Desktop/practica/bd.o

    /home/sergio12345/Desktop/practica/bd.c: In function ‘sbd_request’:
    /home/sergio12345/Desktop/practica/bd.c:74: error: implicit declaration of function ‘blk_fetch_request’
    /home/sergio12345/Desktop/practica/bd.c:74: warning: assignment makes pointer from integer without a cast
    /home/sergio12345/Desktop/practica/bd.c:81: error: implicit declaration of function ‘__blk_end_request_all’
    /home/sergio12345/Desktop/practica/bd.c:84: error: implicit declaration of function ‘blk_rq_pos’
    /home/sergio12345/Desktop/practica/bd.c:84: error: implicit declaration of function ‘blk_rq_cur_sectors’
    /home/sergio12345/Desktop/practica/bd.c:86: error: implicit declaration of function ‘__blk_end_request_cur’
    /home/sergio12345/Desktop/practica/bd.c:87: warning: assignment makes pointer from integer without a cast
    /home/sergio12345/Desktop/practica/bd.c: In function ‘sbd_init’:
    /home/sergio12345/Desktop/practica/bd.c:132: error: implicit declaration of function ‘blk_queue_logical_block_size’
    make[2]: *** [/home/sergio12345/Desktop/practica/bd.o] Error 1
    make[1]: *** [_module_/home/sergio12345/Desktop/practica] Error 2
    make[1]: Leaving directory `/usr/src/linux-headers-2.6.30-020630-generic’
    make: *** [default] Error 2
    root@ubuntu:/home/sergio12345/Desktop/practica#

    thanks for ur help.

  22. Hi Sergio – it looks like you’re working with kernel 2.6.30; this recipe is for 2.6.31 and higher. Take a look at the original sample at http://lwn.net/Articles/58720/ – it might work for you.

    Cheers,

    Pat

  23. This has been a great help!! I am writing a block driver for a solid state disk on our computer boards. Was stuck with errors for elv_next_request and end_request. This saved me a lot of time!!! Thanks.

  24. Hi Paul D – you’re very welcome – it’s great to see so many people have found this useful!

  25. I have built my driver and allowed the major number to be allocated automatically. My system allocates number 251 and it is displayed under /proc/devices. When I look under /dev it displays major number 259. Do you know why the difference?

  26. Never mind. I had my alloc_disk argument set to zero.

  27. hi.. thanx for sharing code. i have a doubt that if i want to use make_request function for this, which will eliminate the io queuing. so is this happen in different way in 2.6.31.

  28. Hi Neeraj – Yes, you could use make_request if it is a better choice for what you’re doing. My purpose here was simply to port the sample to 2.6.31 rather than give a comprehensive account of block driver creation.

  29. Hi,

    Thanks for sharing your experience.

    1. I am trying out my first Block Device and followed your example and it is asking for a filesystem type on mounting the sbd0p1 device:
    yangcomputer:~/exp # mount /dev/sbd0p1 /mnt
    mount: you must specify the filesystem type

    2. This example applies to a ramdisk. Do you have any idea how I can implement a similar block device on a real hard disk? Which part of the code do I need to change?

    Thanks,

    Yan

  30. Hi Yan,

    1. Did you do fdisk and mkfs to create a partition and filesystem respectively? It sounds like you might have forgotten one or both steps.

    2. I don’t have any experience implementing a driver for a real hard disk – when I did this I was creating a network block device. You should probably look at the SCSI and IDE block drivers in the Linux source tree.

    Cheers,

    Pat

  31. Hi Pat,

    I did use fdisk to format and partititon /dev/sbd0 however it complained about:
    fdisk /dev/sbd0
    You must set cylinders.
    You can do this from the extra functions menu.

    Command (m for help):

    So I went into Extra menu to set the cylinders to 1.

    Am I doing it right?

    Thanks,

    Yan

  32. Hi Yan – I think you can set cylinders to any value – it doesn’t really matter, since it’s not a real disk.

  33. thanks so much !!!

  34. everything goes so well with it. thx a lot

  35. i want a keypad and lcd device driver for mpc85xx please help me about this

  36. Hi Gangadhar – I know nothing of keypad and lcd device drivers. Good luck!

  37. Hello Pat!

    Thank you for this code, it helped in understanding block drivers better.

    However, when I try to insmod the driver, Segmentation Fault occurs. Please help.

    Thanks!

  38. Hi Soumya – not sure what might be going wrong. Is there a stack trace in the kernel log? See this StackOverflow question for some hints.

  39. Hello Pat
    after going these(Command (m for help): w
    The partition table has been altered!

    Calling ioctl() to re-read partition table.
    Syncing disks.) command when i reach at

    opensuse:/home/pat/sbd # mkfs /dev/sbd0p1
    and when i put that path (actually my system path is-deepak@deepak-Satellite-C665:~/ddd/dp$ ls
    Makefile modules.order Module.symvers sbd.c sbd.ko sbd.mod.c sbd.mod.o sbd.o
    deepak@deepak-Satellite-C665:~/ddd/dp$ pwd
    /home/deepak/ddd/dp)
    it showing deepak@deepak-Satellite-C665:~$ /home/deepak/ddd/dp/sbd # mkfs /dev/sbd0p2
    bash: /home/deepak/ddd/dp/sbd: No such file or directory
    deepak@deepak-Satellite-C665:~$
    that error what i have to do..

  40. Hi,
    when i mount this using #sudo mount /dev/sbd0p2
    i get an error and it is
    180 days, whichever comes first. Use tune2fs -c or -i to override.
    deepak@deepak-Satellite-C665:~/ddd/dp$ sudo mount /dev/sbd0p2
    mount: can’t find /dev/sbd0p2 in /etc/fstab or /etc/mtab
    i need help

  41. Hi Deepak – I can’t see anything obviously wrong in what you’re doing. Go back over the instructions and double check that you’ve followed them exactly. What version of Linux are you using?

  42. Sir,
    I have take your code and cpmpile but I get error that ” fatal error: linux/module.h: No such file or directory
    compilation terminated.” So, what can I do?

  43. Maulik – what kernel version are you working on?

  44. Thanks for the updated driver , nice work .

  45. Hi Pat, looks like I found a bug :) You should change line 140 like:

    major_num = register_blkdev(major_num, “sbd”);
    if (major_num < 0) { // use < instead of <=

    register_blkdev returns 0 when user requests a specific (non-zero) major number.

  46. Hi Pat,

    I am not understanding one thing that what difference it makes if we execute blk_fetch_request function inside while loop like LDD3.

    If I do so, fdisk just hang.

  47. Thanks, Radek – I made that change.

  48. tej – could you post an example of code that causes fdisk to hang?

  49. Hey Pat,

    Perhaps I’m missing something but….

    static void sbd_request(struct request_queue *q) {
    struct request *req;

    req = blk_fetch_request(q);
    while (req != NULL) {
    /* Are you sure you want to continue in the following block */
    if (req == NULL || (req->cmd_type != REQ_TYPE_FS)) {
    printk (KERN_NOTICE “Skip non-CMD request\n”);
    __blk_end_request_all(req, -EIO);
    continue; // this will skip an iteration and spin surely?
    }
    sbd_transfer(&Device, blk_rq_pos(req), blk_rq_cur_sectors(req),
    req->buffer, rq_data_dir(req));
    if ( ! __blk_end_request_cur(req, 0) ) {
    req = blk_fetch_request(q);
    }
    }
    }

    Would wrapping it in an if/else not be better such as:

    static void sbd_request(struct request_queue *q) {
    struct request *req;

    req = blk_fetch_request(q);
    while (req != NULL) {
    /* Are you sure you want to continue in the following block */
    if (req == NULL || (req->cmd_type != REQ_TYPE_FS)) {
    printk (KERN_NOTICE “Skip non-CMD request\n”);
    __blk_end_request_all(req, -EIO);
    } else {
    sbd_transfer(&Device, blk_rq_pos(req), blk_rq_cur_sectors(req),
    req->buffer, rq_data_dir(req));
    }
    if ( ! __blk_end_request_cur(req, 0) ) {
    req = blk_fetch_request(q);
    }
    }
    }

  50. Thank you for this, great job and it works properly.
    I try to run this driver on two machines, to impliment something like NBD “network block device”
    the problem is that if I am in the method sbd_transfer, “if I try to open a udp socket the system crashes and blocks completely. If i do this another where there is no problem
    do you have any idea??


Leave a comment