From 615c0903b8d73cdd2bc985a814cd37c00e29dc95 Mon Sep 17 00:00:00 2001 From: Krzysztof Mazur Date: Wed, 15 Dec 2010 21:10:18 +0100 Subject: [PATCH 01/84] initial support for LSBD LSBD (Log-Structured Block Device) is a virtual block device on top of a real block device that implements the idea of log-structured writes. --- Documentation/Configure.help | 21 + Documentation/devices.txt | 9 +- drivers/block/Config.in | 1 + drivers/block/Makefile | 1 + drivers/block/lsbd.c | 1123 ++++++++++++++++++++++++++++++++++++++++++ include/linux/lsbd.h | 87 ++++ include/linux/major.h | 2 + 7 files changed, 1243 insertions(+), 1 deletion(-) create mode 100644 drivers/block/lsbd.c create mode 100644 include/linux/lsbd.h diff --git a/Documentation/Configure.help b/Documentation/Configure.help index 6d51d63..9bad7a5 100644 --- a/Documentation/Configure.help +++ b/Documentation/Configure.help @@ -552,6 +552,27 @@ CONFIG_BLK_DEV_LOOP Most users will answer N here. +Log-Structured Block Device +CONFIG_BLK_DEV_LSBD + Saying Y here will allow you to use the Log-Structured Block Device + on top of another block device. + + The Log-Structured Block Device is a middle layer that provides a + useful block device on top of a device with fast linear writes and slow + random writes (for instance most CompactFlash cards). It also provides high + reliability via mirroring and very high tolerance for bad sectors. + + For performance reasons the logical size of the device should be about + 25% smaller (about 63% smaller when mirroring is used) than the + lower-level device. + + If you want to compile this driver as a module ( = code which can be + inserted in and removed from the running kernel whenever you want), + say M here and read <file:Documentation/modules.txt>. The module + will be called lsbd.o. + + Most users will answer N here. 
+ Micro Memory MM5415 Battery Backed RAM support (EXPERIMENTAL) CONFIG_BLK_DEV_UMEM Saying Y here will include support for the MM5415 family of diff --git a/Documentation/devices.txt b/Documentation/devices.txt index ce6280a..bd4ffa3 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt @@ -2581,7 +2581,14 @@ Your cooperation is appreciated. 231-239 UNASSIGNED -240-254 LOCAL/EXPERIMENTAL USE +240 block Log-Structured Block Device + 0 = /dev/lsbd0 First device, whole device + 1 = /dev/lsbd0p0 First device, first partition + ... + 32 = /dev/lsbd1 Second device, whole device + ... + +241-254 LOCAL/EXPERIMENTAL USE 255 RESERVED diff --git a/drivers/block/Config.in b/drivers/block/Config.in index 7560ff9..8c18e96 100644 --- a/drivers/block/Config.in +++ b/drivers/block/Config.in @@ -43,6 +43,7 @@ dep_tristate 'Promise SATA SX8 support' CONFIG_BLK_DEV_SX8 $CONFIG_PCI tristate 'Loopback device support' CONFIG_BLK_DEV_LOOP dep_tristate 'Network block device support' CONFIG_BLK_DEV_NBD $CONFIG_NET +tristate 'Log-Structured Block Device' CONFIG_BLK_DEV_LSBD tristate 'RAM disk support' CONFIG_BLK_DEV_RAM if [ "$CONFIG_BLK_DEV_RAM" = "y" -o "$CONFIG_BLK_DEV_RAM" = "m" ]; then diff --git a/drivers/block/Makefile b/drivers/block/Makefile index d4b1663..be6edb4 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_ATARI_SLM) += acsi_slm.o obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o obj-$(CONFIG_BLK_DEV_RAM) += rd.o obj-$(CONFIG_BLK_DEV_LOOP) += loop.o +obj-$(CONFIG_BLK_DEV_LSBD) += lsbd.o obj-$(CONFIG_BLK_DEV_PS2) += ps2esdi.o obj-$(CONFIG_BLK_DEV_XD) += xd.o obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o diff --git a/drivers/block/lsbd.c b/drivers/block/lsbd.c new file mode 100644 index 0000000..db4ce7d --- /dev/null +++ b/drivers/block/lsbd.c @@ -0,0 +1,1123 @@ +/* + * ar3c-kernel - lsbd.c + * + * Log-Structured Block Device + * + * Copyright (C) 2010 Krzysztof Mazur + * + * This program is free software; you can redistribute it 
and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * NOTE(review): the header names of the #include lines below are missing
+ * in this copy of the patch (the <...> parts appear to have been stripped
+ * in transit) - restore them from the original commit before applying.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+#define MAJOR_NR LSBD_MAJOR
+
+/* number of LSBD devices; each one owns (1 << PART_BITS) minors */
+#define LSBD_MAX 8
+#define PART_BITS 5
+
+/*
+ * Write queue indices. lsbd_request_dequeue() scans the queues in index
+ * order, so a lower value is served first.
+ */
+enum {
+	LSBD_QUEUE_MOVE,
+	LSBD_QUEUE_HIGH,
+	LSBD_QUEUE_NORMAL,
+	LSBD_QUEUE_COUNT, /* must be last */
+};
+
+/* per-device state, one entry of lsbd_dev[] */
+struct lsbd {
+	unsigned int id;		/* index in lsbd_dev[] */
+	kdev_t dev;			/* lower-level device, see lsbd_set_dev() */
+	unsigned int sector_size; /* size of sector */
+	unsigned int sectors_per_block;
+	unsigned int blocks;		/* number of blocks on the device */
+	unsigned int blksize;		/* block size of the lower-level device */
+	spinlock_t lock;		/* taken while p->dev is changed */
+	struct semaphore mutex;		/* serializes lsbd_ioctl() */
+
+	unsigned int lsectors;		/* number of logical sectors */
+
+	unsigned long long epoch;	/* epoch of the last written block */
+	unsigned int cur_block;		/* last written block */
+	unsigned int clean_block;	/* last block handled by the cleaner */
+
+	lsbd_lcache_t *lcache;		/* logical -> physical sector map */
+	unsigned int cur_lcache;	/* first lcache entry of the next chunk */
+
+	spinlock_t wqueue_lock;		/* protects wqueue[] and wqueue_len */
+	struct list_head wqueue[LSBD_QUEUE_COUNT];
+	wait_queue_head_t wqueue_wait;	/* lsbd_thread sleeps here */
+	unsigned int wqueue_len;	/* total number of queued requests */
+	struct semaphore req_sem;	/* limits queued non-move requests */
+
+	int stop;			/* set to ask lsbd_thread to exit */
+	struct task_struct *tsk;	/* lsbd_thread task, NULL if none */
+	struct completion *event;	/* thread start/exit notification */
+};
+
+/* one queued write (or cleaner move) of a single sector */
+struct lsbd_request {
+	struct buffer_head *bh;		/* buffer holding the sector data */
+	struct list_head list;		/* link in lsbd.wqueue[] */
+	unsigned int sector;		/* logical sector number */
+	int move;			/* non-zero for cleaner (move) requests */
+};
+
+#if 0
+#define lsbd_debug(c, fmt, ...) \
+	printk(KERN_DEBUG "lsbd%d: "fmt, (c)->id, ## __VA_ARGS__)
+#else
+#define lsbd_debug(c, fmt, ...) do { } while (0)
+#endif
+
+#define lsbd_info(c, fmt, ...)
\
+	printk(KERN_INFO "lsbd%d: "fmt, (c)->id, ## __VA_ARGS__)
+
+#define lsbd_error(c, fmt, ...) \
+	printk(KERN_ERR "lsbd%d: "fmt, (c)->id, ## __VA_ARGS__)
+
+static struct lsbd lsbd_dev[LSBD_MAX];
+
+/* per-minor tables, registered with the block layer in lsbd_init_module() */
+static int lsbd_sizes[LSBD_MAX << PART_BITS];
+static int lsbd_maxsect[LSBD_MAX << PART_BITS];
+static int lsbd_blksizes[LSBD_MAX << PART_BITS];
+static int lsbd_hardsect_size[LSBD_MAX << PART_BITS];
+static struct hd_struct lsbd_struct[LSBD_MAX << PART_BITS];
+
+static int lsbd_thread(void *data);
+
+/* block_prev - block preceding @block, wrapping from 0 to the last block */
+unsigned int block_prev(struct lsbd *p, unsigned int block)
+{
+	return block ? block - 1 : p->blocks - 1;
+}
+
+/* block_diff - number of blocks from @b forward to @a, modulo p->blocks */
+unsigned int block_diff(struct lsbd *p, unsigned int a, unsigned int b)
+{
+	if (a >= b)
+		return a - b;
+	return p->blocks + a - b;
+}
+
+/*
+ * lsbd_open - validate the device number and take a module reference
+ */
+static int lsbd_open(struct inode *inode, struct file *file)
+{
+	int minor;
+	int dev;
+
+	if (!inode)
+		return -EINVAL;
+	if (MAJOR(inode->i_rdev) != MAJOR_NR) {
+		printk(KERN_WARNING "lsbd_open: pseudo-major != %d\n",
+				MAJOR_NR);
+		return -ENODEV;
+	}
+	minor = MINOR(inode->i_rdev);
+	dev = minor >> PART_BITS;
+	if (dev >= LSBD_MAX)
+		return -ENODEV;
+	MOD_INC_USE_COUNT;
+	return 0;
+}
+
+/*
+ * lsbd_release - drop the module reference taken by lsbd_open()
+ *
+ * Performs the same checks as lsbd_open() so that the use count is only
+ * decremented for opens that incremented it.
+ */
+static int lsbd_release(struct inode *inode, struct file *file)
+{
+	int minor;
+	int dev;
+
+	if (!inode)
+		return 0;
+	if (MAJOR(inode->i_rdev) != MAJOR_NR) {
+		printk(KERN_WARNING "lsbd_release: pseudo-major != %d\n",
+				MAJOR_NR);
+		return 0;
+	}
+	minor = MINOR(inode->i_rdev);
+	dev = minor >> PART_BITS;
+	if (dev >= LSBD_MAX)
+		return 0;
+	MOD_DEC_USE_COUNT;
+	return 0;
+}
+
+/* lsbd_block_size - size of one block in bytes */
+unsigned int lsbd_block_size(struct lsbd *p)
+{
+	return p->sectors_per_block * p->sector_size;
+}
+
+/*
+ * lsbd_bread - read the first sector of @block from the lower-level device
+ *
+ * NOTE(review): the sector index passed to bread() is presumably counted in
+ * units of @size, so callers should pass p->sector_size - confirm.
+ */
+struct buffer_head *lsbd_bread(struct lsbd *p, int block, int size)
+{
+	return bread(p->dev, block * p->sectors_per_block, size);
+}
+
+/*
+ * lsbd_sread - read data sector @sector of @block
+ *
+ * The "+ 1" skips the first sector of the block, which lsbd_write_block()
+ * uses for the block header.
+ */
+struct buffer_head *lsbd_sread(struct lsbd *p, int block, unsigned int sector,
+		int size)
+{
+	return bread(p->dev, block * p->sectors_per_block + sector + 1, size);
+}
+
+static u32 lsbd_checksum(const void *buf, size_t
count)
+{
+	const u32 *b = buf;
+	size_t i;
+	u32 csum = 0x12345678;
+
+	/* the buffer is summed as whole 32-bit words */
+	BUG_ON(count & 3);
+	for (i = 0; i < count / 4; i++)
+		csum += b[i];
+	return csum;
+}
+
+/*
+ * lsbd_block_verify_ok - check the checksum of a block header
+ *
+ * The checksum covers the whole header except its last 4 bytes, which hold
+ * the stored (big-endian) checksum. Returns non-zero when they match.
+ */
+static int lsbd_block_verify_ok(struct lsbd *p, const struct lsbd_block *b)
+{
+	u32 csum = lsbd_checksum(b, sizeof(*b) - 4);
+
+	lsbd_debug(p, "csum: %08x %08x\n", csum, be32_to_cpu(b->checksum));
+	return (be32_to_cpu(b->checksum) == csum);
+}
+
+/*
+ * lsbd_block_commit - store the checksum of a finished block header
+ */
+static int lsbd_block_commit(struct lsbd *p, struct lsbd_block *b)
+{
+	u32 csum = lsbd_checksum(b, sizeof(*b) - 4);
+
+	lsbd_debug(p, "commit: %08x\n", csum);
+	b->checksum = cpu_to_be32(csum);
+	return 0;
+}
+
+/* compile-time check: the on-disk block header must be exactly 1024 bytes */
+struct __block_test {
+	int a[(sizeof(struct lsbd_block) == 1024) ? 1 : -1];
+};
+
+/*
+ * lsbd_find_current_block - find current block
+ *
+ * Finds the first block with a valid header and then searches the rest of
+ * the device for the block with the highest epoch. The result is stored in
+ * p->epoch, p->cur_block and p->clean_block.
+ *
+ * Returns 0 on success or -EINVAL when no valid header exists.
+ */
+static int lsbd_find_current_block(struct lsbd *p)
+{
+	struct buffer_head *bh;
+	struct lsbd_block *b;
+	unsigned int i;
+	unsigned int block = 0;
+	unsigned int base = 0;
+	unsigned int blocks;
+	unsigned long long epoch = 0;
+	unsigned long long e = 0;
+	unsigned int reads = 0;
+
+	/* read first block */
+	for (i = 0; i < p->blocks; i++) {
+		reads++;
+		/*
+		 * Always read whole sectors: bread() addresses the device
+		 * in units of the requested size, so asking for sizeof(*b)
+		 * bytes would read from a different disk offset than every
+		 * other lsbd_bread() caller.
+		 */
+		bh = lsbd_bread(p, i, p->sector_size);
+		if (bh == NULL)
+			continue;
+
+		b = (void *) bh->b_data;
+		if (!lsbd_block_verify_ok(p, b)) {
+			brelse(bh);
+			continue;
+		}
+
+		base = i;
+		/* on-disk fields are big-endian */
+		epoch = be64_to_cpu(b->epoch);
+		brelse(bh);
+		break;
+	}
+	if (i == p->blocks) {
+		lsbd_error(p, "metadata not found, aborting\n");
+		return -EINVAL;
+	}
+
+	/* search [base, base + blocks) for the highest epoch */
+	blocks = p->blocks - base;
+	block = base + blocks / 2;
+	while (blocks > 1) {
+		lsbd_debug(p, "block: %d, blocks: %d\n", block, blocks);
+		reads++;
+		bh = lsbd_bread(p, block, p->sector_size);
+		if (bh == NULL) {
+			/* unreadable: try the next block of the range */
+			lsbd_debug(p, "block %d: I/O\n", block);
+			block++;
+			if (block >= base + blocks) {
+				blocks = blocks / 2;
+				block = base + blocks / 2;
+			}
+			continue;
+		}
+		b = (void *) bh->b_data;
+
+		if (!lsbd_block_verify_ok(p, b)) {
+			/* invalid header: try the next block of the range */
+			lsbd_debug(p, "block %d: verify\n", block);
+			brelse(bh);
+			block++;
+			if (block >= base + blocks)
{ + blocks = blocks / 2; + block = base + blocks / 2; + } + continue; + } + + e = be64_to_cpu(b->epoch); + lsbd_debug(p, "block: %d, epoch %Ld, max %Ld\n", block, + e, epoch); + if (e >= epoch) { + epoch = e; + blocks = base + blocks - block; + base = block; + } else { + blocks = blocks / 2; + } + block = base + blocks / 2; + brelse(bh); + } + lsbd_debug(p, "block: %d, blocks: %d\n", block, blocks); + + p->epoch = epoch; + p->cur_block = block; + p->clean_block = block; + lsbd_info(p, "recovery finished: block %d, epoch %Ld, %d io\n", + p->cur_block, p->epoch, reads); + return 0; +} + +/* + * lsbd_read_lcache - create lcache mapping + */ +int lsbd_read_lcache(struct lsbd *p) +{ + struct buffer_head *bh; + struct lsbd_block *b; + unsigned int block; + unsigned int readed = 0; + unsigned int sector_id; + unsigned int i = 0; + unsigned int sectors; + unsigned int sectors_max; + struct lsbd_sect *sects; + unsigned int readed_blocks = 0; + unsigned int lcache_offset; + unsigned int lcache_chunk; + unsigned int lcache_base; + lsbd_lcache_t *cache; + int first = 1; + + /* to protect against OOM allocate at most 64 MB of lcache */ + if (p->lsectors > 16 * 1024 * 1024) { + lsbd_error(p, "lcache: too high number of sectors %d\n", + p->lsectors); + return -EINVAL; + } + p->lcache = vmalloc(p->lsectors * sizeof(p->lcache[0])); + if (p->lcache == NULL) + return -ENOMEM; + for (i = 0; i < p->lsectors; i++) + p->lcache[i] = LSBD_SECT_INVALID; + + sectors_max = (p->sector_size - sizeof(*b)) / sizeof(sects[0]); + + block = p->cur_block; + do { + readed_blocks++; + bh = lsbd_bread(p, block, p->sector_size); + if (bh == NULL) + continue; + b = (void *) bh->b_data; + + if (!lsbd_block_verify_ok(p, b)) { + brelse(bh); + continue; + } + + sects = (void *) (bh->b_data + be32_to_cpu(b->ptab_offset)); + sectors = be32_to_cpu(b->sectors_per_block) - 1; + if (sectors > sectors_max) + sectors = sectors_max; + + for (i = 0; i < sectors; i++) { + sector_id = be32_to_cpu(sects[i].id); + + if 
(sector_id >= p->lsectors) + continue; + + if (p->lcache[sector_id] == LSBD_SECT_INVALID) { + p->lcache[sector_id] = block + * p->sectors_per_block + i + 1; + readed++; + } + } + + lcache_offset = be32_to_cpu(b->lcache_offset); + if (lcache_offset > p->sector_size) { + brelse(bh); + continue; + } + + lcache_chunk = be32_to_cpu(b->lcache_chunk); + if (lcache_chunk > (p->sector_size - lcache_offset) + / sizeof(*cache)) { + lcache_chunk = (p->sector_size - lcache_offset) + / sizeof(*cache); + } + lcache_base = be32_to_cpu(b->lcache_base); + cache = (void *) (bh->b_data + lcache_offset); + + if (be32_to_cpu(b->lcache_checksum) != lsbd_checksum(cache, + lcache_chunk * sizeof(*cache))) { + brelse(bh); + continue; + } + + if (first) { + p->cur_lcache = lcache_base + lcache_chunk; + first = 0; + } + + for (i = 0; i < lcache_chunk; i++) { + unsigned long long sector_id = lcache_base + i; + + if (sector_id >= p->lsectors) + break; + if (p->lcache[sector_id] == LSBD_SECT_INVALID) { + p->lcache[sector_id] = be32_to_cpu(cache[i]); + readed++; + } + } + brelse(bh); + } while ((readed < p->lsectors) + && ((block = block_prev(p, block)) != p->cur_block)); + + lsbd_info(p, "lcache: loaded using %u blocks, %u/%u\n", + readed_blocks, readed, p->lsectors); + return 0; +} + +static int lsbd_load_params(struct lsbd *p) +{ + struct buffer_head *bh; + struct lsbd_block *b; + unsigned int block = 0; + unsigned int i; + unsigned int blocks; + unsigned int sector_size; + unsigned int sectors_per_block; + + block = p->cur_block; + i = p->blocks; + if (i > 100) + i = 100; + + for (; i > 0; i--, block = block_prev(p, block)) { + bh = lsbd_bread(p, i, sizeof(*b)); + if (bh == NULL) + continue; + + b = (void *) bh->b_data; + if (!lsbd_block_verify_ok(p, b)) { + brelse(bh); + continue; + } + + sector_size = be32_to_cpu(b->sector_size); + sectors_per_block = be32_to_cpu(b->sectors_per_block); + blocks = be32_to_cpu(b->blocks); + p->lsectors = be32_to_cpu(b->lsectors); + if (!p->lsectors) + 
p->lsectors = p->blocks; + + brelse(bh); + + if ((sector_size & (sector_size - 1)) || (sector_size < 4096)) { + lsbd_error(p, "sector_size=%d must be a power of two " + "and larger than 4 KiB\n", + sector_size); + return -EINVAL; + } + + if (sector_size > p->sector_size) { + lsbd_info(p, "changing sector size to %d\n", + sector_size); + p->sector_size = sector_size; +// p->blks_per_sector = p->sector_size / p->blksize; + } + + if (blocks < p->blocks) { + p->blocks = blocks; + lsbd_info(p, "shrinking device to %d blocks\n", + p->blocks); + } else if (blocks > p->blocks) { + lsbd_error(p, "refusing to start partial image %d " + "avail, %d total\n", + p->blocks, blocks); + return -EINVAL; + } + + lsbd_info(p, "logical size %d KiB, physical %d KiB\n", + p->lsectors * (p->sector_size >> 10), + p->blocks * p->sectors_per_block + * (p->sector_size >> 10)); + lsbd_sizes[p->id] = p->lsectors * (p->sector_size >> 10); + return 0; + } + return 1; +} + +static void lsbd_stop(struct lsbd *p) +{ + struct completion event; + + lsbd_info(p, "stopping lsbd task (pid = %d)\n", p->tsk->pid); + init_completion(&event); + p->event = &event; + p->stop = 1; + send_sig(SIGKILL, p->tsk, 1); + wait_for_completion(&event); + p->tsk = NULL; +} + +static int lsbd_mount(struct lsbd *p) +{ + int blksize; + int size; + int ret; + struct completion event; + + if (p->tsk != NULL) + lsbd_stop(p); + + p->stop = 0; + blksize = 0; + if (blksize_size[MAJOR(p->dev)]) + blksize = blksize_size[MAJOR(p->dev)][MINOR(p->dev)]; + if (!blksize) + blksize = 1024; + + p->blksize = blksize; + p->sector_size = blksize; + if (p->sector_size) + p->sector_size = 4096; +// p->blks_per_sector = p->sector_size / blksize; + lsbd_debug(p, "assuming %d sector size\n", p->sector_size); + + p->sectors_per_block = 16; + size = 0; + if (blk_size[MAJOR(p->dev)]) + size = blk_size[MAJOR(p->dev)][MINOR(p->dev)]; + p->blocks = size / (lsbd_block_size(p) >> 10); + lsbd_info(p, "%d KiB sectors, %d KiB blocks, %d blocks\n", + 
p->sector_size >> 10, lsbd_block_size(p) >> 10,
+			p->blocks);
+	if (!p->blocks) {
+		lsbd_error(p, "aborting mount on empty device\n");
+		return -EINVAL;
+	}
+
+	ret = lsbd_find_current_block(p);
+	if (ret)
+		return ret;
+	ret = lsbd_load_params(p);
+	if (ret)
+		/* never hand a positive value back to the ioctl caller */
+		return (ret < 0) ? ret : -EINVAL;
+	ret = lsbd_read_lcache(p);
+	if (ret) {
+		/*
+		 * Without a sector map no request can be served. The request
+		 * paths reject every sector >= p->lsectors, so this keeps
+		 * them away from p->lcache.
+		 */
+		p->lsectors = 0;
+		return ret;
+	}
+
+	init_completion(&event);
+	p->event = &event;
+	ret = kernel_thread(lsbd_thread, p, 0);
+	if (ret < 0)
+		return ret;
+	/* wait until the thread has registered itself in p->tsk */
+	wait_for_completion(&event);
+	return 0;
+}
+
+/*
+ * lsbd_set_dev - select the lower-level device and mount it
+ */
+static int lsbd_set_dev(struct lsbd *p, int major, int minor)
+{
+	kdev_t dev = MKDEV(major, minor);
+
+	spin_lock_irq(&p->lock);
+	p->dev = dev;
+	spin_unlock_irq(&p->lock);
+
+	return lsbd_mount(p);
+}
+
+/*
+ * lsbd_ioctl - block device ioctl handler
+ *
+ * LSBD_SET_DEV takes the lower-level device encoded in @arg as
+ * (major << 20) | minor. The size and block size requests are answered
+ * from lsbd_sizes[] or forwarded to blk_ioctl(). All commands are
+ * serialized by p->mutex.
+ *
+ * Returns 0 or a negative error code (-EINTR if interrupted while waiting
+ * for the mutex, -EINVAL for unknown commands).
+ */
+static int lsbd_ioctl(struct inode *inode, struct file *file,
+		unsigned int cmd, unsigned long arg)
+{
+	struct lsbd *p;
+	int minor;
+	int dev;
+	int err;
+
+	if (!inode)
+		return -EINVAL;
+	if (MAJOR(inode->i_rdev) != MAJOR_NR) {
+		printk(KERN_WARNING "lsbd_ioctl: pseudo-major != %d\n",
+				MAJOR_NR);
+		return -ENODEV;
+	}
+	minor = MINOR(inode->i_rdev);
+	dev = minor >> PART_BITS;
+	if (dev >= LSBD_MAX)
+		return -ENODEV;
+
+	p = &lsbd_dev[dev];
+	/* a signal may interrupt the wait: then the mutex is NOT held */
+	if (down_interruptible(&p->mutex))
+		return -EINTR;
+	switch (cmd) {
+	case LSBD_SET_DEV: {
+		unsigned int major = arg >> 20;
+		unsigned int minor = arg & 0xfffff;
+
+		err = lsbd_set_dev(p, major, minor);
+		break;
+	}
+	case BLKGETSIZE:
+		/* lsbd_sizes[] is in KiB, BLKGETSIZE in 512-byte sectors */
+		err = put_user((unsigned long) lsbd_sizes[minor] << 1,
+				(unsigned long *) arg);
+		break;
+	case BLKGETSIZE64:
+		err = put_user((u64) lsbd_sizes[minor] << 10, (u64 *) arg);
+		break;
+	case BLKBSZGET:
+	case BLKBSZSET:
+	case BLKSSZGET:
+		err = blk_ioctl(inode->i_rdev, cmd, arg);
+		break;
+	default:
+		/* do not return directly, the mutex must be released */
+		err = -EINVAL;
+		break;
+	}
+	up(&p->mutex);
+	return err;
+}
+
+static struct block_device_operations lsbd_fops = {
+	.owner = THIS_MODULE,
+	.open = lsbd_open,
+	.release = lsbd_release,
+	.ioctl = lsbd_ioctl,
+};
+
+static struct gendisk lsbd_gendisk = {
+	.major = MAJOR_NR,
+	.major_name = "lsbd",
+	.minor_shift = PART_BITS,
+	.max_p = 1 << PART_BITS,
+
.part = lsbd_struct,
+	.sizes = lsbd_sizes,
+	.fops = &lsbd_fops,
+};
+
+/*
+ * lsbd_queue_bh - queue a sector for writing by lsbd_thread
+ * @p: LSBD state
+ * @bh: buffer with the sector data
+ * @sector: logical sector number, must be below p->lsectors
+ * @prio: queue index (LSBD_QUEUE_*)
+ *
+ * Requests other than LSBD_QUEUE_MOVE take p->req_sem, which limits the
+ * number of such requests that may be queued at once; it is released again
+ * by lsbd_write_block(). May sleep.
+ *
+ * Returns 0 on success, -EINVAL for a bad queue or sector, -ENOMEM when the
+ * request cannot be allocated.
+ */
+static int lsbd_queue_bh(struct lsbd *p, struct buffer_head *bh,
+		unsigned int sector, unsigned int prio)
+{
+	unsigned long flags;
+	struct lsbd_request *r;
+	int need_wakeup = 0;
+	int move = (prio == LSBD_QUEUE_MOVE);
+
+	if (prio >= LSBD_QUEUE_COUNT)
+		return -EINVAL;
+
+	if (sector >= p->lsectors)
+		return -EINVAL;
+
+	if (!move)
+		down(&p->req_sem);
+	/* queue non-read requests to daemon */
+	r = kmalloc(sizeof(*r), GFP_KERNEL);
+	if (r == NULL) {
+		if (!move)
+			up(&p->req_sem);
+		return -ENOMEM;
+	}
+	r->bh = bh;
+	r->sector = sector;
+	r->move = move;
+
+	spin_lock_irqsave(&p->wqueue_lock, flags);
+	list_add_tail(&r->list, &p->wqueue[prio]);
+	/* the thread only has to be woken when the queues were empty */
+	if (!p->wqueue_len)
+		need_wakeup = 1;
+	p->wqueue_len++;
+	spin_unlock_irqrestore(&p->wqueue_lock, flags);
+	if (need_wakeup)
+		wake_up_interruptible(&p->wqueue_wait);
+	return 0;
+}
+
+/*
+ * lsbd_make_request - remap or queue a buffer submitted to an lsbd device
+ *
+ * Reads are remapped through p->lcache to the lower-level device and handed
+ * back to the block layer (return 1). Writes are queued for lsbd_thread.
+ * In all other cases the buffer is completed here (return 0).
+ */
+static int lsbd_make_request(request_queue_t *q, int rw,
+		struct buffer_head *bh)
+{
+	struct lsbd *p = &lsbd_dev[MINOR(bh->b_rdev) >> PART_BITS];
+	unsigned int lsector = LSBD_SECT_INVALID;
+
+	if (rw == READA || rw == READ) {
+		unsigned int sector;
+
+		/* 512-byte sectors -> 4 KiB logical sectors */
+		lsector = bh->b_rsector >> 3;
+		lsbd_debug(p, "reading sector %d (%ld)\n", lsector,
+				bh->b_rsector);
+		lsbd_debug(p, "blocknr %ld, size %hd\n", bh->b_blocknr,
+				bh->b_size);
+		if (lsector >= p->lsectors) {
+			buffer_IO_error(bh);
+			return 0;
+		}
+		sector = p->lcache[lsector];
+		lsbd_debug(p, "mapped to sector %d\n", sector);
+		if (sector == LSBD_SECT_INVALID) {
+			buffer_IO_error(bh);
+			return 0;
+		}
+		if (sector == LSBD_SECT_ZERO) {
+			char *buf;
+			buf = bh_kmap(bh);
+			memset(buf, 0, bh->b_size);
+			/* bh_kmap() must be paired with bh_kunmap() */
+			bh_kunmap(bh);
+			bh->b_end_io(bh, 1);
+			return 0;
+		}
+		/*
+		 * Use the value that was checked above: lsbd_thread may
+		 * update the lcache entry at any time.
+		 */
+		bh->b_rsector = (unsigned long) sector << 3;
+		lsbd_debug(p, "mapped to physical %ld\n", bh->b_rsector);
+
+		bh->b_rdev = p->dev;
+		/*
+		 * Let the main block layer submit the IO and resolve recursion:
+		 */
+		return 1;
+	}
+
+	if (rw == WRITE) {
+		lsector
= bh->b_rsector >> 3; + if (lsbd_queue_bh(p, bh, lsector, LSBD_QUEUE_NORMAL)) + buffer_IO_error(bh); + return 0; + } + buffer_IO_error(bh); + return 0; +} + +void lsbd_clean_block(struct lsbd *p) +{ + unsigned long block = p->clean_block; + struct buffer_head *bh; + struct buffer_head *rbh; + struct lsbd_block *b; + int ret; + struct lsbd_sect *sects; + unsigned long sectors; + unsigned long sectors_max; + unsigned long i; + unsigned int ptab_offset; + + block++; + if (block >= p->blocks) + block = 0; + +#if 0 + /* + * do not clean mirror blocks + */ + if (p->mirror_offset && (block & 1)) + goto out; +#endif + + bh = lsbd_bread(p, block, p->sector_size); + if (bh == NULL) + goto out; + b = (void *) bh->b_data; + + ret = lsbd_block_verify_ok(p, b); + if (!ret) { + brelse(bh); + goto out; + } + + ptab_offset = be32_to_cpu(b->ptab_offset); + if (ptab_offset >= p->sector_size) { + brelse(bh); + goto out; + } + sectors_max = (p->sector_size - ptab_offset) / sizeof(sects[0]); + sectors = be32_to_cpu(b->sectors_per_block) - 1; + if (sectors > sectors_max) + sectors = sectors_max; + + sects = (void *) (bh->b_data + b->ptab_offset); + + for (i = 0; i < sectors; i++) { + unsigned long long sector_id = be64_to_cpu(sects[i].id); + unsigned long s = block * p->sectors_per_block + i; + + if (sector_id >= p->lsectors) + continue; + if (p->lcache[sector_id] == s) { + rbh = lsbd_sread(p, block, i, p->sector_size); + if (rbh == NULL) + continue; + + lock_buffer(rbh); + ret = lsbd_queue_bh(p, rbh, s, LSBD_QUEUE_MOVE); + if (ret) + brelse(rbh); + } + } + brelse(bh); + +out: + p->clean_block = block; +} + +static int initialize_block(struct lsbd *p, struct lsbd_block *b) +{ + unsigned int i; + + b->magic = cpu_to_be64(LSBD_BLOCK_MAGIC); + b->version = cpu_to_be32(0); + b->revision = cpu_to_be32(0); + b->ctime = cpu_to_be64(0); + b->mtime = cpu_to_be64(0); + + b->epoch = cpu_to_be64(0); + + /* FIXME: store age also in other blocks? 
*/
+	b->age = cpu_to_be64(0);
+
+	b->sector_size = cpu_to_be32(p->sector_size);
+	b->sectors_per_block = cpu_to_be32(p->sectors_per_block);
+
+	b->blocks = cpu_to_be32(p->blocks);
+	b->lsectors = cpu_to_be32(p->lsectors);
+
+	/* all prev_block pointers are written as "unset" (all ones) */
+	for (i = 0; i < 16; i++)
+		b->prev_block[i] = cpu_to_be32(~0);
+	return 0;
+}
+
+/*
+ * lsbd_write_lcache - store the next chunk of the lcache in a block header
+ * @p: LSBD state
+ * @bp: buffer of the header sector; the chunk is placed at offset 0x800
+ *
+ * Every written block carries a chunk of the logical -> physical map,
+ * starting at p->cur_lcache, which then advances by one chunk and wraps
+ * to 0 once it reaches p->lsectors.
+ */
+static int lsbd_write_lcache(struct lsbd *p, void *bp)
+{
+	struct lsbd_block *b = bp;
+	unsigned int chunk;
+	unsigned int i;
+	lsbd_lcache_t *cache;
+
+	if (p->cur_lcache >= p->lsectors)
+		p->cur_lcache = 0;
+
+	/* the chunk uses the rest of the sector after offset 0x800 */
+	chunk = (p->sector_size - 0x800) / sizeof(*cache);
+	b->lcache_offset = cpu_to_be32(0x800);
+	b->lcache_base = cpu_to_be32(p->cur_lcache);
+	b->lcache_chunk = cpu_to_be32(chunk);
+	cache = (void *) ((char *) bp + be32_to_cpu(b->lcache_offset));
+
+	for (i = 0; i < chunk; i++) {
+		unsigned long long sector_id = p->cur_lcache + i;
+
+		/* entries past the end of the map are left untouched */
+		if (sector_id >= p->lsectors)
+			break;
+		cache[i] = cpu_to_be32(p->lcache[sector_id]);
+	}
+	/* the checksum always covers the whole chunk */
+	b->lcache_checksum = cpu_to_be32(lsbd_checksum(cache,
+				chunk * sizeof(*cache)));
+
+	p->cur_lcache += chunk;
+	return 0;
+}
+
+/**
+ * lsbd_request_dequeue - dequeue request
+ * @p: LSBD state
+ *
+ * Removes and returns the first request of the first non-empty queue
+ * (queues are scanned in index order), or NULL when all queues are empty.
+ */
+static struct lsbd_request *lsbd_request_dequeue(struct lsbd *p)
+{
+	struct lsbd_request *r = NULL;
+	unsigned int i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&p->wqueue_lock, flags);
+	for (i = 0; i < LSBD_QUEUE_COUNT; i++) {
+		if (!list_empty(&p->wqueue[i])) {
+			r = list_entry(p->wqueue[i].next, struct lsbd_request,
+					list);
+			list_del(&r->list);
+			p->wqueue_len--;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&p->wqueue_lock, flags);
+	return r;
+}
+
+/*
+ * lsbd_end_buffer_io_sync - I/O completion handler set by lsbd_write_block()
+ *
+ * Records the I/O result in the uptodate flag, unlocks the buffer and drops
+ * one reference.
+ */
+static void lsbd_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+	mark_buffer_uptodate(bh, uptodate);
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
+/*
+ * lsbd_write_block - write one block: header plus queued sectors
+ */
+int lsbd_write_block(struct lsbd *p)
+{
+	struct buffer_head **bh;
+	struct buffer_head **wbh;
+	struct lsbd_block *b;
+	struct lsbd_sect *sects;
+	struct lsbd_request *r;
+	unsigned int base_sector;
+	unsigned int i;
+
+
p->cur_block++; + if (p->cur_block >= p->blocks) + p->cur_block = 0; + + lsbd_debug(p, "writting block %d\n", p->cur_block); + + bh = kmalloc(p->sectors_per_block * sizeof(*bh), GFP_KERNEL); + if (bh == NULL) + return -ENOMEM; + + wbh = kmalloc(p->sectors_per_block * sizeof(*wbh), GFP_KERNEL); + if (bh == NULL) { + kfree(bh); + return -ENOMEM; + } + + base_sector = p->cur_block * p->sectors_per_block; + for (i = 0; i < p->sectors_per_block; i++) { + lsbd_debug(p, "sector: %d, block: %d\n", base_sector + i, + (base_sector + i)); + bh[i] = getblk(p->dev, base_sector + i, p->sector_size); + BUG_ON(bh[i] == NULL); + lock_buffer(bh[i]); + bh[i]->b_end_io = lsbd_end_buffer_io_sync; + } + + memset(bh[0]->b_data, 0, p->sector_size); + b = (void *) bh[0]->b_data; + initialize_block(p, b); + + p->epoch++; + b->age = cpu_to_be64(be64_to_cpu(b->age) + 1); + b->epoch = cpu_to_be64(p->epoch); + + b->ptab_offset = cpu_to_be32(0x400); + + sects = (void *)(bh[0]->b_data + be32_to_cpu(b->ptab_offset)); + + wbh[0] = NULL; + for (i = 0; i < p->sectors_per_block - 1; i++) { + r = lsbd_request_dequeue(p); + if (r != NULL) { + int move; + + BUG_ON(r->sector >= p->lsectors); + sects[i].id = cpu_to_be32(r->sector); + p->lcache[r->sector] = p->cur_block + * p->sectors_per_block + i + 1; + memcpy(bh[i + 1]->b_data, r->bh->b_data, + p->sector_size); + move = r->move; + if (move) { + unlock_buffer(r->bh); + brelse(r->bh); + r->bh = NULL; + } + wbh[i + 1] = r->bh; + kfree(r); + if (!move) + up(&p->req_sem); + } else { + memset(bh[i + 1]->b_data, 0, p->sector_size); + sects[i].id = cpu_to_be32(LSBD_SECT_INVALID); + wbh[i + 1] = NULL; + } + } + b->ptab_checksum = cpu_to_be32(lsbd_checksum(bh[0]->b_data + + be32_to_cpu(b->ptab_offset), + (p->sectors_per_block - 1) + * sizeof(struct lsbd_sect))); + + lsbd_write_lcache(p, b); + lsbd_block_commit(p, b); + for (i = 0; i < p->sectors_per_block; i++) { + mark_buffer_uptodate(bh[i], 1); + unlock_buffer(bh[i]); + mark_buffer_dirty(bh[i]); + } + 
ll_rw_block(WRITE, p->sectors_per_block, bh); + + lsbd_debug(p, "waiting for write\n"); + for (i = 0; i < p->sectors_per_block; i++) { + wait_on_buffer(bh[i]); + brelse(bh[i]); + + if (wbh[i] != NULL) + wbh[i]->b_end_io(wbh[i], 1); + } + lsbd_debug(p, "done\n"); + kfree(wbh); + kfree(bh); + return 0; +} + +static int lsbd_thread(void *data) +{ + struct lsbd *p = data; + + lock_kernel(); + daemonize(); + strcpy(current->comm, "lsbd_thread"); + current->exit_signal = SIGCHLD; + siginitsetinv(¤t->blocked, sigmask(SIGKILL)); + spin_lock(¤t->sigmask_lock); + flush_signals(current); + spin_unlock(¤t->sigmask_lock); + p->tsk = current; + + /* + * lsbd is a 'system-thread', it's priority should be very + * high. + */ + current->policy = SCHED_OTHER; + current->nice = -20; + unlock_kernel(); + + complete(p->event); + for (;;) { + DECLARE_WAITQUEUE(wait, current); + unsigned int len; + + add_wait_queue(&p->wqueue_wait, &wait); + set_task_state(current, TASK_INTERRUPTIBLE); + spin_lock_irq(&p->wqueue_lock); + len = p->wqueue_len; + spin_unlock_irq(&p->wqueue_lock); + if (!len && (block_diff(p, p->clean_block, p->cur_block) >= 16)) + schedule(); + current->state = TASK_RUNNING; + remove_wait_queue(&p->wqueue_wait, &wait); + + lsbd_debug(p, "lsbd_thread: reqs %d\n", len); + + if (signal_pending(current)) { + spin_lock(¤t->sigmask_lock); + flush_signals(current); + spin_unlock(¤t->sigmask_lock); + } + + if (p->stop) + break; + + if (block_diff(p, p->clean_block, p->cur_block) < 16) + lsbd_clean_block(p); + + while (len && (block_diff(p, p->clean_block, p->cur_block) + > 1)) { + lsbd_write_block(p); + spin_lock_irq(&p->wqueue_lock); + len = p->wqueue_len; + spin_unlock_irq(&p->wqueue_lock); + } + } + complete(p->event); + return 0; +} + +int __init lsbd_init_module(void) +{ + unsigned int i; + + for (i = 0; i < LSBD_MAX; i++) { + struct lsbd *p = &lsbd_dev[i]; + unsigned int j; + + for (j = 0; j < LSBD_QUEUE_COUNT; j++) + INIT_LIST_HEAD(&p->wqueue[j]); + + p->id = i; + 
spin_lock_init(&p->lock); + spin_lock_init(&p->wqueue_lock); + p->wqueue_len = 0; + init_waitqueue_head(&p->wqueue_wait); + init_MUTEX(&p->mutex); + sema_init(&p->req_sem, 512); + p->tsk = NULL; + } + + if (devfs_register_blkdev(MAJOR_NR, "lsbd", &lsbd_fops)) { + printk(KERN_WARNING "Unable to get major number %d for lsbd" + " device\n", MAJOR_NR); + return -EIO; + } + + blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), lsbd_make_request); + read_ahead[MAJOR_NR] = 4; /* 16 kB */ + + add_gendisk(&lsbd_gendisk); + + /* + * TODO: add support for other block sizes + */ + for (i = 0; i < (LSBD_MAX << PART_BITS); i++) + lsbd_blksizes[i] = 4096; + blksize_size[MAJOR_NR] = lsbd_blksizes; + + for (i = 0; i < (LSBD_MAX << PART_BITS); i++) + lsbd_sizes[i] = 0; + blk_size[MAJOR_NR] = lsbd_sizes; + + /* + * TODO: add support for other block sizes + */ + for (i = 0; i < (LSBD_MAX << PART_BITS); i++) + lsbd_hardsect_size[i] = 4096; + hardsect_size[MAJOR_NR] = lsbd_hardsect_size; + + /* + * TODO: add support for other block sizes + */ + for (i = 0; i < (LSBD_MAX << PART_BITS); i++) + lsbd_maxsect[i] = 0; + max_sectors[MAJOR_NR] = lsbd_maxsect; + + for (i = 0; i < LSBD_MAX; i++) { + register_disk(&lsbd_gendisk, MKDEV(MAJOR_NR, i << PART_BITS), + 1 << PART_BITS, &lsbd_fops, 0); + } + return 0; +} + +void lsbd_cleanup_module(void) +{ + blksize_size[MAJOR_NR] = NULL; + blk_size[MAJOR_NR] = NULL; + hardsect_size[MAJOR_NR] = NULL; + read_ahead[MAJOR_NR] = 0; + del_gendisk(&lsbd_gendisk); + if (devfs_unregister_blkdev(MAJOR_NR, "lsbd")) + printk(KERN_WARNING "lsbd: cannot unregister blkdev\n"); +} + +#if MODULE +module_init(lsbd_init_module); +module_exit(lsbd_cleanup_module); + +MODULE_DESCRIPTION("Log-Structured Block Device"); +MODULE_AUTHOR("Krzysztof Mazur"); +MODULE_LICENSE("GPL"); +#endif diff --git a/include/linux/lsbd.h b/include/linux/lsbd.h new file mode 100644 index 0000000..7faa7ca --- /dev/null +++ b/include/linux/lsbd.h @@ -0,0 +1,87 @@ +/* + * ar3c-kernel - lsbd.h + * 
Copyright (C) 2010 Krzysztof Mazur
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef __LINUX_LSBD_H
+#define __LINUX_LSBD_H
+
+/* NOTE(review): the header name of this #include is missing in this copy */
+#include
+
+/*
+ * LSBD_SET_DEV - select the lower-level device.
+ * The driver decodes the ioctl argument as (major << 20) | minor.
+ */
+#define LSBD_SET_DEV _IO('l', 0x0)
+
+#ifdef __KERNEL__
+#define LSBD_BLOCK_MAGIC 0x11991299AA1122FFULL
+/*
+ * On-disk block header, 1024 bytes, stored at the start of the first sector
+ * of every block. All fields are big-endian.
+ */
+struct lsbd_block {
+	u64 magic;		/* 0x00 block magic */
+	u32 version;		/* 0x08 block version */
+	u32 revision;		/* 0x0c block revision */
+	u64 ctime;		/* 0x10 lsbd device creation time */
+	u64 mtime;		/* 0x18 last modification time */
+	u64 epoch;		/* 0x20 unique block sequence number */
+	u64 age;		/* 0x28 number of writes of this block */
+
+	u32 sector_size;	/* 0x30 sector size in bytes */
+	u32 sectors_per_block;	/* 0x34 in sectors */
+	u32 blocks;		/* 0x38 number of blocks */
+	u32 lsectors;		/* 0x3c number of logical sectors */
+
+	/*
+	 * lcache access needs to read previously written logical blocks,
+	 * not physical. This is for fast access to those blocks.
+	 * The i-th pointer specifies the (1 << i)-th previous block.
+	 */
+	u32 prev_block[16];	/* 0x40 previous block table */
+
+	u32 ptab_offset;	/* 0x80 offset of the sector table in the block */
+	u32 mirror_offset;	/* 0x84 mirror offset */
+	u32 pad1;		/* 0x88 0 */
+	u32 ptab_checksum;	/* 0x8c sector table checksum */
+
+	/* lcache - logical -> physical mapping cache */
+	u32 lcache_offset;	/* 0x90 lcache location in block */
+	u32 lcache_base;	/* 0x94 first logical sector in this cache */
+	u32 lcache_chunk;	/* 0x98 number of entries per lcache */
+	u32 lcache_checksum;	/* 0x9c lcache content checksum */
+
+
+	u32 reserved[256 - 0xa0/4 - 1];	/* 0xa0 pads the header to 1024 bytes */
+
+	u32 checksum;		/* 0x3fc checksum of all preceding bytes */
+} __attribute__((packed));
+
+/* one entry of the sector table (located at ptab_offset), 32 bytes */
+struct lsbd_sect {
+	u64 epoch;		/* version of this sector */
+	u64 mtime;		/* last modification time */
+	u32 id;			/* sector id */
+	u32 age;		/* weighted update frequency */
+	u32 flags;
+	u32 data_checksum;	/* data checksum */
+} __attribute__((packed));
+
+/* special lcache values: no mapping / sector reads as zeros */
+#define LSBD_SECT_INVALID 0xffffffffUL
+#define LSBD_SECT_ZERO 0xfffffffeUL
+/*
+ * lcache is a table of physical mappings of logical sectors [lcache_base;
+ * lcache_base + lcache_chunk) to physical sectors.
+ */
+typedef u32 lsbd_lcache_t;
+#endif
+
+
+
+#endif
diff --git a/include/linux/major.h b/include/linux/major.h
index 290db4b..b6f5371 100644
--- a/include/linux/major.h
+++ b/include/linux/major.h
@@ -166,6 +166,8 @@
 #define IBM_TTY3270_MAJOR 227 /* Official allocations now */
 #define IBM_FS3270_MAJOR 228
 
+#define LSBD_MAJOR 240
+
 /*
  * Tests for SCSI devices.
  */
-- 
1.8.4.652.g0d6e0ce