/* * lsbd - lsbd.c * Copyright (C) 2010 Krzysztof Mazur * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software Fundation, * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #define _GNU_SOURCE 1 #include #include #include #include #include #include #include #include #include #ifdef HAVE_BYTESWAP_H #include #else #define bswap_16(x) \ ((((x) >> 8) & 0xff) | (((x) & 0xff) << 8)) #define bswap_32(x) \ ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) #define bswap_64(x) \ ((((x) & 0xff00000000000000ull) >> 56) \ | (((x) & 0x00ff000000000000ull) >> 40) \ | (((x) & 0x0000ff0000000000ull) >> 24) \ | (((x) & 0x000000ff00000000ull) >> 8) \ | (((x) & 0x00000000ff000000ull) << 8) \ | (((x) & 0x0000000000ff0000ull) << 24) \ | (((x) & 0x000000000000ff00ull) << 40) \ | (((x) & 0x00000000000000ffull) << 56)) #endif #include #define u8_to_cpu(x) (x) #define cpu_to_u8(x) (x) #if BYTE_ORDER == BIG_ENDIAN #define u16_to_cpu(x) (x) #define u32_to_cpu(x) (x) #define u64_to_cpu(x) (x) #define cpu_to_u16(x) (x) #define cpu_to_u32(x) (x) #define cpu_to_u64(x) (x) #elif BYTE_ORDER == LITTLE_ENDIAN #define u16_to_cpu(x) (bswap_16(x)) #define u32_to_cpu(x) (bswap_32(x)) #define u64_to_cpu(x) (bswap_64(x)) #define cpu_to_u16(x) (bswap_16(x)) #define cpu_to_u32(x) (bswap_32(x)) #define cpu_to_u64(x) (bswap_64(x)) #else #error "Please define type conversion macros" #endif struct sector { struct list_head list; unsigned long long id; unsigned long long epoch; void *data; }; struct lsbd { int fd; unsigned long long lsize; /* logical size */ unsigned long long lsectors; unsigned long long usize; /* usable size */ unsigned long long size; /* physical size */ unsigned long sector_size; /* size of sector */ unsigned long blocks_per_sector; unsigned long blocks; unsigned long lcache_blocks; unsigned long cur_lcache; unsigned long block_usize; /* usable size of block */ unsigned long sectors_per_block; unsigned long long epoch; unsigned long cur_block; unsigned long clean_block; unsigned long mirror_offset; lsbd_lcache_t *lcache; struct list_head wqueue; unsigned long wqueue_len; }; unsigned long block_prev(struct lsbd *p, unsigned long block) { return block ? block - 1 : p->blocks - 1; } unsigned long block_diff(struct lsbd *p, unsigned long a, unsigned long b) { if (a >= b) return a - b; return p->blocks + a - b; } int lsbdd(struct lsbd *p); unsigned long long read_count; unsigned long long write_count; static ssize_t read_all(int fd, void *buf, size_t count) { char *b = buf; size_t readed = 0; ssize_t ret; while (readed < count) { ret = read(fd, b, count - readed); if (ret < 0) return -errno; if (ret == 0) return readed; b += ret; readed += ret; } read_count++; return readed; } static ssize_t write_all(int fd, const void *buf, size_t count) { const char *b = buf; size_t written = 0; ssize_t ret; while (written < count) { ret = write(fd, b, count - written); if (ret < 0) return -errno; if (ret == 0) return written; b += ret; written += ret; } write_count++; return written; } static size_t lsbd_block_size(struct lsbd *p) { return p->sectors_per_block * p->sector_size; } static void *__lsbd_read_block(struct lsbd *p, unsigned long id, int fast) { ssize_t ret; size_t size; void *b; size = lsbd_block_size(p); b = memalign(4096, size); lseek(p->fd, size * id, SEEK_SET); if (fast) size = p->sector_size; ret = read_all(p->fd, b, size); if (ret != size) { free(b); return NULL; } return b; } static void *lsbd_read_sector(struct lsbd *p, unsigned long sector) { ssize_t ret; size_t size; void *b; size = p->sector_size; b = memalign(4096, size); lseek(p->fd, size * sector, SEEK_SET); ret = read_all(p->fd, b, size); if (ret != size) { free(b); return NULL; } return b; } static void *lsbd_read_block(struct lsbd *p, unsigned long id) { return __lsbd_read_block(p, id, 0); } static void *lsbd_read_block_fast(struct lsbd *p, unsigned long id) { return __lsbd_read_block(p, id, 1); } static u32 checksum(const void *buf, size_t count) { const u32 *b = buf; size_t i; u32 csum = 0x12345678; if (count & 3) { fprintf(stderr, "unaligned checksum\n"); return 0; } for (i = 0; i < count / 4; i++) csum += b[i]; return csum; } static int lsbd_block_verify_ok(struct lsbd *p, const struct lsbd_block *b) { u32 csum = checksum(b, sizeof(*b) - 4); return (b->checksum == cpu_to_u32(csum)); } static int lsbd_block_commit(struct lsbd *p, struct lsbd_block *b) { u32 csum = checksum(b, sizeof(*b) - 4); b->checksum = cpu_to_u32(csum); return 0; } static int lsbd_write_block(struct lsbd *p, void *b, unsigned int id) { size_t size; ssize_t ret; size = lsbd_block_size(p); lseek(p->fd, id * size, SEEK_SET); ret = write_all(p->fd, b, size); if (ret != size) { fprintf(stderr, "write: %s\n", strerror(-ret)); return -EIO; } return 0; } static int initialize_block(struct lsbd *p, struct lsbd_block *b) { unsigned int i; memset(b, 0, sizeof(*b)); b->magic = cpu_to_u64(LSBD_BLOCK_MAGIC); b->version = cpu_to_u32(0); b->revision = cpu_to_u32(0); b->ctime = cpu_to_u64(0); b->mtime = cpu_to_u64(0); b->epoch = cpu_to_u64(0); /* FIXME: store age also in other blocks? */ b->age = cpu_to_u64(0); b->sector_size = cpu_to_u32(p->sector_size); b->sectors_per_block = cpu_to_u32(p->sectors_per_block); b->blocks = cpu_to_u32(p->blocks); b->lsectors = cpu_to_u32(p->lsectors); for (i = 0; i < 16; i++) b->prev_block[i] = cpu_to_u32(~0); return 0; } /* * find_current_block - find current block * * TODO: This should be rewritten to achive O(log n) instead O(n). */ static int find_current_block(struct lsbd *p) { void *buf; struct lsbd_block *b; unsigned long i; int ret; unsigned long block = 0; unsigned long base = 0; unsigned long blocks; unsigned long long epoch = 0; unsigned long long e = 0; unsigned long reads = 0; /* read first block */ for (i = 0; i < p->blocks; i++) { reads++; buf = lsbd_read_block(p, i); if (p == NULL) { fprintf(stderr, "block %ld: I/O error\n", i); free(buf); // fixme continue; } b = buf; ret = lsbd_block_verify_ok(p, b); if (!ret) { free(buf); continue; } base = i; epoch = cpu_to_u64(b->epoch); free(buf); break; } if (i == p->blocks) { p->epoch = 0; p->cur_block = 0; p->clean_block = p->blocks - 1; return 0; } fprintf(stderr, "base %ld\n", base); blocks = p->blocks - base; block = base + blocks / 2; // fprintf(stderr, "a: testing block %ld\n", block); while (blocks > 1) { reads++; buf = lsbd_read_block(p, block); if (buf == NULL) { fprintf(stderr, "block %ld: I/O error\n", block); free(buf); // fixme block++; if (block >= base + blocks) { blocks = blocks / 2; block = base + blocks / 2; } // fprintf(stderr, "b: testing block %ld %ld\n", // block, base + blocks); continue; } b = buf; ret = lsbd_block_verify_ok(p, b); if (!ret) { free(buf); block++; if (block >= base + blocks) { blocks = blocks / 2; block = base + blocks / 2; } // fprintf(stderr, "c: testing block %ld %ld\n", // block, base + blocks); continue; } e = cpu_to_u64(b->epoch); if (e >= epoch) { epoch = e; blocks = base + blocks - block; base = block; } else { blocks = blocks / 2; } block = base + blocks / 2; // fprintf(stderr, "d: testing block %ld\n", // block); free(buf); } fprintf(stderr, "lsbd: current block is %ld, epoch %Ld, %ld io\n", block, epoch, reads); p->epoch = epoch; p->cur_block = block; p->clean_block = block; return 0; } int lsbd_read_lcache(struct lsbd *p) { void *bp; struct lsbd_block *b; unsigned long block; unsigned long long readed = 0; unsigned long long sector_id; unsigned long i = 0; int ret; unsigned long sectors; unsigned long sectors_max; struct lsbd_sect *sects; unsigned long readed_blocks = 0; unsigned long lcache_offset; unsigned long lcache_chunk; unsigned long lcache_base; lsbd_lcache_t *cache; int first = 1; sectors_max = (p->sector_size - sizeof(*b)) / sizeof(sects[0]); block = p->cur_block; do { readed_blocks++; bp = lsbd_read_block_fast(p, block); if (p == NULL) { fprintf(stderr, "block %ld: I/O error\n", i); free(bp); // fixme continue; } b = bp; ret = lsbd_block_verify_ok(p, b); if (!ret) { // fprintf(stderr, "block %ld: checksum error\n", i); free(bp); // fixme continue; } sects = (void *) ((char *) bp + u32_to_cpu(b->ptab_offset)); sectors = u32_to_cpu(b->sectors_per_block) - 1; if (sectors > sectors_max) { fprintf(stderr, "block %ld: invalid sectors per " "block: %ld %ld\n", block, sectors, sectors_max); sectors = sectors_max; } for (i = 0; i < sectors; i++) { sector_id = u64_to_cpu(sects[i].id); if (sector_id >= p->lsectors) continue; if (u32_to_cpu(p->lcache[sector_id]) == LSBD_BLOCK_INVALID) { p->lcache[sector_id] = cpu_to_u32(block * p->sectors_per_block + i); readed++; } } lcache_offset = u32_to_cpu(b->lcache_offset); if (lcache_offset > p->sector_size) { fprintf(stderr, "lcache: invalid offset\n"); free(bp); continue; } lcache_chunk = u32_to_cpu(b->lcache_chunk); if (lcache_chunk > (p->sector_size - lcache_offset) / sizeof(*cache)) { lcache_chunk = (p->sector_size - lcache_offset) / sizeof(*cache); } lcache_base = u32_to_cpu(b->lcache_base); cache = (void *) ((char *) bp + lcache_offset); #if 0 fprintf(stderr, "lcache: reading block %ld: %ld-%ld\n", block, lcache_base, lcache_base + lcache_chunk - 1); #endif if (u32_to_cpu(b->lcache_checksum) != checksum(cache, lcache_chunk * sizeof(*cache))) { fprintf(stderr, "lcache: invalid checksum\n"); free(bp); continue; } if (first) { p->cur_lcache = lcache_base + lcache_chunk; first = 0; } for (i = 0; i < lcache_chunk; i++) { unsigned long long sector_id = lcache_base + i; if (sector_id >= p->lsectors) break; if (u32_to_cpu(p->lcache[sector_id]) == LSBD_BLOCK_INVALID) { p->lcache[sector_id] = cache[i]; readed++; } } free(bp); } while ((readed < p->lsectors) && ((block = block_prev(p, block)) != p->cur_block)); #if 0 for (i = 0; i < p->lsectors; i++) { if (u32_to_cpu(p->lcache[i]) == LSBD_BLOCK_INVALID) { fprintf(stderr, "lcache: invalid %ld\n", i); } else if (u32_to_cpu(p->lcache[i]) != LSBD_BLOCK_ZERO) { fprintf(stderr, "lcache: %ld mapped at block %ld\n", i, u32_to_cpu(p->lcache[i])); } } #endif fprintf(stderr, "lcache: recovered using %ld blocks, %Ld/%Ld\n", readed_blocks, readed, p->lsectors); return 0; } int lsbd_open(struct lsbd *p, const char *path, unsigned int flags) { int fd; unsigned long i; INIT_LIST_HEAD(&p->wqueue); fd = open(path, O_RDWR | O_DIRECT | O_SYNC); if (fd == -1) { fd = open(path, O_RDWR | O_SYNC); if (fd == -1) return -errno; } p->fd = fd; p->size = lseek(fd, 0, SEEK_END); if (p->size == (off_t) -1) return -errno; // p->size = 64 * 1024 * 1024; fprintf(stderr, "size: %Ld MiB\n", p->size >> 20); /* 4 kB sectors */ p->sector_size = 4UL << 10; /* 16 sectors per block */ p->sectors_per_block = 16; p->block_usize = p->sector_size; p->block_usize *= p->sectors_per_block - 1; // p->lsize = 1536ULL << 20; p->lsize = 256ULL << 10; p->lsectors = p->lsize / p->sector_size; fprintf(stderr, "logical sectors: %Ld\n", p->lsectors); fprintf(stderr, "lcache size %Ld\n", p->lsectors * 4); p->mirror_offset = 0; p->lcache = malloc(p->lsectors * sizeof(p->lcache[0])); if (p->lcache == NULL) abort(); for (i = 0; i < p->lsectors; i++) p->lcache[i] = cpu_to_u32(LSBD_BLOCK_INVALID); p->lcache_blocks = (p->lsectors * 4 + p->sector_size - 2049) / (p->sector_size - 2048); fprintf(stderr, "using %ld blocks for lcache\n", p->lcache_blocks); #if 0 p->lcache = malloc(p->lsize / p->sector_size * sizeof(p->lcache)); if (p->lcache == NULL) abort(); #endif p->blocks = p->size / lsbd_block_size(p); fprintf(stderr, "physical blocks: %ld\n", p->blocks); p->usize = p->blocks * p->block_usize; fprintf(stderr, "usable size: %Ld MiB\n", p->usize >> 20); if (flags & LSBD_FORMAT) { p->epoch = 0; p->cur_block = 0; p->clean_block = p->blocks - 1; lsbd_format(p); } else { find_current_block(p); lsbd_read_lcache(p); } #if 0 read_blocks(p); #endif fprintf(stderr, "Summary:\n"); fprintf(stderr, " %Lu KiB Lcache\n", p->lsectors * 4 >> 10); fprintf(stderr, " %lu blocks\n", p->blocks); fprintf(stderr, " %Lu MiB size\n", p->size >> 20); fprintf(stderr, " %Lu MiB usable size\n", p->usize >> 20); return 0; } int lsbd_read_sect(struct lsbd *p, void *buf, unsigned long sector_id) { void *bp; unsigned long sector; sector = u32_to_cpu(p->lcache[sector_id]); if (sector == LSBD_BLOCK_ZERO) { memset(buf, 0, p->sector_size); return 0; } if (sector == LSBD_BLOCK_INVALID) return -EINVAL; bp = lsbd_read_sector(p, sector); if (bp == NULL) { /* TODO: read mirror */ return -EIO; } memcpy(buf, bp, p->sector_size); free(bp); return 0; } static void *allocate_block(struct lsbd *p) { void *buf; struct lsbd_block *b; p->cur_block++; if (p->cur_block >= p->blocks) p->cur_block = 0; buf = memalign(4096, lsbd_block_size(p)); if (buf == NULL) return NULL; memset(buf, 0, lsbd_block_size(p)); b = buf; initialize_block(p, b); p->epoch++; b->age = cpu_to_u64(u64_to_cpu(b->age) + 1); b->epoch = cpu_to_u64(p->epoch); // fprintf(stderr, "block %ld: epoch: %Ld\n", p->cur_block, p->epoch); return buf; } int lsbd_write_sect(struct lsbd *p, const void *buf, unsigned long lid) { struct sector *s; void *data; s = malloc(sizeof(*s)); if (s == NULL) return -ENOMEM; data = malloc(p->sector_size); if (data == NULL) { free(s); return -ENOMEM; } memcpy(data, buf, p->sector_size); s->data = data; s->id = lid; s->epoch = 0; // fprintf(stderr, "write: sector %Ld, size = %ld\n", s->id, // p->sector_size); list_add(&s->list, &p->wqueue); p->wqueue_len++; if (p->wqueue_len > 32) lsbdd(p); return 0; } void clean_block(struct lsbd *p) { unsigned long block = p->clean_block; void *buf; void *fbuf = NULL; struct lsbd_block *b; int ret; struct lsbd_sect *sects; unsigned long sectors; unsigned long sectors_max; unsigned long i; sectors_max = (p->sector_size - sizeof(*b)) / sizeof(sects[0]); block++; if (block >= p->blocks) block = 0; /* * do not clean mirror blocks */ if (p->mirror_offset && (block & 1)) goto out; buf = lsbd_read_block_fast(p, block); if (buf == NULL) goto out; b = buf; ret = lsbd_block_verify_ok(p, b); if (!ret) { free(buf); goto out; } sects = (void *) ((char *) buf + u32_to_cpu(b->ptab_offset)); sectors = u32_to_cpu(b->sectors_per_block) - 1; if (sectors > sectors_max) { fprintf(stderr, "block %ld: invalid sectors per " "block: %ld %ld\n", block, sectors, sectors_max); sectors = sectors_max; } for (i = 0; i < sectors; i++) { unsigned long long sector_id = u64_to_cpu(sects[i].id); unsigned long s = block * p->sectors_per_block + i; if (sector_id >= p->lsectors) continue; if (u32_to_cpu(p->lcache[sector_id]) == s) { if (fbuf == NULL) { fbuf = lsbd_read_block(p, block); if (fbuf == NULL) goto out; } lsbd_write_sect(p, fbuf + (i + 1) * p->sector_size, sector_id); } } free(fbuf); free(buf); out: p->clean_block = block; } void block_write_sector(struct lsbd *p, void *buf, unsigned int i, struct sector *s) { struct lsbd_block *b = buf; struct lsbd_sect *sects = (void *) ((char *) buf + u32_to_cpu(b->ptab_offset)); memcpy((char *) buf + (i + 1) * p->sector_size, s->data, p->sector_size); sects[i].data_checksum = cpu_to_u32(checksum(s->data, p->sector_size)); sects[i].id = cpu_to_u64(s->id); sects[i].epoch = cpu_to_u64(s->epoch); sects[i].age = cpu_to_u32(0); sects[i].flags = cpu_to_u32(0); } int write_lcache(struct lsbd *p, void *bp) { struct lsbd_block *b = bp; unsigned long chunk; unsigned long i; lsbd_lcache_t *cache; if (p->cur_lcache >= p->lsectors) p->cur_lcache = 0; chunk = (p->sector_size - 0x800) / sizeof(*cache); b->lcache_offset = cpu_to_u32(0x800); b->lcache_base = cpu_to_u32(p->cur_lcache); b->lcache_chunk = cpu_to_u32(chunk); cache = (void *) ((char *) bp + u32_to_cpu(b->lcache_offset)); for (i = 0; i < chunk; i++) { unsigned long long sector_id = p->cur_lcache + i; if (sector_id >= p->lsectors) break; cache[i] = p->lcache[sector_id]; } b->lcache_checksum = cpu_to_u32(checksum(cache, chunk * sizeof(*cache))); p->cur_lcache += chunk; return 0; } int write_block_new(struct lsbd *p) { void *buf; struct lsbd_block *b; struct sector *s, *s2; unsigned int sectors = 0; buf = allocate_block(p); if (buf == NULL) return -ENOMEM; b = buf; b->ptab_offset = cpu_to_u32(0x400); /* find sectors to write */ list_for_each_entry_safe(s, s2, &p->wqueue, list) { block_write_sector(p, buf, sectors, s); p->lcache[s->id] = cpu_to_u32(p->cur_block * p->sectors_per_block + sectors); list_del(&s->list); p->wqueue_len--; free(s->data); free(s); sectors++; if (sectors >= p->sectors_per_block - 1) break; } b->ptab_checksum = cpu_to_u32(checksum((char *) buf + u32_to_cpu(b->ptab_offset), (p->sectors_per_block - 1) * sizeof(struct lsbd_sect))); write_lcache(p, b); lsbd_block_commit(p, b); if (lsbd_write_block(p, buf, p->cur_block)) fprintf(stderr, "block %ld: I/O error\n", p->cur_block); free(buf); return 0; } int write_block_mirror(struct lsbd *p) { void *buf; struct lsbd_block *b; unsigned long block; int ret; p->cur_block++; if (p->cur_block >= p->blocks) p->cur_block = 0; if (p->mirror_offset >= p->cur_block) { block = p->cur_block - p->mirror_offset; } else { block = p->blocks + p->cur_block - p->mirror_offset; } buf = lsbd_read_block(p, block); if (buf == NULL) return -ENOMEM; b = buf; ret = lsbd_block_verify_ok(p, b); if (!ret) initialize_block(p, buf); p->epoch++; b->age = cpu_to_u64(u64_to_cpu(b->age) + 1); b->epoch = cpu_to_u64(p->epoch); write_lcache(p, b); lsbd_block_commit(p, b); lsbd_write_block(p, buf, p->cur_block); free(buf); return 0; } int write_block(struct lsbd *p) { if (p->mirror_offset && !(p->cur_block & 1)) { write_block_mirror(p); } else { write_block_new(p); } return 0; } int lsbdd(struct lsbd *p) { if (block_diff(p, p->clean_block, p->cur_block) < 16) clean_block(p); if (!list_empty(&p->wqueue) && (block_diff(p, p->clean_block, p->cur_block) > 1)) write_block(p); return 0; } int lsbd_sync(struct lsbd *p) { while (!list_empty(&p->wqueue)) { lsbdd(p); } fdatasync(p->fd); return 0; } int lsbd_close(struct lsbd *p) { return -ENOSYS; } struct lsbd *lsbd_alloc(void) { return malloc(sizeof(struct lsbd)); } int lsbd_format(struct lsbd *p) { unsigned long i; void *data; for (i = 0; i < p->lsectors; i++) p->lcache[i] = cpu_to_u32(LSBD_BLOCK_ZERO); data = malloc(p->sector_size); memset(data, 0, p->sector_size); for (i = 0; i < p->blocks; i++) { lsbd_write_sect(p, data, 0); lsbd_sync(p); } return 0; } #if 0 int main(void) { struct lsbd lsbd; int ret; unsigned int i; u32 *p; ret = lsbd_open(&lsbd, "/tmp/storage"); if (ret) { perror("lsbd_open"); exit(EXIT_FAILURE); } p = malloc(4096); if (p == NULL) abort(); // lsbd_format(&lsbd); memset(p, 0, 4096); #if 0 for (i = 0; i < 100; i++) { p[0] = i; lsbd_write_sect(&lsbd, p, i); lsbd_sync(&lsbd); } #endif for (i = 0; i < 1000; i++) { unsigned long long t; t = read_count; ret = lsbd_read_sect(&lsbd, p, i); if (ret) { fprintf(stderr, "read: I/O error\n"); continue; } t = read_count - t; fprintf(stderr, "written: %d, readed %d, i/o %Ld\n", i, *p, t); } return 0; } #endif