diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index a42f522f52e7..2b9ed0e3af37 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -717,11 +717,248 @@ static void r5l_wake_reclaim(struct r5l_log *log, sector_t space) md_wakeup_thread(log->reclaim_thread); } +struct r5l_recovery_ctx { + struct page *meta_page; /* current meta */ + sector_t meta_total_blocks; /* total size of current meta and data */ + sector_t pos; /* recovery position */ + u64 seq; /* recovery position seq */ +}; + +static int r5l_read_meta_block(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct page *page = ctx->meta_page; + struct r5l_meta_block *mb; + u32 crc, stored_crc; + + if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false)) + return -EIO; + + mb = page_address(page); + stored_crc = le32_to_cpu(mb->checksum); + mb->checksum = 0; + + if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || + le64_to_cpu(mb->seq) != ctx->seq || + mb->version != R5LOG_VERSION || + le64_to_cpu(mb->position) != ctx->pos) + return -EINVAL; + + crc = crc32_le(log->uuid_checksum, (void *)mb, PAGE_SIZE); + if (stored_crc != crc) + return -EINVAL; + + if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) + return -EINVAL; + + ctx->meta_total_blocks = BLOCK_SECTORS; + + return 0; +} + +static int r5l_recovery_flush_one_stripe(struct r5l_log *log, + struct r5l_recovery_ctx *ctx, + sector_t stripe_sect, + int *offset, sector_t *log_offset) +{ + struct r5conf *conf = log->rdev->mddev->private; + struct stripe_head *sh; + struct r5l_payload_data_parity *payload; + int disk_index; + + sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0); + while (1) { + payload = page_address(ctx->meta_page) + *offset; + + if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { + raid5_compute_sector(conf, + le64_to_cpu(payload->location), 0, + &disk_index, sh); + + sync_page_io(log->rdev, *log_offset, PAGE_SIZE, + sh->dev[disk_index].page, READ, false); + sh->dev[disk_index].log_checksum = + le32_to_cpu(payload->checksum[0]); + set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); + ctx->meta_total_blocks += BLOCK_SECTORS; + } else { + disk_index = sh->pd_idx; + sync_page_io(log->rdev, *log_offset, PAGE_SIZE, + sh->dev[disk_index].page, READ, false); + sh->dev[disk_index].log_checksum = + le32_to_cpu(payload->checksum[0]); + set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); + + if (sh->qd_idx >= 0) { + disk_index = sh->qd_idx; + sync_page_io(log->rdev, + r5l_ring_add(log, *log_offset, BLOCK_SECTORS), + PAGE_SIZE, sh->dev[disk_index].page, + READ, false); + sh->dev[disk_index].log_checksum = + le32_to_cpu(payload->checksum[1]); + set_bit(R5_Wantwrite, + &sh->dev[disk_index].flags); + } + ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; + } + + *log_offset = r5l_ring_add(log, *log_offset, + le32_to_cpu(payload->size)); + *offset += sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) + break; + } + + for (disk_index = 0; disk_index < sh->disks; disk_index++) { + void *addr; + u32 checksum; + + if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) + continue; + addr = kmap_atomic(sh->dev[disk_index].page); + checksum = crc32_le(log->uuid_checksum, addr, PAGE_SIZE); + kunmap_atomic(addr); + if (checksum != sh->dev[disk_index].log_checksum) + goto error; + } + + for (disk_index = 0; disk_index < sh->disks; disk_index++) { + struct md_rdev *rdev, *rrdev; + + if (!test_and_clear_bit(R5_Wantwrite, + &sh->dev[disk_index].flags)) + continue; + + /* in case device is broken */ + rdev = rcu_dereference(conf->disks[disk_index].rdev); + if (rdev) + sync_page_io(rdev, stripe_sect, PAGE_SIZE, + sh->dev[disk_index].page, WRITE, false); + rrdev = rcu_dereference(conf->disks[disk_index].replacement); + if (rrdev) + sync_page_io(rrdev, stripe_sect, PAGE_SIZE, + sh->dev[disk_index].page, WRITE, false); + } + raid5_release_stripe(sh); + return 0; + +error: + for (disk_index = 0; disk_index < sh->disks; disk_index++) + sh->dev[disk_index].flags = 0; + raid5_release_stripe(sh); + return -EINVAL; +} + +static int r5l_recovery_flush_one_meta(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct r5conf *conf = log->rdev->mddev->private; + struct r5l_payload_data_parity *payload; + struct r5l_meta_block *mb; + int offset; + sector_t log_offset; + sector_t stripe_sector; + + mb = page_address(ctx->meta_page); + offset = sizeof(struct r5l_meta_block); + log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); + + while (offset < le32_to_cpu(mb->meta_size)) { + int dd; + + payload = (void *)mb + offset; + stripe_sector = raid5_compute_sector(conf, + le64_to_cpu(payload->location), 0, &dd, NULL); + if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector, + &offset, &log_offset)) + return -EINVAL; + } + return 0; +} + +/* copy data/parity from log to raid disks */ +static void r5l_recovery_flush_log(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + while (1) { + if (r5l_read_meta_block(log, ctx)) + return; + if (r5l_recovery_flush_one_meta(log, ctx)) + return; + ctx->seq++; + ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); + } +} + +static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, + u64 seq) +{ + struct page *page; + struct r5l_meta_block *mb; + u32 crc; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + return -ENOMEM; + mb = page_address(page); + mb->magic = cpu_to_le32(R5LOG_MAGIC); + mb->version = R5LOG_VERSION; + mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); + mb->seq = cpu_to_le64(seq); + mb->position = cpu_to_le64(pos); + crc = crc32_le(log->uuid_checksum, (void *)mb, PAGE_SIZE); + mb->checksum = cpu_to_le32(crc); + + if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) { + __free_page(page); + return -EIO; + } + __free_page(page); + return 0; +} + static int r5l_recovery_log(struct r5l_log *log) { - /* fake recovery */ - log->seq = log->last_cp_seq + 1; - log->log_start = r5l_ring_add(log, log->last_checkpoint, BLOCK_SECTORS); + struct r5l_recovery_ctx ctx; + + ctx.pos = log->last_checkpoint; + ctx.seq = log->last_cp_seq; + ctx.meta_page = alloc_page(GFP_KERNEL); + if (!ctx.meta_page) + return -ENOMEM; + + r5l_recovery_flush_log(log, &ctx); + __free_page(ctx.meta_page); + + /* + * we did a recovery. Now ctx.pos points to an invalid meta block. New + * log will start here. but we can't let superblock point to last valid + * meta block. The log might looks like: + * | meta 1| meta 2| meta 3| + * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If + * superblock points to meta 1, we write a new valid meta 2n. if crash + * happens again, new recovery will start from meta 1. Since meta 2n is + * valid now, recovery will think meta 3 is valid, which is wrong. + * The solution is we create a new meta in meta2 with its seq == meta + * 1's seq + 10 and let superblock points to meta2. The same recovery will + * not think meta 3 is a valid meta, because its seq doesn't match + */ + if (ctx.seq > log->last_cp_seq + 1) { + int ret; + + ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10); + if (ret) + return ret; + log->seq = ctx.seq + 11; + log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); + r5l_write_super(log, ctx.pos); + } else { + log->log_start = ctx.pos; + log->seq = ctx.seq; + } return 0; }