/* Copyright (c) Mark Harmstone 2017
 *
 * This file is part of WinBtrfs.
 *
 * WinBtrfs is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public Licence as published by
 * the Free Software Foundation, either version 3 of the Licence, or
 * (at your option) any later version.
 *
 * WinBtrfs is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public Licence for more details.
 *
 * You should have received a copy of the GNU Lesser General Public Licence
 * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */

#include "btrfs_drv.h"

#define SCRUB_UNIT 0x100000 // 1 MB

struct _scrub_context;

typedef struct {
    struct _scrub_context* context;
    PIRP Irp;
    uint64_t start;
    uint32_t length;
    IO_STATUS_BLOCK iosb;
    uint8_t* buf;
    bool csum_error;
    uint32_t* bad_csums;
} scrub_context_stripe;

typedef struct _scrub_context {
    KEVENT Event;
    scrub_context_stripe* stripes;
    LONG stripes_left;
} scrub_context;

typedef struct {
    ANSI_STRING name;
    bool orig_subvol;
    LIST_ENTRY list_entry;
} path_part;

// Resolves the path of the file owning a corrupt data extent by walking the INODE_REF /
// INODE_EXTREF items up to the subvolume root and the ROOT_BACKREF items up to the
// filesystem root, logs it, and queues a scrub_error entry.
static void log_file_checksum_error(device_extension* Vcb, uint64_t addr, uint64_t devid, uint64_t subvol, uint64_t inode, uint64_t offset) {
    LIST_ENTRY *le, parts;
    root* r = NULL;
    KEY searchkey;
    traverse_ptr tp;
    uint64_t dir;
    bool orig_subvol = true, not_in_tree = false;
    ANSI_STRING fn;
    scrub_error* err;
    NTSTATUS Status;
    ULONG utf16len;

    le = Vcb->roots.Flink;
    while (le != &Vcb->roots) {
        root* r2 = CONTAINING_RECORD(le, root, list_entry);

        if (r2->id == subvol) {
            r = r2;
            break;
        }

        le = le->Flink;
    }

    if (!r) {
        ERR("could not find subvol %I64x\n", subvol);
        return;
    }

    InitializeListHead(&parts);

    dir = inode;

    while (true) {
        if (dir == r->root_item.objid) {
            if (r == Vcb->root_fileref->fcb->subvol)
                break;

            searchkey.obj_id = r->id;
            searchkey.obj_type = TYPE_ROOT_BACKREF;
            searchkey.offset = 0xffffffffffffffff;

            Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
            if (!NT_SUCCESS(Status)) {
                ERR("find_item returned %08x\n", Status);
                goto end;
            }

            if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
                ROOT_REF* rr = (ROOT_REF*)tp.item->data;
                path_part* pp;

                if (tp.item->size < sizeof(ROOT_REF)) {
                    ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(ROOT_REF));
                    goto end;
                }

                if (tp.item->size < offsetof(ROOT_REF, name[0]) + rr->n) {
                    ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, offsetof(ROOT_REF, name[0]) + rr->n);
                    goto end;
                }

                pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
                if (!pp) {
                    ERR("out of memory\n");
                    goto end;
                }

                pp->name.Buffer = rr->name;
                pp->name.Length = pp->name.MaximumLength = rr->n;
                pp->orig_subvol = false;

                InsertTailList(&parts, &pp->list_entry);

                r = NULL;

                le = Vcb->roots.Flink;
                while (le != &Vcb->roots) {
                    root* r2 = CONTAINING_RECORD(le, root, list_entry);

                    if (r2->id == tp.item->key.offset) {
                        r = r2;
                        break;
                    }

                    le = le->Flink;
                }

                if (!r) {
                    ERR("could not find subvol %I64x\n", tp.item->key.offset);
                    goto end;
                }

                dir = rr->dir;
                orig_subvol = false;
            } else {
                not_in_tree = true;
                break;
            }
        } else {
            searchkey.obj_id = dir;
            searchkey.obj_type = TYPE_INODE_EXTREF;
            searchkey.offset = 0xffffffffffffffff;

            Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
            if (!NT_SUCCESS(Status)) {
                ERR("find_item returned %08x\n", Status);
                goto end;
            }

            if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == TYPE_INODE_REF) {
                INODE_REF* ir = (INODE_REF*)tp.item->data;
                path_part* pp;

                if (tp.item->size < sizeof(INODE_REF)) {
                    ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(INODE_REF));
                    goto end;
                }

                if (tp.item->size < offsetof(INODE_REF, name[0]) + ir->n) {
                    ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, offsetof(INODE_REF, name[0]) + ir->n);
                    goto end;
                }

                pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
                if (!pp) {
                    ERR("out of memory\n");
                    goto end;
                }

                pp->name.Buffer = ir->name;
                pp->name.Length = pp->name.MaximumLength = ir->n;
                pp->orig_subvol = orig_subvol;

                InsertTailList(&parts, &pp->list_entry);

                if (dir == tp.item->key.offset)
                    break;

                dir = tp.item->key.offset;
            } else if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == TYPE_INODE_EXTREF) {
                INODE_EXTREF* ier = (INODE_EXTREF*)tp.item->data;
                path_part* pp;

                if (tp.item->size < sizeof(INODE_EXTREF)) {
                    ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(INODE_EXTREF));
                    goto end;
                }

                if (tp.item->size < offsetof(INODE_EXTREF, name[0]) + ier->n) {
                    ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, offsetof(INODE_EXTREF, name[0]) + ier->n);
                    goto end;
                }

                pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
                if (!pp) {
                    ERR("out of memory\n");
                    goto end;
                }

                pp->name.Buffer = ier->name;
                pp->name.Length = pp->name.MaximumLength = ier->n;
                pp->orig_subvol = orig_subvol;

                InsertTailList(&parts, &pp->list_entry);

                if (dir == ier->dir)
                    break;

                dir = ier->dir;
            } else {
                ERR("could not find INODE_REF for inode %I64x in subvol %I64x\n", dir, r->id);
                goto end;
            }
        }
    }

    fn.MaximumLength = 0;

    if (not_in_tree) {
        le = parts.Blink;
        while (le != &parts) {
            path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);
            LIST_ENTRY* le2 = le->Blink;

            if (pp->orig_subvol)
                break;

            RemoveTailList(&parts);
            ExFreePool(pp);

            le = le2;
        }
    }

    le = parts.Flink;
    while (le != &parts) {
        path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);

        fn.MaximumLength += pp->name.Length + 1;

        le = le->Flink;
    }

    fn.Buffer = ExAllocatePoolWithTag(PagedPool, fn.MaximumLength, ALLOC_TAG);
    if (!fn.Buffer) {
        ERR("out of memory\n");
        goto end;
    }

    fn.Length = 0;

    le = parts.Blink;
    while (le != &parts) {
        path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);

        fn.Buffer[fn.Length] = '\\';
        fn.Length++;

        RtlCopyMemory(&fn.Buffer[fn.Length], pp->name.Buffer, pp->name.Length);
        fn.Length += pp->name.Length;

        le = le->Blink;
    }

    if (not_in_tree)
        ERR("subvol %I64x, %.*s, offset %I64x\n", subvol, fn.Length, fn.Buffer, offset);
    else
        ERR("%.*s, offset %I64x\n", fn.Length, fn.Buffer, offset);

    Status = utf8_to_utf16(NULL, 0, &utf16len, fn.Buffer, fn.Length);
    if (!NT_SUCCESS(Status)) {
        ERR("utf8_to_utf16 1 returned %08x\n", Status);
        ExFreePool(fn.Buffer);
        goto end;
    }

    err = ExAllocatePoolWithTag(PagedPool, offsetof(scrub_error, data.filename[0]) + utf16len, ALLOC_TAG);
    if (!err) {
        ERR("out of memory\n");
        ExFreePool(fn.Buffer);
        goto end;
    }

    err->address = addr;
    err->device = devid;
    err->recovered = false;
    err->is_metadata = false;
    err->parity = false;

    err->data.subvol = not_in_tree ? subvol : 0;
    err->data.offset = offset;
    err->data.filename_length = (uint16_t)utf16len;

    Status = utf8_to_utf16(err->data.filename, utf16len, &utf16len, fn.Buffer, fn.Length);
    if (!NT_SUCCESS(Status)) {
        ERR("utf8_to_utf16 2 returned %08x\n", Status);
        ExFreePool(fn.Buffer);
        ExFreePool(err);
        goto end;
    }

    ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);

    Vcb->scrub.num_errors++;
    InsertTailList(&Vcb->scrub.errors, &err->list_entry);

    ExReleaseResourceLite(&Vcb->scrub.stats_lock);

    ExFreePool(fn.Buffer);

end:
    while (!IsListEmpty(&parts)) {
        path_part* pp = CONTAINING_RECORD(RemoveHeadList(&parts), path_part, list_entry);

        ExFreePool(pp);
    }
}

// Reads the leaf at treeaddr and logs a file checksum error for every EXTENT_DATA
// item in it which refers to the corrupt data address.
static void log_file_checksum_error_shared(device_extension* Vcb, uint64_t treeaddr, uint64_t addr, uint64_t devid, uint64_t extent) {
    tree_header* tree;
    NTSTATUS Status;
    leaf_node* ln;
    ULONG i;

    tree = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
    if (!tree) {
        ERR("out of memory\n");
        return;
    }

    Status = read_data(Vcb, treeaddr, Vcb->superblock.node_size, NULL, true, (uint8_t*)tree, NULL, NULL, NULL, 0, false, NormalPagePriority);
    if (!NT_SUCCESS(Status)) {
        ERR("read_data returned %08x\n", Status);
        goto end;
    }

    if (tree->level != 0) {
        ERR("tree level was %x, expected 0\n", tree->level);
        goto end;
    }

    ln = (leaf_node*)&tree[1];

    for (i = 0; i < tree->num_items; i++) {
        if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
            EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)tree + sizeof(tree_header) + ln[i].offset);
            EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;

            if (ed->type == EXTENT_TYPE_REGULAR && ed2->size != 0 && ed2->address == addr)
                log_file_checksum_error(Vcb, addr, devid, tree->tree_id, ln[i].key.obj_id, ln[i].key.offset + addr - extent);
        }
    }

end:
    ExFreePool(tree);
}

// Records an unrecovered metadata checksum error against the given root and level.
static void log_tree_checksum_error(device_extension* Vcb, uint64_t addr, uint64_t devid, uint64_t root, uint8_t level, KEY* firstitem) {
    scrub_error* err;

    err = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_error), ALLOC_TAG);
    if (!err) {
        ERR("out of memory\n");
        return;
    }

    err->address = addr;
    err->device = devid;
    err->recovered = false;
    err->is_metadata = true;
    err->parity = false;

    err->metadata.root = root;
    err->metadata.level = level;

    if (firstitem) {
        ERR("root %I64x, level %u, first item (%I64x,%x,%I64x)\n", root, level, firstitem->obj_id, firstitem->obj_type, firstitem->offset);
        err->metadata.firstitem = *firstitem;
    } else {
        ERR("root %I64x, level %u\n", root, level);
        RtlZeroMemory(&err->metadata.firstitem, sizeof(KEY));
    }

    ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);

    Vcb->scrub.num_errors++;
    InsertTailList(&Vcb->scrub.errors, &err->list_entry);

    ExReleaseResourceLite(&Vcb->scrub.stats_lock);
}

// Reads the internal node at offset and logs a metadata checksum error for the
// child pointer which matches the corrupt address.
static void log_tree_checksum_error_shared(device_extension* Vcb, uint64_t offset, uint64_t address, uint64_t devid) {
    tree_header* tree;
    NTSTATUS Status;
    internal_node* in;
    ULONG i;

    tree = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
    if (!tree) {
        ERR("out of memory\n");
        return;
    }

    Status = read_data(Vcb, offset, Vcb->superblock.node_size, NULL, true, (uint8_t*)tree, NULL, NULL, NULL, 0, false, NormalPagePriority);
    if (!NT_SUCCESS(Status)) {
        ERR("read_data returned %08x\n", Status);
        goto end;
    }

    if (tree->level == 0) {
        ERR("tree level was 0\n");
        goto end;
    }

    in = (internal_node*)&tree[1];

    for (i = 0; i < tree->num_items; i++) {
        if (in[i].address == address) {
            log_tree_checksum_error(Vcb, address, devid, tree->tree_id, tree->level - 1, &in[i].key);
            break;
        }
    }

end:
    ExFreePool(tree);
}
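// Called for an address which could not be recovered: looks up the EXTENT_ITEM or
// METADATA_ITEM covering it in the extent tree, then walks the inline backreferences
// (and, if those do not account for the whole refcount, the separate backref items
// which follow) so that an error is logged against every tree or file referring to
// the corrupt address.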
static void log_unrecoverable_error(device_extension* Vcb, uint64_t address, uint64_t devid) {
    KEY searchkey;
    traverse_ptr tp;
    NTSTATUS Status;
    EXTENT_ITEM* ei;
    EXTENT_ITEM2* ei2 = NULL;
    uint8_t* ptr;
    ULONG len;
    uint64_t rc;

    // FIXME - still log even if rest of this function fails

    searchkey.obj_id = address;
    searchkey.obj_type = TYPE_METADATA_ITEM;
    searchkey.offset = 0xffffffffffffffff;

    Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
    if (!NT_SUCCESS(Status)) {
        ERR("find_item returned %08x\n", Status);
        return;
    }

    if ((tp.item->key.obj_type != TYPE_EXTENT_ITEM && tp.item->key.obj_type != TYPE_METADATA_ITEM) ||
        tp.item->key.obj_id >= address + Vcb->superblock.sector_size ||
        (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.obj_id + tp.item->key.offset <= address) ||
        (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->key.obj_id + Vcb->superblock.node_size <= address))
        return;

    if (tp.item->size < sizeof(EXTENT_ITEM)) {
        ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM));
        return;
    }

    ei = (EXTENT_ITEM*)tp.item->data;
    ptr = (uint8_t*)&ei[1];
    len = tp.item->size - sizeof(EXTENT_ITEM);

    if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && ei->flags & EXTENT_ITEM_TREE_BLOCK) {
        if (tp.item->size < sizeof(EXTENT_ITEM) + sizeof(EXTENT_ITEM2)) {
            ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM) + sizeof(EXTENT_ITEM2));
            return;
        }

        ei2 = (EXTENT_ITEM2*)ptr;

        ptr += sizeof(EXTENT_ITEM2);
        len -= sizeof(EXTENT_ITEM2);
    }

    rc = 0;

    while (len > 0) {
        uint8_t type = *ptr;

        ptr++;
        len--;

        if (type == TYPE_TREE_BLOCK_REF) {
            TREE_BLOCK_REF* tbr;

            if (len < sizeof(TREE_BLOCK_REF)) {
                ERR("TREE_BLOCK_REF takes up %u bytes, but only %u remaining\n", sizeof(TREE_BLOCK_REF), len);
                break;
            }

            tbr = (TREE_BLOCK_REF*)ptr;

            log_tree_checksum_error(Vcb, address, devid, tbr->offset, ei2 ? ei2->level : (uint8_t)tp.item->key.offset, ei2 ?
&ei2->firstitem : NULL); rc++; ptr += sizeof(TREE_BLOCK_REF); len -= sizeof(TREE_BLOCK_REF); } else if (type == TYPE_EXTENT_DATA_REF) { EXTENT_DATA_REF* edr; if (len < sizeof(EXTENT_DATA_REF)) { ERR("EXTENT_DATA_REF takes up %u bytes, but only %u remaining\n", sizeof(EXTENT_DATA_REF), len); break; } edr = (EXTENT_DATA_REF*)ptr; log_file_checksum_error(Vcb, address, devid, edr->root, edr->objid, edr->offset + address - tp.item->key.obj_id); rc += edr->count; ptr += sizeof(EXTENT_DATA_REF); len -= sizeof(EXTENT_DATA_REF); } else if (type == TYPE_SHARED_BLOCK_REF) { SHARED_BLOCK_REF* sbr; if (len < sizeof(SHARED_BLOCK_REF)) { ERR("SHARED_BLOCK_REF takes up %u bytes, but only %u remaining\n", sizeof(SHARED_BLOCK_REF), len); break; } sbr = (SHARED_BLOCK_REF*)ptr; log_tree_checksum_error_shared(Vcb, sbr->offset, address, devid); rc++; ptr += sizeof(SHARED_BLOCK_REF); len -= sizeof(SHARED_BLOCK_REF); } else if (type == TYPE_SHARED_DATA_REF) { SHARED_DATA_REF* sdr; if (len < sizeof(SHARED_DATA_REF)) { ERR("SHARED_DATA_REF takes up %u bytes, but only %u remaining\n", sizeof(SHARED_DATA_REF), len); break; } sdr = (SHARED_DATA_REF*)ptr; log_file_checksum_error_shared(Vcb, sdr->offset, address, devid, tp.item->key.obj_id); rc += sdr->count; ptr += sizeof(SHARED_DATA_REF); len -= sizeof(SHARED_DATA_REF); } else { ERR("unknown extent type %x\n", type); break; } } if (rc < ei->refcount) { do { traverse_ptr next_tp; if (find_next_item(Vcb, &tp, &next_tp, false, NULL)) tp = next_tp; else break; if (tp.item->key.obj_id == address) { if (tp.item->key.obj_type == TYPE_TREE_BLOCK_REF) log_tree_checksum_error(Vcb, address, devid, tp.item->key.offset, ei2 ? ei2->level : (uint8_t)tp.item->key.offset, ei2 ? &ei2->firstitem : NULL); else if (tp.item->key.obj_type == TYPE_EXTENT_DATA_REF) { EXTENT_DATA_REF* edr; if (tp.item->size < sizeof(EXTENT_DATA_REF)) { ERR("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_DATA_REF)); break; } edr = (EXTENT_DATA_REF*)tp.item->data; log_file_checksum_error(Vcb, address, devid, edr->root, edr->objid, edr->offset + address - tp.item->key.obj_id); } else if (tp.item->key.obj_type == TYPE_SHARED_BLOCK_REF) log_tree_checksum_error_shared(Vcb, tp.item->key.offset, address, devid); else if (tp.item->key.obj_type == TYPE_SHARED_DATA_REF) log_file_checksum_error_shared(Vcb, tp.item->key.offset, address, devid, tp.item->key.obj_id); } else break; } while (true); } } static void log_error(device_extension* Vcb, uint64_t addr, uint64_t devid, bool metadata, bool recoverable, bool parity) { if (recoverable) { scrub_error* err; if (parity) { ERR("recovering from parity error at %I64x on device %I64x\n", addr, devid); } else { if (metadata) ERR("recovering from metadata checksum error at %I64x on device %I64x\n", addr, devid); else ERR("recovering from data checksum error at %I64x on device %I64x\n", addr, devid); } err = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_error), ALLOC_TAG); if (!err) { ERR("out of memory\n"); return; } err->address = addr; err->device = devid; err->recovered = true; err->is_metadata = metadata; err->parity = parity; if (metadata) RtlZeroMemory(&err->metadata, sizeof(err->metadata)); else RtlZeroMemory(&err->data, sizeof(err->data)); ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true); Vcb->scrub.num_errors++; InsertTailList(&Vcb->scrub.errors, &err->list_entry); ExReleaseResourceLite(&Vcb->scrub.stats_lock); } else { if (metadata) ERR("unrecoverable metadata 
checksum error at %I64x\n", addr); else ERR("unrecoverable data checksum error at %I64x\n", addr); log_unrecoverable_error(Vcb, addr, devid); } } _Function_class_(IO_COMPLETION_ROUTINE) static NTSTATUS __stdcall scrub_read_completion(PDEVICE_OBJECT DeviceObject, PIRP Irp, PVOID conptr) { scrub_context_stripe* stripe = conptr; scrub_context* context = (scrub_context*)stripe->context; ULONG left = InterlockedDecrement(&context->stripes_left); UNUSED(DeviceObject); stripe->iosb = Irp->IoStatus; if (left == 0) KeSetEvent(&context->Event, 0, false); return STATUS_MORE_PROCESSING_REQUIRED; } static NTSTATUS scrub_extent_dup(device_extension* Vcb, chunk* c, uint64_t offset, uint32_t* csum, scrub_context* context) { NTSTATUS Status; bool csum_error = false; ULONG i; CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; uint16_t present_devices = 0; if (csum) { ULONG good_stripe = 0xffffffff; for (i = 0; i < c->chunk_item->num_stripes; i++) { if (c->devices[i]->devobj) { present_devices++; // if first stripe is okay, we only need to check that the others are identical to it if (good_stripe != 0xffffffff) { if (RtlCompareMemory(context->stripes[i].buf, context->stripes[good_stripe].buf, context->stripes[good_stripe].length) != context->stripes[i].length) { context->stripes[i].csum_error = true; csum_error = true; log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } else { Status = check_csum(Vcb, context->stripes[i].buf, context->stripes[i].length / Vcb->superblock.sector_size, csum); if (Status == STATUS_CRC_ERROR) { context->stripes[i].csum_error = true; csum_error = true; log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } else if (!NT_SUCCESS(Status)) { ERR("check_csum returned %08x\n", Status); return Status; } else good_stripe = i; } } } } else { ULONG good_stripe = 0xffffffff; for (i = 0; i < c->chunk_item->num_stripes; i++) { ULONG j; if (c->devices[i]->devobj) { // if first stripe is okay, we only need to check that the others are identical to it if (good_stripe != 0xffffffff) { if (RtlCompareMemory(context->stripes[i].buf, context->stripes[good_stripe].buf, context->stripes[good_stripe].length) != context->stripes[i].length) { context->stripes[i].csum_error = true; csum_error = true; log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } else { for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) { tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size]; uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum)); if (crc32 != *((uint32_t*)th->csum) || th->address != offset + UInt32x32To64(j, Vcb->superblock.node_size)) { context->stripes[i].csum_error = true; csum_error = true; log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } if (!context->stripes[i].csum_error) good_stripe = i; } } } } if (!csum_error) return STATUS_SUCCESS; // handle checksum error for (i = 0; i < c->chunk_item->num_stripes; i++) { if (context->stripes[i].csum_error) { if (csum) { context->stripes[i].bad_csums = ExAllocatePoolWithTag(PagedPool, context->stripes[i].length * sizeof(uint32_t) / Vcb->superblock.sector_size, ALLOC_TAG); if (!context->stripes[i].bad_csums) { ERR("out of memory\n"); return STATUS_INSUFFICIENT_RESOURCES; } Status = calc_csum(Vcb, context->stripes[i].buf, context->stripes[i].length / Vcb->superblock.sector_size, context->stripes[i].bad_csums); if (!NT_SUCCESS(Status)) { ERR("calc_csum 
returned %08x\n", Status); return Status; } } else { ULONG j; context->stripes[i].bad_csums = ExAllocatePoolWithTag(PagedPool, context->stripes[i].length * sizeof(uint32_t) / Vcb->superblock.node_size, ALLOC_TAG); if (!context->stripes[i].bad_csums) { ERR("out of memory\n"); return STATUS_INSUFFICIENT_RESOURCES; } for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) { tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size]; uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum)); context->stripes[i].bad_csums[j] = crc32; } } } } if (present_devices > 1) { ULONG good_stripe = 0xffffffff; for (i = 0; i < c->chunk_item->num_stripes; i++) { if (c->devices[i]->devobj && !context->stripes[i].csum_error) { good_stripe = i; break; } } if (good_stripe != 0xffffffff) { // log for (i = 0; i < c->chunk_item->num_stripes; i++) { if (context->stripes[i].csum_error) { ULONG j; if (csum) { for (j = 0; j < context->stripes[i].length / Vcb->superblock.sector_size; j++) { if (context->stripes[i].bad_csums[j] != csum[j]) { uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.sector_size); log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, true, false); log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } } else { for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) { tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size]; uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size); if (context->stripes[i].bad_csums[j] != *((uint32_t*)th->csum) || th->address != addr) { log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, true, false); log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } } } } // write good data over bad for (i = 0; i < c->chunk_item->num_stripes; i++) { if (context->stripes[i].csum_error && !c->devices[i]->readonly) { Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + offset - c->offset, context->stripes[good_stripe].buf, context->stripes[i].length); if (!NT_SUCCESS(Status)) { ERR("write_data_phys returned %08x\n", Status); log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_WRITE_ERRORS); return Status; } } } return STATUS_SUCCESS; } // if csum errors on all stripes, check sector by sector for (i = 0; i < c->chunk_item->num_stripes; i++) { ULONG j; if (c->devices[i]->devobj) { if (csum) { for (j = 0; j < context->stripes[i].length / Vcb->superblock.sector_size; j++) { if (context->stripes[i].bad_csums[j] != csum[j]) { ULONG k; uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.sector_size); bool recovered = false; for (k = 0; k < c->chunk_item->num_stripes; k++) { if (i != k && c->devices[k]->devobj && context->stripes[k].bad_csums[j] == csum[j]) { log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, true, false); log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS); RtlCopyMemory(context->stripes[i].buf + (j * Vcb->superblock.sector_size), context->stripes[k].buf + (j * Vcb->superblock.sector_size), Vcb->superblock.sector_size); recovered = true; break; } } if (!recovered) { log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, false, false); log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } } } else { for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) { tree_header* th = (tree_header*)&context->stripes[i].buf[j * 
Vcb->superblock.node_size]; uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size); if (context->stripes[i].bad_csums[j] != *((uint32_t*)th->csum) || th->address != addr) { ULONG k; bool recovered = false; for (k = 0; k < c->chunk_item->num_stripes; k++) { if (i != k && c->devices[k]->devobj) { tree_header* th2 = (tree_header*)&context->stripes[k].buf[j * Vcb->superblock.node_size]; if (context->stripes[k].bad_csums[j] == *((uint32_t*)th2->csum) && th2->address == addr) { log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, true, false); log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS); RtlCopyMemory(th, th2, Vcb->superblock.node_size); recovered = true; break; } } } if (!recovered) { log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, false, false); log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } } } } } // write good data over bad for (i = 0; i < c->chunk_item->num_stripes; i++) { if (c->devices[i]->devobj && !c->devices[i]->readonly) { Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + offset - c->offset, context->stripes[i].buf, context->stripes[i].length); if (!NT_SUCCESS(Status)) { ERR("write_data_phys returned %08x\n", Status); log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS); return Status; } } } return STATUS_SUCCESS; } for (i = 0; i < c->chunk_item->num_stripes; i++) { if (c->devices[i]->devobj) { ULONG j; if (csum) { for (j = 0; j < context->stripes[i].length / Vcb->superblock.sector_size; j++) { if (context->stripes[i].bad_csums[j] != csum[j]) { uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.sector_size); log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, false, false); } } } else { for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) { tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size]; uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size); if (context->stripes[i].bad_csums[j] != *((uint32_t*)th->csum) || th->address != addr) log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, false, false); } } } } return STATUS_SUCCESS; } static NTSTATUS scrub_extent_raid0(device_extension* Vcb, chunk* c, uint64_t offset, uint32_t length, uint16_t startoffstripe, uint32_t* csum, scrub_context* context) { ULONG j; uint16_t stripe; uint32_t pos, *stripeoff; pos = 0; stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * c->chunk_item->num_stripes, ALLOC_TAG); if (!stripeoff) { ERR("out of memory\n"); return STATUS_INSUFFICIENT_RESOURCES; } RtlZeroMemory(stripeoff, sizeof(uint32_t) * c->chunk_item->num_stripes); stripe = startoffstripe; while (pos < length) { uint32_t readlen; if (pos == 0) readlen = (uint32_t)min(context->stripes[stripe].length, c->chunk_item->stripe_length - (context->stripes[stripe].start % c->chunk_item->stripe_length)); else readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length); if (csum) { for (j = 0; j < readlen; j += Vcb->superblock.sector_size) { uint32_t crc32 = ~calc_crc32c(0xffffffff, context->stripes[stripe].buf + stripeoff[stripe], Vcb->superblock.sector_size); if (crc32 != csum[pos / Vcb->superblock.sector_size]) { uint64_t addr = offset + pos; log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false); log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } pos += Vcb->superblock.sector_size; stripeoff[stripe] += Vcb->superblock.sector_size; } } else { for (j = 0; 
j < readlen; j += Vcb->superblock.node_size) { tree_header* th = (tree_header*)(context->stripes[stripe].buf + stripeoff[stripe]); uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum)); uint64_t addr = offset + pos; if (crc32 != *((uint32_t*)th->csum) || th->address != addr) { log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false); log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } pos += Vcb->superblock.node_size; stripeoff[stripe] += Vcb->superblock.node_size; } } stripe = (stripe + 1) % c->chunk_item->num_stripes; } ExFreePool(stripeoff); return STATUS_SUCCESS; } static NTSTATUS scrub_extent_raid10(device_extension* Vcb, chunk* c, uint64_t offset, uint32_t length, uint16_t startoffstripe, uint32_t* csum, scrub_context* context) { ULONG j; uint16_t stripe, sub_stripes = max(c->chunk_item->sub_stripes, 1); uint32_t pos, *stripeoff; bool csum_error = false; NTSTATUS Status; pos = 0; stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * c->chunk_item->num_stripes / sub_stripes, ALLOC_TAG); if (!stripeoff) { ERR("out of memory\n"); return STATUS_INSUFFICIENT_RESOURCES; } RtlZeroMemory(stripeoff, sizeof(uint32_t) * c->chunk_item->num_stripes / sub_stripes); stripe = startoffstripe; while (pos < length) { uint32_t readlen; if (pos == 0) readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length, c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length)); else readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length); if (csum) { ULONG good_stripe = 0xffffffff; uint16_t k; for (k = 0; k < sub_stripes; k++) { if (c->devices[(stripe * sub_stripes) + k]->devobj) { // if first stripe is okay, we only need to check that the others are identical to it if (good_stripe != 0xffffffff) { if (RtlCompareMemory(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe], context->stripes[(stripe * sub_stripes) + good_stripe].buf + stripeoff[stripe], readlen) != readlen) { context->stripes[(stripe * sub_stripes) + k].csum_error = true; csum_error = true; log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } else { for (j = 0; j < readlen; j += Vcb->superblock.sector_size) { uint32_t crc32 = ~calc_crc32c(0xffffffff, context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe] + j, Vcb->superblock.sector_size); if (crc32 != csum[(pos + j) / Vcb->superblock.sector_size]) { csum_error = true; context->stripes[(stripe * sub_stripes) + k].csum_error = true; log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS); break; } } if (!context->stripes[(stripe * sub_stripes) + k].csum_error) good_stripe = k; } } } pos += readlen; stripeoff[stripe] += readlen; } else { ULONG good_stripe = 0xffffffff; uint16_t k; for (k = 0; k < sub_stripes; k++) { if (c->devices[(stripe * sub_stripes) + k]->devobj) { // if first stripe is okay, we only need to check that the others are identical to it if (good_stripe != 0xffffffff) { if (RtlCompareMemory(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe], context->stripes[(stripe * sub_stripes) + good_stripe].buf + stripeoff[stripe], readlen) != readlen) { context->stripes[(stripe * sub_stripes) + k].csum_error = true; csum_error = true; log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } else { for (j = 0; j < readlen; j += 
Vcb->superblock.node_size) { tree_header* th = (tree_header*)(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe] + j); uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum)); uint64_t addr = offset + pos + j; if (crc32 != *((uint32_t*)th->csum) || th->address != addr) { csum_error = true; context->stripes[(stripe * sub_stripes) + k].csum_error = true; log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS); break; } } if (!context->stripes[(stripe * sub_stripes) + k].csum_error) good_stripe = k; } } } pos += readlen; stripeoff[stripe] += readlen; } stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes); } if (!csum_error) { Status = STATUS_SUCCESS; goto end; } for (j = 0; j < c->chunk_item->num_stripes; j += sub_stripes) { ULONG goodstripe = 0xffffffff; uint16_t k; bool hasbadstripe = false; if (context->stripes[j].length == 0) continue; for (k = 0; k < sub_stripes; k++) { if (c->devices[j + k]->devobj) { if (!context->stripes[j + k].csum_error) goodstripe = k; else hasbadstripe = true; } } if (hasbadstripe) { if (goodstripe != 0xffffffff) { for (k = 0; k < sub_stripes; k++) { if (c->devices[j + k]->devobj && context->stripes[j + k].csum_error) { uint32_t so = 0; bool recovered = false; pos = 0; stripe = startoffstripe; while (pos < length) { uint32_t readlen; if (pos == 0) readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length, c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length)); else readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length); if (stripe == j / sub_stripes) { if (csum) { ULONG l; for (l = 0; l < readlen; l += Vcb->superblock.sector_size) { if (RtlCompareMemory(context->stripes[j + k].buf + so, context->stripes[j + goodstripe].buf + so, Vcb->superblock.sector_size) != Vcb->superblock.sector_size) { uint64_t addr = offset + pos; log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, true, false); recovered = true; } pos += Vcb->superblock.sector_size; so += Vcb->superblock.sector_size; } } else { ULONG l; for (l = 0; l < readlen; l += Vcb->superblock.node_size) { if (RtlCompareMemory(context->stripes[j + k].buf + so, context->stripes[j + goodstripe].buf + so, Vcb->superblock.node_size) != Vcb->superblock.node_size) { uint64_t addr = offset + pos; log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, true, false); recovered = true; } pos += Vcb->superblock.node_size; so += Vcb->superblock.node_size; } } } else pos += readlen; stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes); } if (recovered) { // write good data over bad if (!c->devices[j + k]->readonly) { CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; Status = write_data_phys(c->devices[j + k]->devobj, c->devices[j + k]->fileobj, cis[j + k].offset + offset - c->offset, context->stripes[j + goodstripe].buf, context->stripes[j + goodstripe].length); if (!NT_SUCCESS(Status)) { ERR("write_data_phys returned %08x\n", Status); log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_WRITE_ERRORS); goto end; } } } } } } else { uint32_t so = 0; bool recovered = false; if (csum) { for (k = 0; k < sub_stripes; k++) { if (c->devices[j + k]->devobj) { context->stripes[j + k].bad_csums = ExAllocatePoolWithTag(PagedPool, context->stripes[j + k].length * sizeof(uint32_t) / Vcb->superblock.sector_size, ALLOC_TAG); if (!context->stripes[j + k].bad_csums) { ERR("out of memory\n"); Status 
= STATUS_INSUFFICIENT_RESOURCES; goto end; } Status = calc_csum(Vcb, context->stripes[j + k].buf, context->stripes[j + k].length / Vcb->superblock.sector_size, context->stripes[j + k].bad_csums); if (!NT_SUCCESS(Status)) { ERR("calc_csum returned %08x\n", Status); goto end; } } } } else { for (k = 0; k < sub_stripes; k++) { if (c->devices[j + k]->devobj) { ULONG l; context->stripes[j + k].bad_csums = ExAllocatePoolWithTag(PagedPool, context->stripes[j + k].length * sizeof(uint32_t) / Vcb->superblock.node_size, ALLOC_TAG); if (!context->stripes[j + k].bad_csums) { ERR("out of memory\n"); Status = STATUS_INSUFFICIENT_RESOURCES; goto end; } for (l = 0; l < context->stripes[j + k].length / Vcb->superblock.node_size; l++) { tree_header* th = (tree_header*)&context->stripes[j + k].buf[l * Vcb->superblock.node_size]; uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum)); context->stripes[j + k].bad_csums[l] = crc32; } } } } pos = 0; stripe = startoffstripe; while (pos < length) { uint32_t readlen; if (pos == 0) readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length, c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length)); else readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length); if (stripe == j / sub_stripes) { ULONG l; if (csum) { for (l = 0; l < readlen; l += Vcb->superblock.sector_size) { uint32_t crc32 = csum[pos / Vcb->superblock.sector_size]; bool has_error = false; goodstripe = 0xffffffff; for (k = 0; k < sub_stripes; k++) { if (c->devices[j + k]->devobj) { if (context->stripes[j + k].bad_csums[so / Vcb->superblock.sector_size] != crc32) has_error = true; else goodstripe = k; } } if (has_error) { if (goodstripe != 0xffffffff) { for (k = 0; k < sub_stripes; k++) { if (c->devices[j + k]->devobj && context->stripes[j + k].bad_csums[so / Vcb->superblock.sector_size] != crc32) { uint64_t addr = offset + pos; log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, true, false); recovered = true; RtlCopyMemory(context->stripes[j + k].buf + so, context->stripes[j + goodstripe].buf + so, Vcb->superblock.sector_size); } } } else { uint64_t addr = offset + pos; for (k = 0; k < sub_stripes; k++) { if (c->devices[j + k]->devobj) { log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, false, false); log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } } } pos += Vcb->superblock.sector_size; so += Vcb->superblock.sector_size; } } else { for (l = 0; l < readlen; l += Vcb->superblock.node_size) { for (k = 0; k < sub_stripes; k++) { if (c->devices[j + k]->devobj) { tree_header* th = (tree_header*)&context->stripes[j + k].buf[so]; uint64_t addr = offset + pos; if (context->stripes[j + k].bad_csums[so / Vcb->superblock.node_size] != *((uint32_t*)th->csum) || th->address != addr) { ULONG m; recovered = false; for (m = 0; m < sub_stripes; m++) { if (m != k) { tree_header* th2 = (tree_header*)&context->stripes[j + m].buf[so]; if (context->stripes[j + m].bad_csums[so / Vcb->superblock.node_size] == *((uint32_t*)th2->csum) && th2->address == addr) { log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, true, false); RtlCopyMemory(th, th2, Vcb->superblock.node_size); recovered = true; break; } else log_device_error(Vcb, c->devices[j + m], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } if (!recovered) log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, false, false); } } } pos += Vcb->superblock.node_size; so += 
Vcb->superblock.node_size; } } } else pos += readlen; stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes); } if (recovered) { // write good data over bad for (k = 0; k < sub_stripes; k++) { if (c->devices[j + k]->devobj && !c->devices[j + k]->readonly) { CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; Status = write_data_phys(c->devices[j + k]->devobj, c->devices[j + k]->fileobj, cis[j + k].offset + offset - c->offset, context->stripes[j + k].buf, context->stripes[j + k].length); if (!NT_SUCCESS(Status)) { ERR("write_data_phys returned %08x\n", Status); log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_WRITE_ERRORS); goto end; } } } } } } } Status = STATUS_SUCCESS; end: ExFreePool(stripeoff); return Status; } static NTSTATUS scrub_extent(device_extension* Vcb, chunk* c, ULONG type, uint64_t offset, uint32_t size, uint32_t* csum) { ULONG i; scrub_context context; CHUNK_ITEM_STRIPE* cis; NTSTATUS Status; uint16_t startoffstripe, num_missing, allowed_missing; TRACE("(%p, %p, %I64x, %I64x, %p)\n", Vcb, c, offset, size, csum); context.stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(scrub_context_stripe) * c->chunk_item->num_stripes, ALLOC_TAG); if (!context.stripes) { ERR("out of memory\n"); Status = STATUS_INSUFFICIENT_RESOURCES; goto end; } RtlZeroMemory(context.stripes, sizeof(scrub_context_stripe) * c->chunk_item->num_stripes); context.stripes_left = 0; cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; if (type == BLOCK_FLAG_RAID0) { uint64_t startoff, endoff; uint16_t endoffstripe; get_raid0_offset(offset - c->offset, c->chunk_item->stripe_length, c->chunk_item->num_stripes, &startoff, &startoffstripe); get_raid0_offset(offset + size - c->offset - 1, c->chunk_item->stripe_length, c->chunk_item->num_stripes, &endoff, &endoffstripe); for (i = 0; i < c->chunk_item->num_stripes; i++) { if (startoffstripe > i) context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length; else if (startoffstripe == i) context.stripes[i].start = startoff; else context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length); if (endoffstripe > i) context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length - context.stripes[i].start); else if (endoffstripe == i) context.stripes[i].length = (uint32_t)(endoff + 1 - context.stripes[i].start); else context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) - context.stripes[i].start); } allowed_missing = 0; } else if (type == BLOCK_FLAG_RAID10) { uint64_t startoff, endoff; uint16_t endoffstripe, j, sub_stripes = max(c->chunk_item->sub_stripes, 1); get_raid0_offset(offset - c->offset, c->chunk_item->stripe_length, c->chunk_item->num_stripes / sub_stripes, &startoff, &startoffstripe); get_raid0_offset(offset + size - c->offset - 1, c->chunk_item->stripe_length, c->chunk_item->num_stripes / sub_stripes, &endoff, &endoffstripe); if ((c->chunk_item->num_stripes % sub_stripes) != 0) { ERR("chunk %I64x: num_stripes %x was not a multiple of sub_stripes %x!\n", c->offset, c->chunk_item->num_stripes, sub_stripes); Status = STATUS_INTERNAL_ERROR; goto end; } startoffstripe *= sub_stripes; endoffstripe *= sub_stripes; for (i = 0; i < c->chunk_item->num_stripes; i += sub_stripes) { if (startoffstripe > i) context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length; else if (startoffstripe == i) context.stripes[i].start = startoff; else 
context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length); if (endoffstripe > i) context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length - context.stripes[i].start); else if (endoffstripe == i) context.stripes[i].length = (uint32_t)(endoff + 1 - context.stripes[i].start); else context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) - context.stripes[i].start); for (j = 1; j < sub_stripes; j++) { context.stripes[i+j].start = context.stripes[i].start; context.stripes[i+j].length = context.stripes[i].length; } } startoffstripe /= sub_stripes; allowed_missing = 1; } else allowed_missing = c->chunk_item->num_stripes - 1; num_missing = 0; for (i = 0; i < c->chunk_item->num_stripes; i++) { PIO_STACK_LOCATION IrpSp; context.stripes[i].context = (struct _scrub_context*)&context; if (type == BLOCK_FLAG_DUPLICATE) { context.stripes[i].start = offset - c->offset; context.stripes[i].length = size; } else if (type != BLOCK_FLAG_RAID0 && type != BLOCK_FLAG_RAID10) { ERR("unexpected chunk type %x\n", type); Status = STATUS_INTERNAL_ERROR; goto end; } if (!c->devices[i]->devobj) { num_missing++; if (num_missing > allowed_missing) { ERR("too many missing devices (at least %u, maximum allowed %u)\n", num_missing, allowed_missing); Status = STATUS_INTERNAL_ERROR; goto end; } } else if (context.stripes[i].length > 0) { context.stripes[i].buf = ExAllocatePoolWithTag(NonPagedPool, context.stripes[i].length, ALLOC_TAG); if (!context.stripes[i].buf) { ERR("out of memory\n"); Status = STATUS_INSUFFICIENT_RESOURCES; goto end; } context.stripes[i].Irp = IoAllocateIrp(c->devices[i]->devobj->StackSize, false); if (!context.stripes[i].Irp) { ERR("IoAllocateIrp failed\n"); Status = STATUS_INSUFFICIENT_RESOURCES; goto end; } IrpSp = IoGetNextIrpStackLocation(context.stripes[i].Irp); IrpSp->MajorFunction = IRP_MJ_READ; IrpSp->FileObject = c->devices[i]->fileobj; if (c->devices[i]->devobj->Flags & DO_BUFFERED_IO) { context.stripes[i].Irp->AssociatedIrp.SystemBuffer = ExAllocatePoolWithTag(NonPagedPool, context.stripes[i].length, ALLOC_TAG); if (!context.stripes[i].Irp->AssociatedIrp.SystemBuffer) { ERR("out of memory\n"); Status = STATUS_INSUFFICIENT_RESOURCES; goto end; } context.stripes[i].Irp->Flags |= IRP_BUFFERED_IO | IRP_DEALLOCATE_BUFFER | IRP_INPUT_OPERATION; context.stripes[i].Irp->UserBuffer = context.stripes[i].buf; } else if (c->devices[i]->devobj->Flags & DO_DIRECT_IO) { context.stripes[i].Irp->MdlAddress = IoAllocateMdl(context.stripes[i].buf, context.stripes[i].length, false, false, NULL); if (!context.stripes[i].Irp->MdlAddress) { ERR("IoAllocateMdl failed\n"); Status = STATUS_INSUFFICIENT_RESOURCES; goto end; } Status = STATUS_SUCCESS; _SEH2_TRY { MmProbeAndLockPages(context.stripes[i].Irp->MdlAddress, KernelMode, IoWriteAccess); } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) { Status = _SEH2_GetExceptionCode(); } _SEH2_END; if (!NT_SUCCESS(Status)) { ERR("MmProbeAndLockPages threw exception %08x\n", Status); IoFreeMdl(context.stripes[i].Irp->MdlAddress); context.stripes[i].Irp->MdlAddress = NULL; goto end; } } else context.stripes[i].Irp->UserBuffer = context.stripes[i].buf; IrpSp->Parameters.Read.Length = context.stripes[i].length; IrpSp->Parameters.Read.ByteOffset.QuadPart = context.stripes[i].start + cis[i].offset; context.stripes[i].Irp->UserIosb = &context.stripes[i].iosb; IoSetCompletionRoutine(context.stripes[i].Irp, scrub_read_completion, &context.stripes[i], true, true, true); 
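// scrub_read_completion decrements stripes_left for each finished read and sets
// context.Event once the last outstanding stripe IRP completes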
context.stripes_left++; Vcb->scrub.data_scrubbed += context.stripes[i].length; } } if (context.stripes_left == 0) { ERR("error - not reading any stripes\n"); Status = STATUS_INTERNAL_ERROR; goto end; } KeInitializeEvent(&context.Event, NotificationEvent, false); for (i = 0; i < c->chunk_item->num_stripes; i++) { if (c->devices[i]->devobj && context.stripes[i].length > 0) IoCallDriver(c->devices[i]->devobj, context.stripes[i].Irp); } KeWaitForSingleObject(&context.Event, Executive, KernelMode, false, NULL); // return an error if any of the stripes returned an error for (i = 0; i < c->chunk_item->num_stripes; i++) { if (!NT_SUCCESS(context.stripes[i].iosb.Status)) { Status = context.stripes[i].iosb.Status; log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_READ_ERRORS); goto end; } } if (type == BLOCK_FLAG_DUPLICATE) { Status = scrub_extent_dup(Vcb, c, offset, csum, &context); if (!NT_SUCCESS(Status)) { ERR("scrub_extent_dup returned %08x\n", Status); goto end; } } else if (type == BLOCK_FLAG_RAID0) { Status = scrub_extent_raid0(Vcb, c, offset, size, startoffstripe, csum, &context); if (!NT_SUCCESS(Status)) { ERR("scrub_extent_raid0 returned %08x\n", Status); goto end; } } else if (type == BLOCK_FLAG_RAID10) { Status = scrub_extent_raid10(Vcb, c, offset, size, startoffstripe, csum, &context); if (!NT_SUCCESS(Status)) { ERR("scrub_extent_raid10 returned %08x\n", Status); goto end; } } end: if (context.stripes) { for (i = 0; i < c->chunk_item->num_stripes; i++) { if (context.stripes[i].Irp) { if (c->devices[i]->devobj->Flags & DO_DIRECT_IO && context.stripes[i].Irp->MdlAddress) { MmUnlockPages(context.stripes[i].Irp->MdlAddress); IoFreeMdl(context.stripes[i].Irp->MdlAddress); } IoFreeIrp(context.stripes[i].Irp); } if (context.stripes[i].buf) ExFreePool(context.stripes[i].buf); if (context.stripes[i].bad_csums) ExFreePool(context.stripes[i].bad_csums); } ExFreePool(context.stripes); } return Status; } static NTSTATUS scrub_data_extent(device_extension* Vcb, chunk* c, uint64_t offset, ULONG type, uint32_t* csum, RTL_BITMAP* bmp, ULONG bmplen) { NTSTATUS Status; ULONG runlength, index; runlength = RtlFindFirstRunClear(bmp, &index); while (runlength != 0) { if (index >= bmplen) break; if (index + runlength >= bmplen) { runlength = bmplen - index; if (runlength == 0) break; } do { ULONG rl; if (runlength * Vcb->superblock.sector_size > SCRUB_UNIT) rl = SCRUB_UNIT / Vcb->superblock.sector_size; else rl = runlength; Status = scrub_extent(Vcb, c, type, offset + UInt32x32To64(index, Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, &csum[index]); if (!NT_SUCCESS(Status)) { ERR("scrub_data_extent_dup returned %08x\n", Status); return Status; } runlength -= rl; index += rl; } while (runlength > 0); runlength = RtlFindNextForwardRunClear(bmp, index, &index); } return STATUS_SUCCESS; } typedef struct { uint8_t* buf; PIRP Irp; void* context; IO_STATUS_BLOCK iosb; uint64_t offset; bool rewrite, missing; RTL_BITMAP error; ULONG* errorarr; } scrub_context_raid56_stripe; typedef struct { scrub_context_raid56_stripe* stripes; LONG stripes_left; KEVENT Event; RTL_BITMAP alloc; RTL_BITMAP has_csum; RTL_BITMAP is_tree; uint32_t* csum; uint8_t* parity_scratch; uint8_t* parity_scratch2; } scrub_context_raid56; _Function_class_(IO_COMPLETION_ROUTINE) static NTSTATUS __stdcall scrub_read_completion_raid56(PDEVICE_OBJECT DeviceObject, PIRP Irp, PVOID conptr) { scrub_context_raid56_stripe* stripe = conptr; scrub_context_raid56* context = (scrub_context_raid56*)stripe->context; LONG left = 
InterlockedDecrement(&context->stripes_left); UNUSED(DeviceObject); stripe->iosb = Irp->IoStatus; if (left == 0) KeSetEvent(&context->Event, 0, false); return STATUS_MORE_PROCESSING_REQUIRED; } static void scrub_raid5_stripe(device_extension* Vcb, chunk* c, scrub_context_raid56* context, uint64_t stripe_start, uint64_t bit_start, uint64_t num, uint16_t missing_devices) { ULONG sectors_per_stripe = (ULONG)(c->chunk_item->stripe_length / Vcb->superblock.sector_size), i, off; uint16_t stripe, parity = (bit_start + num + c->chunk_item->num_stripes - 1) % c->chunk_item->num_stripes; uint64_t stripeoff; stripe = (parity + 1) % c->chunk_item->num_stripes; off = (ULONG)(bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1); stripeoff = num * sectors_per_stripe; if (missing_devices == 0) RtlCopyMemory(context->parity_scratch, &context->stripes[parity].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length); while (stripe != parity) { RtlClearAllBits(&context->stripes[stripe].error); for (i = 0; i < sectors_per_stripe; i++) { if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) { if (RtlCheckBit(&context->is_tree, off)) { tree_header* th = (tree_header*)&context->stripes[stripe].buf[stripeoff * Vcb->superblock.sector_size]; uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size); uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum)); if (crc32 != *((uint32_t*)th->csum) || th->address != addr) { RtlSetBits(&context->stripes[stripe].error, i, Vcb->superblock.node_size / Vcb->superblock.sector_size); log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS); if (missing_devices > 0) log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false); } off += Vcb->superblock.node_size / Vcb->superblock.sector_size; stripeoff += Vcb->superblock.node_size / Vcb->superblock.sector_size; i += (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1; continue; } else if (RtlCheckBit(&context->has_csum, off)) { uint32_t crc32 = ~calc_crc32c(0xffffffff, context->stripes[stripe].buf + (stripeoff * Vcb->superblock.sector_size), Vcb->superblock.sector_size); if (crc32 != context->csum[off]) { RtlSetBit(&context->stripes[stripe].error, i); log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS); if (missing_devices > 0) { uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size); log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false); } } } } off++; stripeoff++; } if (missing_devices == 0) do_xor(context->parity_scratch, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length); stripe = (stripe + 1) % c->chunk_item->num_stripes; stripeoff = num * sectors_per_stripe; } // check parity if (missing_devices == 0) { RtlClearAllBits(&context->stripes[parity].error); for (i = 0; i < sectors_per_stripe; i++) { ULONG o, j; o = i * Vcb->superblock.sector_size; for (j = 0; j < Vcb->superblock.sector_size; j++) { // FIXME - use SSE if (context->parity_scratch[o] != 0) { RtlSetBit(&context->stripes[parity].error, i); break; } o++; } } } // log and fix errors if (missing_devices > 0) return; for (i = 0; i < sectors_per_stripe; i++) { ULONG num_errors = 0, bad_off; uint64_t bad_stripe; bool alloc = 
false; stripe = (parity + 1) % c->chunk_item->num_stripes; off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i; while (stripe != parity) { if (RtlCheckBit(&context->alloc, off)) { alloc = true; if (RtlCheckBit(&context->stripes[stripe].error, i)) { bad_stripe = stripe; bad_off = off; num_errors++; } } off += sectors_per_stripe; stripe = (stripe + 1) % c->chunk_item->num_stripes; } if (!alloc) continue; if (num_errors == 0 && !RtlCheckBit(&context->stripes[parity].error, i)) // everything fine continue; if (num_errors == 0 && RtlCheckBit(&context->stripes[parity].error, i)) { // parity error uint64_t addr; do_xor(&context->stripes[parity].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size); bad_off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i; addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (bad_off * Vcb->superblock.sector_size); context->stripes[parity].rewrite = true; log_error(Vcb, addr, c->devices[parity]->devitem.dev_id, false, true, true); log_device_error(Vcb, c->devices[parity], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } else if (num_errors == 1) { uint32_t crc32; uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (bad_off * Vcb->superblock.sector_size); if (RtlCheckBit(&context->is_tree, bad_off)) { tree_header* th; do_xor(&context->parity_scratch[i * Vcb->superblock.sector_size], &context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.node_size); th = (tree_header*)&context->parity_scratch[i * Vcb->superblock.sector_size]; crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum)); if (crc32 == *((uint32_t*)th->csum) && th->address == addr) { RtlCopyMemory(&context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.node_size); context->stripes[bad_stripe].rewrite = true; RtlClearBits(&context->stripes[bad_stripe].error, i + 1, (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1); log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, true, true, false); } else log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, true, false, false); } else { do_xor(&context->parity_scratch[i * Vcb->superblock.sector_size], &context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.sector_size); crc32 = ~calc_crc32c(0xffffffff, &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size); if (crc32 == context->csum[bad_off]) { RtlCopyMemory(&context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size); context->stripes[bad_stripe].rewrite = true; log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, false, true, false); } else log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, false, false, false); } } else { stripe = (parity + 1) % c->chunk_item->num_stripes; off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i; 
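// two or more damaged sectors in the same parity position: single parity cannot
// reconstruct them, so walk the data stripes again and log every allocated bad
// sector as unrecoverable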
while (stripe != parity) { if (RtlCheckBit(&context->alloc, off)) { if (RtlCheckBit(&context->stripes[stripe].error, i)) { uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size); log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, RtlCheckBit(&context->is_tree, off), false, false); } } off += sectors_per_stripe; stripe = (stripe + 1) % c->chunk_item->num_stripes; } } } } static void scrub_raid6_stripe(device_extension* Vcb, chunk* c, scrub_context_raid56* context, uint64_t stripe_start, uint64_t bit_start, uint64_t num, uint16_t missing_devices) { ULONG sectors_per_stripe = (ULONG)(c->chunk_item->stripe_length / Vcb->superblock.sector_size), i, off; uint16_t stripe, parity1 = (bit_start + num + c->chunk_item->num_stripes - 2) % c->chunk_item->num_stripes; uint16_t parity2 = (parity1 + 1) % c->chunk_item->num_stripes; uint64_t stripeoff; stripe = (parity1 + 2) % c->chunk_item->num_stripes; off = (ULONG)(bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2); stripeoff = num * sectors_per_stripe; if (c->devices[parity1]->devobj) RtlCopyMemory(context->parity_scratch, &context->stripes[parity1].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length); if (c->devices[parity2]->devobj) RtlZeroMemory(context->parity_scratch2, (ULONG)c->chunk_item->stripe_length); while (stripe != parity1) { RtlClearAllBits(&context->stripes[stripe].error); for (i = 0; i < sectors_per_stripe; i++) { if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) { if (RtlCheckBit(&context->is_tree, off)) { tree_header* th = (tree_header*)&context->stripes[stripe].buf[stripeoff * Vcb->superblock.sector_size]; uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size); uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum)); if (crc32 != *((uint32_t*)th->csum) || th->address != addr) { RtlSetBits(&context->stripes[stripe].error, i, Vcb->superblock.node_size / Vcb->superblock.sector_size); log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS); if (missing_devices == 2) log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false); } off += Vcb->superblock.node_size / Vcb->superblock.sector_size; stripeoff += Vcb->superblock.node_size / Vcb->superblock.sector_size; i += (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1; continue; } else if (RtlCheckBit(&context->has_csum, off)) { uint32_t crc32 = ~calc_crc32c(0xffffffff, context->stripes[stripe].buf + (stripeoff * Vcb->superblock.sector_size), Vcb->superblock.sector_size); if (crc32 != context->csum[off]) { uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size); RtlSetBit(&context->stripes[stripe].error, i); log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS); if (missing_devices == 2) log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false); } } } off++; stripeoff++; } if (c->devices[parity1]->devobj) do_xor(context->parity_scratch, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (uint32_t)c->chunk_item->stripe_length); stripe = (stripe + 1) % c->chunk_item->num_stripes; stripeoff = num * sectors_per_stripe; } RtlClearAllBits(&context->stripes[parity1].error); if 
(missing_devices == 0 || (missing_devices == 1 && !c->devices[parity2]->devobj)) { // check parity 1 for (i = 0; i < sectors_per_stripe; i++) { ULONG o, j; o = i * Vcb->superblock.sector_size; for (j = 0; j < Vcb->superblock.sector_size; j++) { // FIXME - use SSE if (context->parity_scratch[o] != 0) { RtlSetBit(&context->stripes[parity1].error, i); break; } o++; } } } RtlClearAllBits(&context->stripes[parity2].error); if (missing_devices == 0 || (missing_devices == 1 && !c->devices[parity1]->devobj)) { // check parity 2 stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1); while (stripe != parity2) { galois_double(context->parity_scratch2, (uint32_t)c->chunk_item->stripe_length); do_xor(context->parity_scratch2, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (uint32_t)c->chunk_item->stripe_length); stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1); } for (i = 0; i < sectors_per_stripe; i++) { if (RtlCompareMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->parity_scratch2[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size) != Vcb->superblock.sector_size) RtlSetBit(&context->stripes[parity2].error, i); } } if (missing_devices == 2) return; // log and fix errors for (i = 0; i < sectors_per_stripe; i++) { ULONG num_errors = 0; uint64_t bad_stripe1, bad_stripe2; ULONG bad_off1, bad_off2; bool alloc = false; stripe = (parity1 + 2) % c->chunk_item->num_stripes; off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i; while (stripe != parity1) { if (RtlCheckBit(&context->alloc, off)) { alloc = true; if (!c->devices[stripe]->devobj || RtlCheckBit(&context->stripes[stripe].error, i)) { if (num_errors == 0) { bad_stripe1 = stripe; bad_off1 = off; } else if (num_errors == 1) { bad_stripe2 = stripe; bad_off2 = off; } num_errors++; } } off += sectors_per_stripe; stripe = (stripe + 1) % c->chunk_item->num_stripes; } if (!alloc) continue; if (num_errors == 0 && !RtlCheckBit(&context->stripes[parity1].error, i) && !RtlCheckBit(&context->stripes[parity2].error, i)) // everything fine continue; if (num_errors == 0) { // parity error uint64_t addr; if (RtlCheckBit(&context->stripes[parity1].error, i)) { do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size); bad_off1 = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i; addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 * Vcb->superblock.sector_size); context->stripes[parity1].rewrite = true; log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true); log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } if (RtlCheckBit(&context->stripes[parity2].error, i)) { RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->parity_scratch2[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size); bad_off1 = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i; addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 * Vcb->superblock.sector_size); context->stripes[parity2].rewrite = true; 
log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true); log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } else if (num_errors == 1) { uint32_t crc32a, crc32b, len; uint16_t stripe_num, bad_stripe_num; uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 * Vcb->superblock.sector_size); uint8_t* scratch; len = RtlCheckBit(&context->is_tree, bad_off1)? Vcb->superblock.node_size : Vcb->superblock.sector_size; scratch = ExAllocatePoolWithTag(PagedPool, len, ALLOC_TAG); if (!scratch) { ERR("out of memory\n"); return; } RtlZeroMemory(scratch, len); do_xor(&context->parity_scratch[i * Vcb->superblock.sector_size], &context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len); stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1); if (c->devices[parity2]->devobj) { stripe_num = c->chunk_item->num_stripes - 3; while (stripe != parity2) { galois_double(scratch, len); if (stripe != bad_stripe1) do_xor(scratch, &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len); else bad_stripe_num = stripe_num; stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1); stripe_num--; } do_xor(scratch, &context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len); if (bad_stripe_num != 0) galois_divpower(scratch, (uint8_t)bad_stripe_num, len); } if (RtlCheckBit(&context->is_tree, bad_off1)) { tree_header *th1 = NULL, *th2 = NULL; if (c->devices[parity1]->devobj) { th1 = (tree_header*)&context->parity_scratch[i * Vcb->superblock.sector_size]; crc32a = ~calc_crc32c(0xffffffff, (uint8_t*)&th1->fs_uuid, Vcb->superblock.node_size - sizeof(th1->csum)); } if (c->devices[parity2]->devobj) { th2 = (tree_header*)scratch; crc32b = ~calc_crc32c(0xffffffff, (uint8_t*)&th2->fs_uuid, Vcb->superblock.node_size - sizeof(th2->csum)); } if ((c->devices[parity1]->devobj && crc32a == *((uint32_t*)th1->csum) && th1->address == addr) || (c->devices[parity2]->devobj && crc32b == *((uint32_t*)th2->csum) && th2->address == addr)) { if (!c->devices[parity1]->devobj || crc32a != *((uint32_t*)th1->csum) || th1->address != addr) { RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], scratch, Vcb->superblock.node_size); if (c->devices[parity1]->devobj) { // fix parity 1 stripe = (parity1 + 2) % c->chunk_item->num_stripes; RtlCopyMemory(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.node_size); stripe = (stripe + 1) % c->chunk_item->num_stripes; while (stripe != parity1) { do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.node_size); stripe = (stripe + 1) % c->chunk_item->num_stripes; } context->stripes[parity1].rewrite = true; log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true); log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } else { RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], 
&context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.node_size); if (!c->devices[parity2]->devobj || crc32b != *((uint32_t*)th2->csum) || th2->address != addr) { // fix parity 2 stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1); if (c->devices[parity2]->devobj) { RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.node_size); stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1); while (stripe != parity2) { galois_double(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.node_size); do_xor(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.node_size); stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1); } context->stripes[parity2].rewrite = true; log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true); log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } } context->stripes[bad_stripe1].rewrite = true; RtlClearBits(&context->stripes[bad_stripe1].error, i + 1, (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1); log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, true, false); } else log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, false, false); } else { if (c->devices[parity1]->devobj) crc32a = ~calc_crc32c(0xffffffff, &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size); if (c->devices[parity2]->devobj) crc32b = ~calc_crc32c(0xffffffff, scratch, Vcb->superblock.sector_size); if ((c->devices[parity1]->devobj && crc32a == context->csum[bad_off1]) || (c->devices[parity2]->devobj && crc32b == context->csum[bad_off1])) { if (c->devices[parity2]->devobj && crc32b == context->csum[bad_off1]) { RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], scratch, Vcb->superblock.sector_size); if (c->devices[parity1]->devobj && crc32a != context->csum[bad_off1]) { // fix parity 1 stripe = (parity1 + 2) % c->chunk_item->num_stripes; RtlCopyMemory(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.sector_size); stripe = (stripe + 1) % c->chunk_item->num_stripes; while (stripe != parity1) { do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.sector_size); stripe = (stripe + 1) % c->chunk_item->num_stripes; } context->stripes[parity1].rewrite = true; log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true); log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } else { RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size); if 
(c->devices[parity2]->devobj && crc32b != context->csum[bad_off1]) { // fix parity 2 stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1); RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.sector_size); stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1); while (stripe != parity2) { galois_double(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.sector_size); do_xor(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.sector_size); stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1); } context->stripes[parity2].rewrite = true; log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true); log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS); } } context->stripes[bad_stripe1].rewrite = true; log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, true, false); } else log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, false, false); } ExFreePool(scratch); } else if (num_errors == 2 && missing_devices == 0) { uint16_t x, y, k; uint64_t addr; uint32_t len = (RtlCheckBit(&context->is_tree, bad_off1) || RtlCheckBit(&context->is_tree, bad_off2)) ? Vcb->superblock.node_size : Vcb->superblock.sector_size; uint8_t gyx, gx, denom, a, b, *p, *q, *pxy, *qxy; uint32_t j; stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1); // put qxy in parity_scratch // put pxy in parity_scratch2 k = c->chunk_item->num_stripes - 3; if (stripe == bad_stripe1 || stripe == bad_stripe2) { RtlZeroMemory(&context->parity_scratch[i * Vcb->superblock.sector_size], len); RtlZeroMemory(&context->parity_scratch2[i * Vcb->superblock.sector_size], len); if (stripe == bad_stripe1) x = k; else y = k; } else { RtlCopyMemory(&context->parity_scratch[i * Vcb->superblock.sector_size], &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len); RtlCopyMemory(&context->parity_scratch2[i * Vcb->superblock.sector_size], &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len); } stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1); k--; do { galois_double(&context->parity_scratch[i * Vcb->superblock.sector_size], len); if (stripe != bad_stripe1 && stripe != bad_stripe2) { do_xor(&context->parity_scratch[i * Vcb->superblock.sector_size], &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len); do_xor(&context->parity_scratch2[i * Vcb->superblock.sector_size], &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len); } else if (stripe == bad_stripe1) x = k; else if (stripe == bad_stripe2) y = k; stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1); k--; } while (stripe != parity2); gyx = gpow2(y > x ? 
(y-x) : (255-x+y)); gx = gpow2(255-x); denom = gdiv(1, gyx ^ 1); a = gmul(gyx, denom); b = gmul(gx, denom); p = &context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)]; q = &context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)]; pxy = &context->parity_scratch2[i * Vcb->superblock.sector_size]; qxy = &context->parity_scratch[i * Vcb->superblock.sector_size]; for (j = 0; j < len; j++) { *qxy = gmul(a, *p ^ *pxy) ^ gmul(b, *q ^ *qxy); p++; q++; pxy++; qxy++; } do_xor(&context->parity_scratch2[i * Vcb->superblock.sector_size], &context->parity_scratch[i * Vcb->superblock.sector_size], len); do_xor(&context->parity_scratch2[i * Vcb->superblock.sector_size], &context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len); addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 * Vcb->superblock.sector_size); if (RtlCheckBit(&context->is_tree, bad_off1)) { tree_header* th = (tree_header*)&context->parity_scratch[i * Vcb->superblock.sector_size]; uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum)); if (crc32 == *((uint32_t*)th->csum) && th->address == addr) { RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.node_size); context->stripes[bad_stripe1].rewrite = true; RtlClearBits(&context->stripes[bad_stripe1].error, i + 1, (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1); log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, true, false); } else log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, false, false); } else { uint32_t crc32 = ~calc_crc32c(0xffffffff, &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size); if (crc32 == context->csum[bad_off1]) { RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size); context->stripes[bad_stripe1].rewrite = true; log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, true, false); } else log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, false, false); } addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off2 * Vcb->superblock.sector_size); if (RtlCheckBit(&context->is_tree, bad_off2)) { tree_header* th = (tree_header*)&context->parity_scratch2[i * Vcb->superblock.sector_size]; uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum)); if (crc32 == *((uint32_t*)th->csum) && th->address == addr) { RtlCopyMemory(&context->stripes[bad_stripe2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->parity_scratch2[i * Vcb->superblock.sector_size], Vcb->superblock.node_size); context->stripes[bad_stripe2].rewrite = true; RtlClearBits(&context->stripes[bad_stripe2].error, i + 1, (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1); log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, true, true, false); } else log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, true, false, false); } else { uint32_t crc32 = 
~calc_crc32c(0xffffffff, &context->parity_scratch2[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size); if (crc32 == context->csum[bad_off2]) { RtlCopyMemory(&context->stripes[bad_stripe2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], &context->parity_scratch2[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size); context->stripes[bad_stripe2].rewrite = true; log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, false, true, false); } else log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, false, false, false); } } else { stripe = (parity2 + 1) % c->chunk_item->num_stripes; off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i; while (stripe != parity1) { if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) { if (RtlCheckBit(&context->stripes[stripe].error, i)) { uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size); log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, RtlCheckBit(&context->is_tree, off), false, false); } } off += sectors_per_stripe; stripe = (stripe + 1) % c->chunk_item->num_stripes; } } } } static NTSTATUS scrub_chunk_raid56_stripe_run(device_extension* Vcb, chunk* c, uint64_t stripe_start, uint64_t stripe_end) { NTSTATUS Status; KEY searchkey; traverse_ptr tp; bool b; uint64_t run_start, run_end, full_stripe_len, stripe; uint32_t max_read, num_sectors; ULONG arrlen, *allocarr, *csumarr = NULL, *treearr, num_parity_stripes = c->chunk_item->type & BLOCK_FLAG_RAID6 ? 2 : 1; scrub_context_raid56 context; uint16_t i; CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; TRACE("(%p, %p, %I64x, %I64x)\n", Vcb, c, stripe_start, stripe_end); full_stripe_len = (c->chunk_item->num_stripes - num_parity_stripes) * c->chunk_item->stripe_length; run_start = c->offset + (stripe_start * full_stripe_len); run_end = c->offset + ((stripe_end + 1) * full_stripe_len); searchkey.obj_id = run_start; searchkey.obj_type = TYPE_METADATA_ITEM; searchkey.offset = 0xffffffffffffffff; Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); if (!NT_SUCCESS(Status)) { ERR("find_item returned %08x\n", Status); return Status; } num_sectors = (uint32_t)((stripe_end - stripe_start + 1) * full_stripe_len / Vcb->superblock.sector_size); arrlen = (ULONG)sector_align((num_sectors / 8) + 1, sizeof(ULONG)); allocarr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG); if (!allocarr) { ERR("out of memory\n"); return STATUS_INSUFFICIENT_RESOURCES; } treearr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG); if (!treearr) { ERR("out of memory\n"); ExFreePool(allocarr); return STATUS_INSUFFICIENT_RESOURCES; } RtlInitializeBitMap(&context.alloc, allocarr, num_sectors); RtlClearAllBits(&context.alloc); RtlInitializeBitMap(&context.is_tree, treearr, num_sectors); RtlClearAllBits(&context.is_tree); context.parity_scratch = ExAllocatePoolWithTag(PagedPool, (ULONG)c->chunk_item->stripe_length, ALLOC_TAG); if (!context.parity_scratch) { ERR("out of memory\n"); ExFreePool(allocarr); ExFreePool(treearr); return STATUS_INSUFFICIENT_RESOURCES; } if (c->chunk_item->type & BLOCK_FLAG_DATA) { csumarr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG); if (!csumarr) { ERR("out of memory\n"); ExFreePool(allocarr); ExFreePool(treearr); ExFreePool(context.parity_scratch); return STATUS_INSUFFICIENT_RESOURCES; } RtlInitializeBitMap(&context.has_csum, csumarr, 
num_sectors); RtlClearAllBits(&context.has_csum); context.csum = ExAllocatePoolWithTag(PagedPool, num_sectors * sizeof(uint32_t), ALLOC_TAG); if (!context.csum) { ERR("out of memory\n"); ExFreePool(allocarr); ExFreePool(treearr); ExFreePool(context.parity_scratch); ExFreePool(csumarr); return STATUS_INSUFFICIENT_RESOURCES; } } if (c->chunk_item->type & BLOCK_FLAG_RAID6) { context.parity_scratch2 = ExAllocatePoolWithTag(PagedPool, (ULONG)c->chunk_item->stripe_length, ALLOC_TAG); if (!context.parity_scratch2) { ERR("out of memory\n"); ExFreePool(allocarr); ExFreePool(treearr); ExFreePool(context.parity_scratch); if (c->chunk_item->type & BLOCK_FLAG_DATA) { ExFreePool(csumarr); ExFreePool(context.csum); } return STATUS_INSUFFICIENT_RESOURCES; } } do { traverse_ptr next_tp; if (tp.item->key.obj_id >= run_end) break; if (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM) { uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ? Vcb->superblock.node_size : tp.item->key.offset; if (tp.item->key.obj_id + size > run_start) { uint64_t extent_start = max(run_start, tp.item->key.obj_id); uint64_t extent_end = min(tp.item->key.obj_id + size, run_end); bool extent_is_tree = false; RtlSetBits(&context.alloc, (ULONG)((extent_start - run_start) / Vcb->superblock.sector_size), (ULONG)((extent_end - extent_start) / Vcb->superblock.sector_size)); if (tp.item->key.obj_type == TYPE_METADATA_ITEM) extent_is_tree = true; else { EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data; if (tp.item->size < sizeof(EXTENT_ITEM)) { ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM)); Status = STATUS_INTERNAL_ERROR; goto end; } if (ei->flags & EXTENT_ITEM_TREE_BLOCK) extent_is_tree = true; } if (extent_is_tree) RtlSetBits(&context.is_tree, (ULONG)((extent_start - run_start) / Vcb->superblock.sector_size), (ULONG)((extent_end - extent_start) / Vcb->superblock.sector_size)); else if (c->chunk_item->type & BLOCK_FLAG_DATA) { traverse_ptr tp2; bool b2; searchkey.obj_id = EXTENT_CSUM_ID; searchkey.obj_type = TYPE_EXTENT_CSUM; searchkey.offset = extent_start; Status = find_item(Vcb, Vcb->checksum_root, &tp2, &searchkey, false, NULL); if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) { ERR("find_item returned %08x\n", Status); goto end; } do { traverse_ptr next_tp2; if (tp2.item->key.offset >= extent_end) break; if (tp2.item->key.offset >= extent_start) { uint64_t csum_start = max(extent_start, tp2.item->key.offset); uint64_t csum_end = min(extent_end, tp2.item->key.offset + (tp2.item->size * Vcb->superblock.sector_size / sizeof(uint32_t))); RtlSetBits(&context.has_csum, (ULONG)((csum_start - run_start) / Vcb->superblock.sector_size), (ULONG)((csum_end - csum_start) / Vcb->superblock.sector_size)); RtlCopyMemory(&context.csum[(csum_start - run_start) / Vcb->superblock.sector_size], tp2.item->data + ((csum_start - tp2.item->key.offset) * sizeof(uint32_t) / Vcb->superblock.sector_size), (ULONG)((csum_end - csum_start) * sizeof(uint32_t) / Vcb->superblock.sector_size)); } b2 = find_next_item(Vcb, &tp2, &next_tp2, false, NULL); if (b2) tp2 = next_tp2; } while (b2); } } } b = find_next_item(Vcb, &tp, &next_tp, false, NULL); if (b) tp = next_tp; } while (b); context.stripes = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_context_raid56_stripe) * c->chunk_item->num_stripes, ALLOC_TAG); if (!context.stripes) { ERR("out of memory\n"); Status = STATUS_INSUFFICIENT_RESOURCES; goto end; } 
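/* Each device stripe now gets its own read buffer and a per-sector error bitmap.
 * The stripe run is then processed in chunks of max_read full stripes, i.e. at most
 * 1 MB read from each device per pass, which is verified and repaired before the
 * next chunk is read. */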
max_read = (uint32_t)min(1048576 / c->chunk_item->stripe_length, stripe_end - stripe_start + 1); // only process 1 MB of data at a time for (i = 0; i < c->chunk_item->num_stripes; i++) { context.stripes[i].buf = ExAllocatePoolWithTag(PagedPool, (ULONG)(max_read * c->chunk_item->stripe_length), ALLOC_TAG); if (!context.stripes[i].buf) { uint64_t j; ERR("out of memory\n"); for (j = 0; j < i; j++) { ExFreePool(context.stripes[j].buf); } ExFreePool(context.stripes); Status = STATUS_INSUFFICIENT_RESOURCES; goto end; } context.stripes[i].errorarr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(((c->chunk_item->stripe_length / Vcb->superblock.sector_size) / 8) + 1, sizeof(ULONG)), ALLOC_TAG); if (!context.stripes[i].errorarr) { uint64_t j; ERR("out of memory\n"); ExFreePool(context.stripes[i].buf); for (j = 0; j < i; j++) { ExFreePool(context.stripes[j].buf); } ExFreePool(context.stripes); Status = STATUS_INSUFFICIENT_RESOURCES; goto end; } RtlInitializeBitMap(&context.stripes[i].error, context.stripes[i].errorarr, (ULONG)(c->chunk_item->stripe_length / Vcb->superblock.sector_size)); context.stripes[i].context = &context; context.stripes[i].rewrite = false; } stripe = stripe_start; Status = STATUS_SUCCESS; chunk_lock_range(Vcb, c, run_start, run_end - run_start); do { ULONG read_stripes; uint16_t missing_devices = 0; bool need_wait = false; if (max_read < stripe_end + 1 - stripe) read_stripes = max_read; else read_stripes = (ULONG)(stripe_end + 1 - stripe); context.stripes_left = c->chunk_item->num_stripes; // read megabyte by megabyte for (i = 0; i < c->chunk_item->num_stripes; i++) { if (c->devices[i]->devobj) { PIO_STACK_LOCATION IrpSp; context.stripes[i].Irp = IoAllocateIrp(c->devices[i]->devobj->StackSize, false); if (!context.stripes[i].Irp) { ERR("IoAllocateIrp failed\n"); Status = STATUS_INSUFFICIENT_RESOURCES; goto end3; } context.stripes[i].Irp->MdlAddress = NULL; IrpSp = IoGetNextIrpStackLocation(context.stripes[i].Irp); IrpSp->MajorFunction = IRP_MJ_READ; IrpSp->FileObject = c->devices[i]->fileobj; if (c->devices[i]->devobj->Flags & DO_BUFFERED_IO) { context.stripes[i].Irp->AssociatedIrp.SystemBuffer = ExAllocatePoolWithTag(NonPagedPool, (ULONG)(read_stripes * c->chunk_item->stripe_length), ALLOC_TAG); if (!context.stripes[i].Irp->AssociatedIrp.SystemBuffer) { ERR("out of memory\n"); Status = STATUS_INSUFFICIENT_RESOURCES; goto end3; } context.stripes[i].Irp->Flags |= IRP_BUFFERED_IO | IRP_DEALLOCATE_BUFFER | IRP_INPUT_OPERATION; context.stripes[i].Irp->UserBuffer = context.stripes[i].buf; } else if (c->devices[i]->devobj->Flags & DO_DIRECT_IO) { context.stripes[i].Irp->MdlAddress = IoAllocateMdl(context.stripes[i].buf, (ULONG)(read_stripes * c->chunk_item->stripe_length), false, false, NULL); if (!context.stripes[i].Irp->MdlAddress) { ERR("IoAllocateMdl failed\n"); Status = STATUS_INSUFFICIENT_RESOURCES; goto end3; } Status = STATUS_SUCCESS; _SEH2_TRY { MmProbeAndLockPages(context.stripes[i].Irp->MdlAddress, KernelMode, IoWriteAccess); } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) { Status = _SEH2_GetExceptionCode(); } _SEH2_END; if (!NT_SUCCESS(Status)) { ERR("MmProbeAndLockPages threw exception %08x\n", Status); IoFreeMdl(context.stripes[i].Irp->MdlAddress); goto end3; } } else context.stripes[i].Irp->UserBuffer = context.stripes[i].buf; context.stripes[i].offset = stripe * c->chunk_item->stripe_length; IrpSp->Parameters.Read.Length = (ULONG)(read_stripes * c->chunk_item->stripe_length); IrpSp->Parameters.Read.ByteOffset.QuadPart = cis[i].offset + context.stripes[i].offset; 
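/* The per-stripe reads complete asynchronously: scrub_read_completion_raid56 is
 * expected to decrement context.stripes_left and set context.Event once the last
 * read finishes, which is what the KeWaitForSingleObject call further down waits
 * for before the parity checks run. */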
context.stripes[i].Irp->UserIosb = &context.stripes[i].iosb; context.stripes[i].missing = false; IoSetCompletionRoutine(context.stripes[i].Irp, scrub_read_completion_raid56, &context.stripes[i], true, true, true); Vcb->scrub.data_scrubbed += read_stripes * c->chunk_item->stripe_length; need_wait = true; } else { context.stripes[i].Irp = NULL; context.stripes[i].missing = true; missing_devices++; InterlockedDecrement(&context.stripes_left); } } if (c->chunk_item->type & BLOCK_FLAG_RAID5 && missing_devices > 1) { ERR("too many missing devices (%u, maximum 1)\n", missing_devices); Status = STATUS_UNEXPECTED_IO_ERROR; goto end3; } else if (c->chunk_item->type & BLOCK_FLAG_RAID6 && missing_devices > 2) { ERR("too many missing devices (%u, maximum 2)\n", missing_devices); Status = STATUS_UNEXPECTED_IO_ERROR; goto end3; } if (need_wait) { KeInitializeEvent(&context.Event, NotificationEvent, false); for (i = 0; i < c->chunk_item->num_stripes; i++) { if (c->devices[i]->devobj) IoCallDriver(c->devices[i]->devobj, context.stripes[i].Irp); } KeWaitForSingleObject(&context.Event, Executive, KernelMode, false, NULL); } // return an error if any of the stripes returned an error for (i = 0; i < c->chunk_item->num_stripes; i++) { if (!context.stripes[i].missing && !NT_SUCCESS(context.stripes[i].iosb.Status)) { Status = context.stripes[i].iosb.Status; log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_READ_ERRORS); goto end3; } } if (c->chunk_item->type & BLOCK_FLAG_RAID6) { for (i = 0; i < read_stripes; i++) { scrub_raid6_stripe(Vcb, c, &context, stripe_start, stripe, i, missing_devices); } } else { for (i = 0; i < read_stripes; i++) { scrub_raid5_stripe(Vcb, c, &context, stripe_start, stripe, i, missing_devices); } } stripe += read_stripes; end3: for (i = 0; i < c->chunk_item->num_stripes; i++) { if (context.stripes[i].Irp) { if (c->devices[i]->devobj->Flags & DO_DIRECT_IO && context.stripes[i].Irp->MdlAddress) { MmUnlockPages(context.stripes[i].Irp->MdlAddress); IoFreeMdl(context.stripes[i].Irp->MdlAddress); } IoFreeIrp(context.stripes[i].Irp); context.stripes[i].Irp = NULL; if (context.stripes[i].rewrite) { Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + context.stripes[i].offset, context.stripes[i].buf, (uint32_t)(read_stripes * c->chunk_item->stripe_length)); if (!NT_SUCCESS(Status)) { ERR("write_data_phys returned %08x\n", Status); log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_WRITE_ERRORS); goto end2; } } } } if (!NT_SUCCESS(Status)) break; } while (stripe < stripe_end); end2: chunk_unlock_range(Vcb, c, run_start, run_end - run_start); for (i = 0; i < c->chunk_item->num_stripes; i++) { ExFreePool(context.stripes[i].buf); ExFreePool(context.stripes[i].errorarr); } ExFreePool(context.stripes); end: ExFreePool(treearr); ExFreePool(allocarr); ExFreePool(context.parity_scratch); if (c->chunk_item->type & BLOCK_FLAG_RAID6) ExFreePool(context.parity_scratch2); if (c->chunk_item->type & BLOCK_FLAG_DATA) { ExFreePool(csumarr); ExFreePool(context.csum); } return Status; } static NTSTATUS scrub_chunk_raid56(device_extension* Vcb, chunk* c, uint64_t* offset, bool* changed) { NTSTATUS Status; KEY searchkey; traverse_ptr tp; bool b; uint64_t full_stripe_len, stripe, stripe_start, stripe_end, total_data = 0; ULONG num_extents = 0, num_parity_stripes = c->chunk_item->type & BLOCK_FLAG_RAID6 ? 
2 : 1; full_stripe_len = (c->chunk_item->num_stripes - num_parity_stripes) * c->chunk_item->stripe_length; stripe = (*offset - c->offset) / full_stripe_len; *offset = c->offset + (stripe * full_stripe_len); searchkey.obj_id = *offset; searchkey.obj_type = TYPE_METADATA_ITEM; searchkey.offset = 0xffffffffffffffff; Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); if (!NT_SUCCESS(Status)) { ERR("find_item returned %08x\n", Status); return Status; } *changed = false; do { traverse_ptr next_tp; if (tp.item->key.obj_id >= c->offset + c->chunk_item->size) break; if (tp.item->key.obj_id >= *offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) { uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ? Vcb->superblock.node_size : tp.item->key.offset; TRACE("%I64x\n", tp.item->key.obj_id); if (size < Vcb->superblock.sector_size) { ERR("extent %I64x has size less than sector_size (%I64x < %x)\n", tp.item->key.obj_id, size, Vcb->superblock.sector_size); return STATUS_INTERNAL_ERROR; } stripe = (tp.item->key.obj_id - c->offset) / full_stripe_len; if (*changed) { if (stripe > stripe_end + 1) { Status = scrub_chunk_raid56_stripe_run(Vcb, c, stripe_start, stripe_end); if (!NT_SUCCESS(Status)) { ERR("scrub_chunk_raid56_stripe_run returned %08x\n", Status); return Status; } stripe_start = stripe; } } else stripe_start = stripe; stripe_end = (tp.item->key.obj_id + size - 1 - c->offset) / full_stripe_len; *changed = true; total_data += size; num_extents++; // only do so much at a time if (num_extents >= 64 || total_data >= 0x8000000) // 128 MB break; } b = find_next_item(Vcb, &tp, &next_tp, false, NULL); if (b) tp = next_tp; } while (b); if (*changed) { Status = scrub_chunk_raid56_stripe_run(Vcb, c, stripe_start, stripe_end); if (!NT_SUCCESS(Status)) { ERR("scrub_chunk_raid56_stripe_run returned %08x\n", Status); return Status; } *offset = c->offset + ((stripe_end + 1) * full_stripe_len); } return STATUS_SUCCESS; } static NTSTATUS scrub_chunk(device_extension* Vcb, chunk* c, uint64_t* offset, bool* changed) { NTSTATUS Status; KEY searchkey; traverse_ptr tp; bool b = false, tree_run = false; ULONG type, num_extents = 0; uint64_t total_data = 0, tree_run_start, tree_run_end; TRACE("chunk %I64x\n", c->offset); ExAcquireResourceSharedLite(&Vcb->tree_lock, true); if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE) type = BLOCK_FLAG_DUPLICATE; else if (c->chunk_item->type & BLOCK_FLAG_RAID0) type = BLOCK_FLAG_RAID0; else if (c->chunk_item->type & BLOCK_FLAG_RAID1) type = BLOCK_FLAG_DUPLICATE; else if (c->chunk_item->type & BLOCK_FLAG_RAID10) type = BLOCK_FLAG_RAID10; else if (c->chunk_item->type & BLOCK_FLAG_RAID5) { Status = scrub_chunk_raid56(Vcb, c, offset, changed); goto end; } else if (c->chunk_item->type & BLOCK_FLAG_RAID6) { Status = scrub_chunk_raid56(Vcb, c, offset, changed); goto end; } else // SINGLE type = BLOCK_FLAG_DUPLICATE; searchkey.obj_id = *offset; searchkey.obj_type = TYPE_METADATA_ITEM; searchkey.offset = 0xffffffffffffffff; Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); if (!NT_SUCCESS(Status)) { ERR("error - find_item returned %08x\n", Status); goto end; } do { traverse_ptr next_tp; if (tp.item->key.obj_id >= c->offset + c->chunk_item->size) break; if (tp.item->key.obj_id >= *offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) { uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ?
Vcb->superblock.node_size : tp.item->key.offset; bool is_tree; uint32_t* csum = NULL; RTL_BITMAP bmp; ULONG* bmparr = NULL, bmplen; TRACE("%I64x\n", tp.item->key.obj_id); is_tree = false; if (tp.item->key.obj_type == TYPE_METADATA_ITEM) is_tree = true; else { EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data; if (tp.item->size < sizeof(EXTENT_ITEM)) { ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM)); Status = STATUS_INTERNAL_ERROR; goto end; } if (ei->flags & EXTENT_ITEM_TREE_BLOCK) is_tree = true; } if (size < Vcb->superblock.sector_size) { ERR("extent %I64x has size less than sector_size (%I64x < %x)\n", tp.item->key.obj_id, size, Vcb->superblock.sector_size); Status = STATUS_INTERNAL_ERROR; goto end; } // load csum if (!is_tree) { traverse_ptr tp2; csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(sizeof(uint32_t) * size / Vcb->superblock.sector_size), ALLOC_TAG); if (!csum) { ERR("out of memory\n"); Status = STATUS_INSUFFICIENT_RESOURCES; goto end; } bmplen = (ULONG)(size / Vcb->superblock.sector_size); bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)(sector_align((bmplen >> 3) + 1, sizeof(ULONG))), ALLOC_TAG); if (!bmparr) { ERR("out of memory\n"); ExFreePool(csum); Status = STATUS_INSUFFICIENT_RESOURCES; goto end; } RtlInitializeBitMap(&bmp, bmparr, bmplen); RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum searchkey.obj_id = EXTENT_CSUM_ID; searchkey.obj_type = TYPE_EXTENT_CSUM; searchkey.offset = tp.item->key.obj_id; Status = find_item(Vcb, Vcb->checksum_root, &tp2, &searchkey, false, NULL); if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) { ERR("find_item returned %08x\n", Status); ExFreePool(csum); ExFreePool(bmparr); goto end; } if (Status != STATUS_NOT_FOUND) { do { traverse_ptr next_tp2; if (tp2.item->key.obj_type == TYPE_EXTENT_CSUM) { if (tp2.item->key.offset >= tp.item->key.obj_id + size) break; else if (tp2.item->size >= sizeof(uint32_t) && tp2.item->key.offset + (tp2.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)) >= tp.item->key.obj_id) { uint64_t cs = max(tp.item->key.obj_id, tp2.item->key.offset); uint64_t ce = min(tp.item->key.obj_id + size, tp2.item->key.offset + (tp2.item->size * Vcb->superblock.sector_size / sizeof(uint32_t))); RtlCopyMemory(csum + ((cs - tp.item->key.obj_id) / Vcb->superblock.sector_size), tp2.item->data + ((cs - tp2.item->key.offset) * sizeof(uint32_t) / Vcb->superblock.sector_size), (ULONG)((ce - cs) * sizeof(uint32_t) / Vcb->superblock.sector_size)); RtlClearBits(&bmp, (ULONG)((cs - tp.item->key.obj_id) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size)); if (ce == tp.item->key.obj_id + size) break; } } if (find_next_item(Vcb, &tp2, &next_tp2, false, NULL)) tp2 = next_tp2; else break; } while (true); } } if (tree_run) { if (!is_tree || tp.item->key.obj_id > tree_run_end) { Status = scrub_extent(Vcb, c, type, tree_run_start, (uint32_t)(tree_run_end - tree_run_start), NULL); if (!NT_SUCCESS(Status)) { ERR("scrub_extent returned %08x\n", Status); goto end; } if (!is_tree) tree_run = false; else { tree_run_start = tp.item->key.obj_id; tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size; } } else tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size; } else if (is_tree) { tree_run = true; tree_run_start = tp.item->key.obj_id; tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size; } if (!is_tree) { Status = scrub_data_extent(Vcb, c, tp.item->key.obj_id, type, csum, &bmp,
bmplen); if (!NT_SUCCESS(Status)) { ERR("scrub_data_extent returned %08x\n", Status); ExFreePool(csum); ExFreePool(bmparr); goto end; } ExFreePool(csum); ExFreePool(bmparr); } *offset = tp.item->key.obj_id + size; *changed = true; total_data += size; num_extents++; // only do so much at a time if (num_extents >= 64 || total_data >= 0x8000000) // 128 MB break; } b = find_next_item(Vcb, &tp, &next_tp, false, NULL); if (b) tp = next_tp; } while (b); if (tree_run) { Status = scrub_extent(Vcb, c, type, tree_run_start, (uint32_t)(tree_run_end - tree_run_start), NULL); if (!NT_SUCCESS(Status)) { ERR("scrub_extent returned %08x\n", Status); goto end; } } Status = STATUS_SUCCESS; end: ExReleaseResourceLite(&Vcb->tree_lock); return Status; } _Function_class_(KSTART_ROUTINE) static void __stdcall scrub_thread(void* context) { device_extension* Vcb = context; LIST_ENTRY chunks, *le; NTSTATUS Status; LARGE_INTEGER time; KeInitializeEvent(&Vcb->scrub.finished, NotificationEvent, false); InitializeListHead(&chunks); ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); if (Vcb->need_write && !Vcb->readonly) Status = do_write(Vcb, NULL); else Status = STATUS_SUCCESS; free_trees(Vcb); if (!NT_SUCCESS(Status)) { ExReleaseResourceLite(&Vcb->tree_lock); ERR("do_write returned %08x\n", Status); Vcb->scrub.error = Status; goto end; } ExConvertExclusiveToSharedLite(&Vcb->tree_lock); ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true); KeQuerySystemTime(&Vcb->scrub.start_time); Vcb->scrub.finish_time.QuadPart = 0; Vcb->scrub.resume_time.QuadPart = Vcb->scrub.start_time.QuadPart; Vcb->scrub.duration.QuadPart = 0; Vcb->scrub.total_chunks = 0; Vcb->scrub.chunks_left = 0; Vcb->scrub.data_scrubbed = 0; Vcb->scrub.num_errors = 0; while (!IsListEmpty(&Vcb->scrub.errors)) { scrub_error* err = CONTAINING_RECORD(RemoveHeadList(&Vcb->scrub.errors), scrub_error, list_entry); ExFreePool(err); } ExAcquireResourceSharedLite(&Vcb->chunk_lock, true); le = Vcb->chunks.Flink; while (le != &Vcb->chunks) { chunk* c = CONTAINING_RECORD(le, chunk, list_entry); acquire_chunk_lock(c, Vcb); if (!c->readonly) { InsertTailList(&chunks, &c->list_entry_balance); Vcb->scrub.total_chunks++; Vcb->scrub.chunks_left++; } release_chunk_lock(c, Vcb); le = le->Flink; } ExReleaseResourceLite(&Vcb->chunk_lock); ExReleaseResource(&Vcb->scrub.stats_lock); ExReleaseResourceLite(&Vcb->tree_lock); while (!IsListEmpty(&chunks)) { chunk* c = CONTAINING_RECORD(RemoveHeadList(&chunks), chunk, list_entry_balance); uint64_t offset = c->offset; bool changed; c->reloc = true; KeWaitForSingleObject(&Vcb->scrub.event, Executive, KernelMode, false, NULL); if (!Vcb->scrub.stopping) { do { changed = false; Status = scrub_chunk(Vcb, c, &offset, &changed); if (!NT_SUCCESS(Status)) { ERR("scrub_chunk returned %08x\n", Status); Vcb->scrub.stopping = true; Vcb->scrub.error = Status; break; } if (offset == c->offset + c->chunk_item->size || Vcb->scrub.stopping) break; KeWaitForSingleObject(&Vcb->scrub.event, Executive, KernelMode, false, NULL); } while (changed); } ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true); if (!Vcb->scrub.stopping) Vcb->scrub.chunks_left--; if (IsListEmpty(&chunks)) KeQuerySystemTime(&Vcb->scrub.finish_time); ExReleaseResource(&Vcb->scrub.stats_lock); c->reloc = false; c->list_entry_balance.Flink = NULL; } KeQuerySystemTime(&time); Vcb->scrub.duration.QuadPart += time.QuadPart - Vcb->scrub.resume_time.QuadPart; end: ZwClose(Vcb->scrub.thread); Vcb->scrub.thread = NULL; KeSetEvent(&Vcb->scrub.finished, 0, false); } NTSTATUS 
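/* start_scrub: checks SE_MANAGE_VOLUME_PRIVILEGE and refuses to start while the
 * volume is locked, a balance is in progress, a scrub is already running, or the
 * volume is read-only; otherwise it resets the scrub flags and event and spawns
 * scrub_thread as a system thread. */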
start_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { NTSTATUS Status; if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) return STATUS_PRIVILEGE_NOT_HELD; if (Vcb->locked) { WARN("cannot start scrub while locked\n"); return STATUS_DEVICE_NOT_READY; } if (Vcb->balance.thread) { WARN("cannot start scrub while balance running\n"); return STATUS_DEVICE_NOT_READY; } if (Vcb->scrub.thread) { WARN("scrub already running\n"); return STATUS_DEVICE_NOT_READY; } if (Vcb->readonly) return STATUS_MEDIA_WRITE_PROTECTED; Vcb->scrub.stopping = false; Vcb->scrub.paused = false; Vcb->scrub.error = STATUS_SUCCESS; KeInitializeEvent(&Vcb->scrub.event, NotificationEvent, !Vcb->scrub.paused); Status = PsCreateSystemThread(&Vcb->scrub.thread, 0, NULL, NULL, NULL, scrub_thread, Vcb); if (!NT_SUCCESS(Status)) { ERR("PsCreateSystemThread returned %08x\n", Status); return Status; } return STATUS_SUCCESS; } NTSTATUS query_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode, void* data, ULONG length) { btrfs_query_scrub* bqs = (btrfs_query_scrub*)data; ULONG len; NTSTATUS Status; LIST_ENTRY* le; btrfs_scrub_error* bse = NULL; if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) return STATUS_PRIVILEGE_NOT_HELD; if (length < offsetof(btrfs_query_scrub, errors)) return STATUS_BUFFER_TOO_SMALL; ExAcquireResourceSharedLite(&Vcb->scrub.stats_lock, true); if (Vcb->scrub.thread && Vcb->scrub.chunks_left > 0) bqs->status = Vcb->scrub.paused ? BTRFS_SCRUB_PAUSED : BTRFS_SCRUB_RUNNING; else bqs->status = BTRFS_SCRUB_STOPPED; bqs->start_time.QuadPart = Vcb->scrub.start_time.QuadPart; bqs->finish_time.QuadPart = Vcb->scrub.finish_time.QuadPart; bqs->chunks_left = Vcb->scrub.chunks_left; bqs->total_chunks = Vcb->scrub.total_chunks; bqs->data_scrubbed = Vcb->scrub.data_scrubbed; bqs->duration = Vcb->scrub.duration.QuadPart; if (bqs->status == BTRFS_SCRUB_RUNNING) { LARGE_INTEGER time; KeQuerySystemTime(&time); bqs->duration += time.QuadPart - Vcb->scrub.resume_time.QuadPart; } bqs->error = Vcb->scrub.error; bqs->num_errors = Vcb->scrub.num_errors; len = length - offsetof(btrfs_query_scrub, errors); le = Vcb->scrub.errors.Flink; while (le != &Vcb->scrub.errors) { scrub_error* err = CONTAINING_RECORD(le, scrub_error, list_entry); ULONG errlen; if (err->is_metadata) errlen = offsetof(btrfs_scrub_error, metadata.firstitem) + sizeof(KEY); else errlen = offsetof(btrfs_scrub_error, data.filename) + err->data.filename_length; if (len < errlen) { Status = STATUS_BUFFER_OVERFLOW; goto end; } if (!bse) bse = &bqs->errors; else { ULONG lastlen; if (bse->is_metadata) lastlen = offsetof(btrfs_scrub_error, metadata.firstitem) + sizeof(KEY); else lastlen = offsetof(btrfs_scrub_error, data.filename) + bse->data.filename_length; bse->next_entry = lastlen; bse = (btrfs_scrub_error*)(((uint8_t*)bse) + lastlen); } bse->next_entry = 0; bse->address = err->address; bse->device = err->device; bse->recovered = err->recovered; bse->is_metadata = err->is_metadata; bse->parity = err->parity; if (err->is_metadata) { bse->metadata.root = err->metadata.root; bse->metadata.level = err->metadata.level; bse->metadata.firstitem = err->metadata.firstitem; } else { bse->data.subvol = err->data.subvol; bse->data.offset = err->data.offset; bse->data.filename_length = err->data.filename_length; RtlCopyMemory(bse->data.filename, err->data.filename, err->data.filename_length); } len -= errlen; le = le->Flink; } Status = STATUS_SUCCESS; end: 
ExReleaseResourceLite(&Vcb->scrub.stats_lock); return Status; } NTSTATUS pause_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { LARGE_INTEGER time; if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) return STATUS_PRIVILEGE_NOT_HELD; if (!Vcb->scrub.thread) return STATUS_DEVICE_NOT_READY; if (Vcb->scrub.paused) return STATUS_DEVICE_NOT_READY; Vcb->scrub.paused = true; KeClearEvent(&Vcb->scrub.event); KeQuerySystemTime(&time); Vcb->scrub.duration.QuadPart += time.QuadPart - Vcb->scrub.resume_time.QuadPart; return STATUS_SUCCESS; } NTSTATUS resume_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) return STATUS_PRIVILEGE_NOT_HELD; if (!Vcb->scrub.thread) return STATUS_DEVICE_NOT_READY; if (!Vcb->scrub.paused) return STATUS_DEVICE_NOT_READY; Vcb->scrub.paused = false; KeSetEvent(&Vcb->scrub.event, 0, false); KeQuerySystemTime(&Vcb->scrub.resume_time); return STATUS_SUCCESS; } NTSTATUS stop_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) return STATUS_PRIVILEGE_NOT_HELD; if (!Vcb->scrub.thread) return STATUS_DEVICE_NOT_READY; Vcb->scrub.paused = false; Vcb->scrub.stopping = true; KeSetEvent(&Vcb->scrub.event, 0, false); return STATUS_SUCCESS; }