Re: ZFS: i/o error - all block copies unavailable after upgrading to r225312

From: Andriy Gapon <avg_at_FreeBSD.org>
Date: Fri, 16 Sep 2011 10:20:40 +0300
I've discovered another issue with both the current code and the previous
versions of the patch.  So another version is at the same URL:
http://people.freebsd.org/~avg/zfs-boot-gang.diff

The problem was with gang blocks on a raidz vdev.  zio_read_gang would pass a
NULL bp to vdev_raidz_read, but that function doesn't expect that and really
needs a valid bp.

Pawel, I also understood the code in zio_read that does size rounding up based
on v_ashift.  That code is for vdev_raidz_read which needs the size to be
multiple of 1 << v_ashift.  And, again, this would have posed a problem for
zio_read_gang, which always passed SPA_GANGBLOCKSIZE when reading a gang block
header.

So, a list of fixes and logical changes:
- correctly read gang header from raidz [*]
- decompress assembled gang block data if compressed
- verify checksum of a gang header
- verify checksum of assembled gang block data
- verify checksum of uber block

[*] - new in this version of patch.

Description of the code changes:
- remove offset parameter from zio_checksum_error
This parameter has only been used for the case of verifying checksum of a vdev
label.  The offset is now passed via DVA offset field in a made up bp pointing
to the label.
zio_checksum_error now gets all checksum parameters from the bp.

- zio_read_gang new uses an artificial bp to read a gang header via zio_read
This solves all the problems with gang blocks on raidz as zio_read already has
all the code to handle raidz correctly.

- zio_read performs size rounding only if v_read == vdev_raidz_read
This is to make the intention of the code more clear.  And also to slightly
optimize non-raidz cases with non-default ashift where an unnecessary
intermediate buffer would otherwise be used.

Some inline comments (marked with %):

Index: sys/cddl/boot/zfs/zfssubr.c
===================================================================
--- sys/cddl/boot/zfs/zfssubr.c	(revision 225581)
+++ sys/cddl/boot/zfs/zfssubr.c	(working copy)
_at__at_ -181,14 +181,17 _at__at_
 }

 static int
-zio_checksum_error(const blkptr_t *bp, void *data, uint64_t offset)
+zio_checksum_error(const blkptr_t *bp, void *data)
 {
-	unsigned int checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER :
BP_GET_CHECKSUM(bp);
-	uint64_t size = BP_GET_PSIZE(bp);
+	uint64_t size;
+	unsigned int checksum;
 	zio_checksum_info_t *ci;
 	zio_cksum_t actual_cksum, expected_cksum, verifier;
 	int byteswap;

+	checksum = BP_GET_CHECKSUM(bp);
+	size = BP_GET_PSIZE(bp);
+
%
% checksum type and size are always taken from the bp.
% Note that BP_IS_GANG(bp) doesn't imply that the caller wants to verify
% the gang header (as was assumed before); the caller want want to verify
% the whole assembled data as well.
%
 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
 		return (EINVAL);
 	ci = &zio_checksum_table[checksum];
_at__at_ -206,7 +209,8 _at__at_
 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
 			zio_checksum_gang_verifier(&verifier, bp);
 		else if (checksum == ZIO_CHECKSUM_LABEL)
-			zio_checksum_label_verifier(&verifier, offset);
+			zio_checksum_label_verifier(&verifier,
+			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
%
% label offset is taken from the bp now.
%
 		else
 			verifier = bp->blk_cksum;

_at__at_ -224,7 +228,6 _at__at_
 			byteswap_uint64_array(&expected_cksum,
 			    sizeof (zio_cksum_t));
 	} else {
-		ASSERT(!BP_IS_GANG(bp));
%
% the assert is no longer valid as we pass a gang block bp
% when verifying checksum of assembled data
%
 		expected_cksum = bp->blk_cksum;
 		ci->ci_func[0](data, size, &actual_cksum);
 	}
_at__at_ -1248,7 +1251,7 _at__at_
 raidz_checksum_verify(const blkptr_t *bp, void *data)
 {

-	return (zio_checksum_error(bp, data, 0));
+	return (zio_checksum_error(bp, data));
 }

 /*
Index: sys/boot/zfs/zfsimpl.c
===================================================================
--- sys/boot/zfs/zfsimpl.c	(revision 225581)
+++ sys/boot/zfs/zfsimpl.c	(working copy)
_at__at_ -347,7 +347,7 _at__at_
 	rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
 	if (rc)
 		return (rc);
-	if (bp && zio_checksum_error(bp, buf, offset))
+	if (bp && zio_checksum_error(bp, buf))
%
% the bp can still be NULL here: when raidz code reads blocks of data
% it doesn't pass any bp, because there is really no bp for that data.
% this should be the only case, in all other cases a valid bp must be provided.
%
 		return (EIO);

 	return (0);
_at__at_ -800,6 +800,7 _at__at_
 	BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
 	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
 	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+	DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
%
% as described above, the offset is now passed via DVA offset field
%
 	ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
 	if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
 		return (EIO);
_at__at_ -941,7 +942,7 _at__at_
 		BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
 		ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);

-		if (vdev_read_phys(vdev, NULL, upbuf, off, VDEV_UBERBLOCK_SIZE(vdev)))
+		if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
%
% pass the artificial uberblock bp to vdev_read_phys, so that it
% can call zio_checksum_error and verify the checksum
%
 			continue;

 		if (up->ub_magic != UBERBLOCK_MAGIC)
_at__at_ -974,34 +975,39 _at__at_
 }

 static int
-zio_read_gang(spa_t *spa, const blkptr_t *bp, const dva_t *dva, void *buf)
+zio_read_gang(spa_t *spa, const blkptr_t *bp, void *buf)
 {
+	blkptr_t gbh_bp;
 	zio_gbh_phys_t zio_gb;
-	vdev_t *vdev;
-	int vdevid;
-	off_t offset;
+	char *pbuf;
 	int i;

-	vdevid = DVA_GET_VDEV(dva);
-	offset = DVA_GET_OFFSET(dva);
-	STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
-		if (vdev->v_id == vdevid)
-			break;
-	if (!vdev || !vdev->v_read)
+	/* Artificial BP for gang block header. */
+	gbh_bp = *bp;
+	BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
+	BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
+	BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
+	BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
+	for (i = 0; i < SPA_DVAS_PER_BP; i++)
+		DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
%
% the artifical bp for the gang header.
% it has PSIZE and LSIZE of SPA_GANGBLOCKSIZE.
% cheksum is set ZIO_CHECKSUM_GANG_HEADER, so that zio_checksum_error
% does the right thing.
% compression is set to ZIO_COMPRESS_OFF, so that zio_read does the right thing.
% the gang bit is cleared from the DVAs - we read only the gang header block and
% do not want to create endless recursion here.
% vdevs and offsets are preserved in the DVAs of the BP.
%
+
+	/* Read gang header block using the artificial BP. */
+	if (zio_read(spa, &gbh_bp, &zio_gb))
 		return (EIO);

-	if (vdev->v_read(vdev, NULL, &zio_gb, offset, SPA_GANGBLOCKSIZE))
-		return (EIO);
%
% so now smarter zio_read can be used instead of dumber v_read.
% Which, again, would have crashed on NULL bp if v_read == vdev_raidz_read
%

+	pbuf = buf;
%
% keep the original buffer pointer, use pbuf to populate the buffer
%
 	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
 		blkptr_t *gbp = &zio_gb.zg_blkptr[i];

 		if (BP_IS_HOLE(gbp))
 			continue;
-		if (zio_read(spa, gbp, buf))
+		if (zio_read(spa, gbp, pbuf))
 			return (EIO);
-		buf = (char*)buf + BP_GET_PSIZE(gbp);
+		pbuf += BP_GET_PSIZE(gbp);
 	}
-
+
+	if (zio_checksum_error(bp, buf))
+		return (EIO);
%
% the original pointer is used to verify checksum of the assembled data
% of the gang block.
% Note: this is where the gang bp is passed to zio_checksum_error.
%
 	return (0);
 }

_at__at_ -1024,46 +1030,41 _at__at_
 		if (!dva->dva_word[0] && !dva->dva_word[1])
 			continue;

-		if (DVA_GET_GANG(dva)) {
-			error = zio_read_gang(spa, bp, dva, buf);
-			if (error != 0)
-				continue;
-		} else {
-			vdevid = DVA_GET_VDEV(dva);
-			offset = DVA_GET_OFFSET(dva);
-			STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
-				if (vdev->v_id == vdevid)
-					break;
-			}
-			if (!vdev || !vdev->v_read)
-				continue;
+		vdevid = DVA_GET_VDEV(dva);
+		offset = DVA_GET_OFFSET(dva);
+		STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
+			if (vdev->v_id == vdevid)
+				break;
+		}
+		if (!vdev || !vdev->v_read)
+			continue;

-			size = BP_GET_PSIZE(bp);
+		size = BP_GET_PSIZE(bp);
+		if (vdev->v_read == vdev_raidz_read) {
 			align = 1ULL << vdev->v_top->v_ashift;
 			if (P2PHASE(size, align) != 0)
 				size = P2ROUNDUP(size, align);
%
% do size rounding up only if v_read == vdev_raidz_read,
% because only vdev_raidz_read requires that.
% vdev_read_phys requires only 512 byte granularity.
%
-			if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
-				pbuf = zfs_alloc(size);
-			else
-				pbuf = buf;
+		}
+		if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
+			pbuf = zfs_alloc(size);
+		else
+			pbuf = buf;
%
% So use temporary buffer only if either the block is compressed or
% vdev_raidz_read requires a larger buffer
%

+		if (DVA_GET_GANG(dva))
+			error = zio_read_gang(spa, bp, pbuf);
+		else
 			error = vdev->v_read(vdev, bp, pbuf, offset, size);
-			if (error == 0) {
-				if (cpfunc != ZIO_COMPRESS_OFF) {
-					error = zio_decompress_data(cpfunc,
-					    pbuf, BP_GET_PSIZE(bp), buf,
-					    BP_GET_LSIZE(bp));
-				} else if (size != BP_GET_PSIZE(bp)) {
-					bcopy(pbuf, buf, BP_GET_PSIZE(bp));
-				}
-			}
-			if (buf != pbuf)
-				zfs_free(pbuf, size);
-			if (error != 0)
-				continue;
+		if (error == 0) {
+			if (cpfunc != ZIO_COMPRESS_OFF)
+				error = zio_decompress_data(cpfunc, pbuf,
+				    BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
+			else if (size != BP_GET_PSIZE(bp))
+				bcopy(pbuf, buf, BP_GET_PSIZE(bp));
 		}
%
% Decompression is now done for the gang blocks too.
% Ditto for the use of larger buffers in the raidz case.
%
-		error = 0;
-		break;
+		if (buf != pbuf)
+			zfs_free(pbuf, size);
+		if (error == 0)
+			break;
 	}
 	if (error != 0)
 		printf("ZFS: i/o error - all block copies unavailable\n");



-- 
Andriy Gapon
Received on Fri Sep 16 2011 - 05:20:46 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:40:17 UTC