Re: Scrub incredibly slow with 13.0-RC3 (as well as RC1 & 2)

From: Mathieu Chouquet-Stringer <me+freebsd_at_mathieu.digital>
Date: Tue, 30 Mar 2021 15:43:06 +0200
	Hello,

TL;DR: we need kvmclock support in FreeBSD

On Sat, Mar 27, 2021 at 05:06:43PM +0100, Mathieu Chouquet-Stringer wrote:
> TL;DR: read_max makes no difference, don't use hpet with KVM, use
> scan_legacy.

I ended up resurrecting a set of patches created by Bryan [1] to add support
for kvmclock because this whole slow zfs thing is incredibly dependent on the
timecounter infrastructure. I inlined below the patch I'm using at the moment.
And results are mind blowing.

I can scrub my zroot pool in 10s instead of 1m35s...

And I started scrubbing my raidz2 pool 30 minutes ago and it's telling
me I have about 15 hours left. The best time I got before that was 23:04:17
(and that was with scan_legacy=1 which was around 3 hours quicker than the same
scrub withscan_legacy=0)...

  scan: scrub in progress since Tue Mar 30 15:00:03 2021
        962G scanned at 541M/s, 810G issued at 456M/s, 24.4T total
        0B repaired, 3.24% done, 15:05:39 to go

And that's with vfs.zfs.scan_legacy == 0 (the default). So it seems the scan
process uses timing information a lot more than the legacy one. And
that's what I saw in the FlameGraph when I stupidly added hpet support in KVM:
I was seeing a lot of calls to hpet_get_timecount.

So the question is: what is needed to merge that kvmclock thing in the kernel?

Regards,

[1] https://people.freebsd.org/~bryanv/patches/kvm_clock-1.patch

--- 
 sys/conf/files.x86      |   1 +
 sys/x86/include/kvm.h   |  49 +++++++++++++++++++++++
 sys/x86/x86/kvm_clock.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 182 insertions(+)

diff --git a/sys/conf/files.x86 b/sys/conf/files.x86
index f51392d0614..bcf53fcf1f6 100644
--- a/sys/conf/files.x86
+++ b/sys/conf/files.x86
_at__at_ -335,6 +335,7 _at__at_ x86/x86/dump_machdep.c		standard
 x86/x86/fdt_machdep.c		optional	fdt
 x86/x86/identcpu.c		standard
 x86/x86/intr_machdep.c		standard
+x86/x86/kvm_clock.c		standard
 x86/x86/legacy.c		standard
 x86/x86/mca.c			standard
 x86/x86/x86_mem.c		optional	mem
diff --git a/sys/x86/include/kvm.h b/sys/x86/include/kvm.h
new file mode 100644
index 00000000000..dc26822ed1c
--- /dev/null
+++ b/sys/x86/include/kvm.h
_at__at_ -0,0 +1,49 _at__at_
+/*-
+ * Copyright (c) 2014 Bryan Venteicher <bryanv_at_FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _X86_KVM_H_
+#define _X86_KVM_H_
+
+#define KVM_CPUID_FEATURES_LEAF                0x40000001
+
+#define KVM_FEATURE_CLOCKSOURCE                0x00000001
+#define KVM_FEATURE_CLOCKSOURCE2       0x00000008
+
+/* Deprecated: for the CLOCKSOURCE feature. */
+#define KVM_MSR_WALL_CLOCK             0x11
+#define KVM_MSR_SYSTEM_TIME            0x12
+
+#define KVM_MSR_WALL_CLOCK_NEW         0x4b564d00
+#define KVM_MSR_SYSTEM_TIME_NEW                0x4b564d01
+
+int            kvm_paravirt_supported(void);
+uint32_t       kvm_get_features(void);
+
+uint64_t       kvm_clock_tsc_freq(void);
+
+#endif /* !_X86_KVM_H_ */
diff --git a/sys/x86/x86/kvm_clock.c b/sys/x86/x86/kvm_clock.c
new file mode 100644
index 00000000000..f16aa88c695
--- /dev/null
+++ b/sys/x86/x86/kvm_clock.c
_at__at_ -0,0 +1,132 _at__at_
+/*-
+ * Copyright (c) 2014 Bryan Venteicher <bryanv_at_FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/systm.h>
+#include <sys/pcpu.h>
+#include <sys/smp.h>
+#include <sys/timetc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pvclock.h>
+#include <x86/kvm.h>
+#include <x86/x86_var.h>
+
+static u_int   kvm_clock_get_timecounter(struct timecounter *);
+static void    kvm_clock_pcpu_system_time(void *);
+
+DPCPU_DEFINE(struct pvclock_vcpu_time_info, kvm_clock_vcpu_time_info);
+
+static struct timecounter kvm_clock_timecounter = {
+       kvm_clock_get_timecounter,
+       NULL,
+       ~0u,
+       1000000000ULL,
+       "KVMCLOCK",
+       1000,
+};
+
+static int             kvm_clock_registered;
+static uint32_t                kvm_clock_wall_clock_msr;
+static uint32_t                kvm_clock_system_time_msr;
+
+uint64_t
+kvm_clock_tsc_freq(void)
+{
+       struct pvclock_vcpu_time_info *ti;
+       uint64_t freq;
+
+       critical_enter();
+       ti = DPCPU_PTR(kvm_clock_vcpu_time_info);
+       freq = pvclock_tsc_freq(ti);
+       critical_exit();
+
+       return (freq);
+}
+
+static u_int
+kvm_clock_get_timecounter(struct timecounter *tc)
+{
+       struct pvclock_vcpu_time_info *ti;
+       uint64_t time;
+
+       critical_enter();
+       ti = DPCPU_PTR(kvm_clock_vcpu_time_info);
+       time = pvclock_get_timecount(ti);
+       critical_exit();
+
+       return (time & UINT_MAX);
+}
+
+static void
+kvm_clock_pcpu_system_time(void *arg)
+{
+       uint64_t data;
+       int enable;
+
+       enable = *(int *) arg;
+
+       if (enable != 0)
+               data = vtophys(DPCPU_PTR(kvm_clock_vcpu_time_info)) | 1;
+       else
+               data = 0;
+
+       wrmsr(kvm_clock_system_time_msr, data);
+}
+
+static void
+kvm_clock_init(void)
+{
+       if (vm_guest != VM_GUEST_KVM)
+               return;
+
+       if (hv_base > 0 == 0)
+	       return;
+
+       if (hv_high & KVM_FEATURE_CLOCKSOURCE2) {
+               kvm_clock_wall_clock_msr = KVM_MSR_WALL_CLOCK_NEW;
+               kvm_clock_system_time_msr = KVM_MSR_SYSTEM_TIME_NEW;
+       } else if (hv_high & KVM_FEATURE_CLOCKSOURCE) {
+               kvm_clock_wall_clock_msr = KVM_MSR_WALL_CLOCK;
+               kvm_clock_system_time_msr = KVM_MSR_SYSTEM_TIME;
+       } else
+               return;
+
+       kvm_clock_registered = 1;
+       smp_rendezvous(smp_no_rendezvous_barrier, kvm_clock_pcpu_system_time,
+           smp_no_rendezvous_barrier, &kvm_clock_registered);
+
+       tc_init(&kvm_clock_timecounter);
+}
+
+SYSINIT(kvm_clock, SI_SUB_SMP, SI_ORDER_ANY, kvm_clock_init, NULL);

-- 
Mathieu Chouquet-Stringer
            The sun itself sees not till heaven clears.
	             -- William Shakespeare --
Received on Tue Mar 30 2021 - 11:43:11 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:41:27 UTC