__builtin_memcpy() slower than memcpy/bcopy (and on linux it is the opposite) ?

From: Luigi Rizzo <rizzo_at_iet.unipi.it>
Date: Wed, 23 Jan 2013 17:32:38 +0100
Probably our compiler folks have some ideas on this...

When doing netmap I found that on FreeBSD memcpy/bcopy was expensive,
__builtin_memcpy() was even worse, and so I ended up writing
my own custom routine (called pkt_copy() in the program below).
This happens with gcc 4.2.1, clang, gcc 4.6.4

I was then surprised to notice that on a recent Ubuntu using
gcc 4.6.2 (if that matters) __builtin_memcpy beats the other
methods by a large factor.

Here are the numbers in millions of calls per second.  Is the test
program flawed, or is the compiler built with different options?

Unfortunately I have no way to run the two versions of the code
on the same machine, but the hardware should be relatively similar
(i7-2600 @ 3.4 GHz on one, Xeon E5-1650 @ 3.2 GHz on the other).

			BSD / Linux

block size (bytes)	31		32		64		2048
			
__builtin_memcpy	10 / 150	13 / 158	13 / 152	5.1 / 23.2
memcpy			23 / 64		47 / 64		45 / 64		5.4 / 3.8
bcopy			24 / 64		47 / 64		45 / 63		5.4 / 3.8
pkt_copy		65 / 63		65 / 63		64 / 63		5.5 / 3.7


	cheers
	luigi


/*
 * Copyright (C) 2012 Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: testlock.c 12015 2013-01-23 15:51:17Z luigi $
 *
 * Test program to study various ops and concurrency issues.
 * Create multiple threads, possibly bind to cpus, and run a workload.
 *
 * cc -O2 -Werror -Wall testlock.c -o testlock -lpthread
 *	you might need -lrt
 */

#include <inttypes.h>
#include <sys/types.h>
#include <pthread.h>	/* pthread_* */

#if defined(__APPLE__)

#include <libkern/OSAtomic.h>
#define atomic_add_int(p, n) OSAtomicAdd32(n, (int *)p)
#define	atomic_cmpset_32(p, o, n)	OSAtomicCompareAndSwap32(o, n, (int *)p)

#elif defined(linux)

/*
 * Compare-and-set emulation for Linux builds: if *p equals 'old',
 * store 'new' and return nonzero; otherwise leave *p alone and
 * return 0.
 * NOTE(review): this is NOT atomic (no lock prefix or barrier); it is
 * only meaningful single-threaded or for measuring call overhead.
 */
int atomic_cmpset_32(volatile uint32_t *p, uint32_t old, uint32_t new)
{
	int ret = *p == old;
	if (ret)	/* the original stored unconditionally, clobbering *p on mismatch */
		*p = new;
	return ret;
}

#if defined(HAVE_GCC_ATOMICS)
/* Atomically add v to *p with the GCC builtin; returns the old value. */
int atomic_add_int(volatile int *p, int v)
{
	return (__sync_fetch_and_add(p, v));
}
#else
/*
 * Atomically add v to *p using a lock-prefixed xadd; returns the
 * value *p held before the addition (xadd swaps the old value into v).
 * NOTE(review): plain 'inline' without 'static' at file scope follows
 * C99 inline-linkage rules and can fail to emit a definition at -O0;
 * verify linking, or make it 'static inline'.
 */
inline
uint32_t atomic_add_int(uint32_t *p, int v)
{
        __asm __volatile (
        "       lock   xaddl   %0, %1 ;        "
        : "+r" (v),                     /* 0 (result) */
          "=m" (*p)                     /* 1 */
        : "m" (*p));                    /* 2 */
        return (v);
}
#endif

#else /* FreeBSD */
#include <sys/param.h>
#include <machine/atomic.h>
#include <pthread_np.h>	/* pthread w/ affinity */

#if __FreeBSD_version > 500000
#include <sys/cpuset.h>	/* cpu_set */
#if __FreeBSD_version > 800000
#define HAVE_AFFINITY
#endif

/*
 * Hint the CPU to load the cache line containing x into all cache
 * levels (prefetcht0).  The dereference in the "m" constraint only
 * tells the compiler which memory is involved; nothing is read here.
 */
inline void prefetch (const void *x)
{
        __asm volatile("prefetcht0 %0" :: "m" (*(const unsigned long *)x));
}


#else /* FreeBSD 4.x */
/*
 * Compare-and-set fallback for FreeBSD 4.x: if *p equals 'old', store
 * 'new' and return nonzero; otherwise leave *p untouched and return 0.
 * NOTE(review): not atomic; acceptable only for this benchmark tool.
 */
int atomic_cmpset_32(volatile uint32_t *p, uint32_t old, uint32_t new)
{
	int ret = *p == old;
	if (ret)	/* the original stored unconditionally, clobbering *p on mismatch */
		*p = new;
	return ret;
}

#define PRIu64	"llu"
#endif /* FreeBSD 4.x */

#endif /* FreeBSD */

#include <signal.h>	/* signal */
#include <stdlib.h>
#include <stdio.h>
#include <poll.h>
#include <inttypes.h>	/* PRI* macros */
#include <string.h>	/* strcmp */
#include <fcntl.h>	/* open */
#include <unistd.h>	/* getopt */


#include <sys/sysctl.h>	/* sysctl */
#include <sys/time.h>	/* timersub */

static inline int min(int a, int b) { return a < b ? a : b; }

#define ONE_MILLION	1000000
/* debug support */
#define ND(format, ...)			
#define D(format, ...)				\
	fprintf(stderr, "%s [%d] " format "\n",	\
	__FUNCTION__, __LINE__, ##__VA_ARGS__)

int verbose = 0;

#if 1//def MY_RDTSC
/* Wrapper around `rdtsc' to take reliable timestamps flushing the pipeline */ 
/* NOTE(review): the cpuid acts as a serializing fence so earlier
 * instructions cannot be reordered past the timestamp read.  __regs
 * uses a double-underscore name, which is reserved for the
 * implementation; harmless here but nonstandard. */
#define my_rdtsc(t)				\
	do {					\
		u_int __regs[4];		\
						\
		do_cpuid(0, __regs);		\
		(t) = rdtsc();			\
	} while (0)

/*
 * Execute cpuid with %eax = ax and store eax/ebx/ecx/edx into p[0..3].
 * Used here purely for its serializing side effect before rdtsc.
 */
static __inline void
do_cpuid(u_int ax, u_int *p)
{
	__asm __volatile("cpuid"
			 : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
			 :  "0" (ax) );
}

/*
 * Read the 64-bit timestamp counter with rdtscp.
 * The old '"=A" (rv)' constraint only denotes the edx:eax pair on
 * i386; on x86-64 it picks a single register and returns garbage in
 * the high half (the original comment noted it "does not work on
 * linux-64 bit").  Read the two 32-bit halves explicitly instead.
 */
static __inline uint64_t
rdtsc(void)
{
	uint32_t lo, hi;

	/* rdtscp also writes TSC_AUX into %ecx, so clobber it */
	__asm __volatile("rdtscp" : "=a" (lo), "=d" (hi) : : "%rcx");
	return (((uint64_t)hi << 32) | lo);
}
#endif /* 1 */

struct targ;

/*** global arguments for all threads ***/
struct glob_arg {
	struct  {
		uint32_t	ctr[1024];	/* shared counters; threads index at 'align'-byte strides */
	} v __attribute__ ((aligned(256) ));	/* cache-line/sector aligned to control contention */
	int64_t m_cycles;	/* total cycles (iterations) to run, -n */
	int nthreads;		/* worker thread count, -t */
	int cpus;		/* cores to spread threads over, -c (0 = all) */
	int privs;	// 1 if has IO privileges (/dev/io opened, needed by test_cli)
	int arg;	// -l: microseconds in usleep, bytes in copy tests, clock id in test_time
	char *test_name;	/* -m selection, matched against tests[].name */
	void (*fn)(struct targ *);	/* selected workload body */
	uint64_t scale;	// scaling factor for the reported rate (1, 1000 or 1e6)
	char *scale_name;	// suffix printed with the rate ("", "K" or "M")
};

/*
 * Arguments for a new thread.
 */
struct targ {
	struct glob_arg *g;	/* shared configuration */
	int		completed;	/* set when the workload returns (or creation fails) */
	u_int		*glob_ctr;	/* this thread's slot inside g->v.ctr */
	uint64_t volatile count;	/* iterations done; polled by the main reporting loop */
	struct timeval	tic, toc;	/* workload start / end timestamps */
	int		me;		/* thread index */
	pthread_t	thread;		/* pthread handle, for cancel/join */
	int		affinity;	/* core to pin to, or -1 for unbound */
};


static struct targ *ta;
static int global_nthreads;

/*
 * Control-C handler: cancel every thread that has not completed,
 * then restore the default handler so a second ^C kills the process.
 * NOTE(review): D()/pthread_cancel() are not async-signal-safe; this
 * is tolerable in a benchmark tool but not in production code.
 */
static void
sigint_h(int sig)
{
	int i;

	(void)sig;	/* UNUSED */
	for (i = 0; i < global_nthreads; i++) {
		/* cancel active threads. */
		if (ta[i].completed)
			continue;
		D("Cancelling thread #%d", i);	/* D() already appends '\n' */
		pthread_cancel(ta[i].thread);
		ta[i].completed = 0;
	}
	signal(SIGINT, SIG_DFL);
}


/* sysctl wrapper to return the number of active CPUs */
static int
system_ncpus(void)
{
#ifdef linux
	return 1;	/* no hw.ncpu sysctl here; affinity is unsupported anyway */
#else
	int mib[2] = { CTL_HW, HW_NCPU}, ncpus;
	/*
	 * The in/out length must describe the result buffer (ncpus),
	 * not the mib array: the old 'sizeof(mib)' claimed an 8-byte
	 * buffer when only sizeof(int) bytes were available.
	 */
	size_t len = sizeof(ncpus);

	/* on failure fall back to 1 CPU; ncpus was previously used uninitialized */
	if (sysctl(mib, sizeof(mib) / sizeof(mib[0]), &ncpus, &len, NULL, 0) != 0)
		ncpus = 1;
	D("system had %d cpus", ncpus);

	return (ncpus);
#endif
}

/*
 * Acquire I/O privileges (for cli/sti) by opening /dev/io.
 * Returns 1 on success, 0 otherwise.  The descriptor is deliberately
 * kept open: the privilege lasts only while /dev/io remains open.
 */
int
getprivs(void)
{
	int fd;

	fd = open("/dev/io", O_RDWR);
	if (fd >= 0)
		return 1;
	D("cannot open /dev/io, fd %d", fd);
	return 0;
}

/* set the thread affinity. */
/* ARGSUSED */
#ifdef HAVE_AFFINITY
/*
 * Pin thread 'me' to core i; i == -1 means "leave unbound".
 * Returns 0 on success, 1 on failure.
 */
static int
setaffinity(pthread_t me, int i)
{
	cpuset_t cpumask;

	if (i == -1)
		return 0;

	/* build a mask containing only core i */
	CPU_ZERO(&cpumask);
	CPU_SET(i, &cpumask);

	if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) == 0)
		return 0;
	D("Unable to set affinity");
	return 1;
}
#endif


/*
 * Thread body: optionally pin to the requested core, then time a
 * single run of the configured workload and mark the slot completed.
 */
static void *
td_body(void *data)
{
	struct targ *t = (struct targ *) data;

#ifdef HAVE_AFFINITY
	if (0 == setaffinity(t->thread, t->affinity))
#endif
	{
		/* main loop.*/
		/* m_cycles is int64_t: "%ld" is wrong where long is 32 bits */
		D("testing %" PRId64 " cycles", t->g->m_cycles);
		gettimeofday(&t->tic, NULL);
		t->g->fn(t);
		gettimeofday(&t->toc, NULL);
	}
	t->completed = 1;
	return (NULL);
}

/* Workload: select(2) on stdin with a t->g->arg microsecond timeout. */
void
test_sel(struct targ *t)
{
	int64_t m;

	for (m = 0; m < t->g->m_cycles; m++) {
		struct timeval to = { 0, t->g->arg };
		fd_set r;

		FD_ZERO(&r);
		FD_SET(0, &r);
		select(1, &r, NULL, NULL, &to);
		t->count++;
	}
}

/* Workload: poll(2) on stdin; t->g->arg is in usec, poll wants ms. */
void
test_poll(struct targ *t)
{
	int64_t m;
	int timeout_ms = t->g->arg / 1000;

	for (m = 0; m < t->g->m_cycles; m++) {
		struct pollfd pfd = { .fd = 0, .events = POLLIN };

		poll(&pfd, 1, timeout_ms);
		t->count++;
	}
}

/* Workload: one usleep(t->g->arg) per iteration. */
void
test_usleep(struct targ *t)
{
	int64_t left;

	for (left = t->g->m_cycles; left > 0; left--) {
		usleep(t->g->arg);
		t->count++;
	}
}

/*
 * Workload: toggle interrupts with cli/sti (requires the I/O
 * privileges obtained by getprivs()); the 'and' keeps the pipeline
 * busy between the two privileged instructions.
 */
void
test_cli(struct targ *t)
{
        int64_t m, i;
	if (!t->g->privs) {
		D("%s", "privileged instructions not available");
		return;
	}
        for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
			__asm __volatile("cli;");
			__asm __volatile("and %eax, %eax;");
			__asm __volatile("sti;");
			t->count++;
		}
        }
}

/*
 * Workload: six nops per iteration, as a baseline for raw issue rate.
 * The volatile asm keeps the compiler from deleting the loop body.
 */
void
test_nop(struct targ *t)
{
        int64_t m, i;
        for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
			__asm __volatile("nop;");
			__asm __volatile("nop; nop; nop; nop; nop;");
			t->count++;
		}
	}
}

/* Workload: serialized timestamp reads (cpuid fence + rdtsc). */
void
test_rdtsc1(struct targ *t)
{
	int64_t outer, inner;
	uint64_t stamp;

	(void)stamp;
	for (outer = 0; outer < t->g->m_cycles; outer++) {
		for (inner = 0; inner < ONE_MILLION; inner++) {
			my_rdtsc(stamp);
			t->count++;
		}
	}
}

/* Workload: raw rdtsc reads, not serialized against reordering. */
void
test_rdtsc(struct targ *t)
{
	int64_t outer, inner;
	volatile uint64_t stamp;

	(void)stamp;
	for (outer = 0; outer < t->g->m_cycles; outer++) {
		for (inner = 0; inner < ONE_MILLION; inner++) {
			stamp = rdtsc();
			t->count++;
		}
	}
}

/* Workload: plain (non-atomic) increment of the shared counter slot. */
void
test_add(struct targ *t)
{
	int64_t outer, inner;

	for (outer = 0; outer < t->g->m_cycles; outer++) {
		for (inner = 0; inner < ONE_MILLION; inner++) {
			t->glob_ctr[0]++;
			t->count++;
		}
	}
}

/* Workload: locked add on the shared counter slot. */
void
test_atomic_add(struct targ *t)
{
	int64_t outer, inner;

	for (outer = 0; outer < t->g->m_cycles; outer++) {
		for (inner = 0; inner < ONE_MILLION; inner++) {
			atomic_add_int(t->glob_ctr, 1);
			t->count++;
		}
	}
}

/* Workload: compare-and-set on the shared counter slot; the compare
 * value rarely matches — we measure instruction cost, not success. */
void
test_atomic_cmpset(struct targ *t)
{
	int64_t outer, inner;

	for (outer = 0; outer < t->g->m_cycles; outer++) {
		for (inner = 0; inner < ONE_MILLION; inner++) {
			atomic_cmpset_32(t->glob_ctr, outer, inner);
			t->count++;
		}
	}
}

/* Workload: clock_gettime() with clock id t->g->arg (empty loop body
 * on OSX, which historically lacked clock_gettime). */
void
test_time(struct targ *t)
{
	int64_t m;

	for (m = t->g->m_cycles; m > 0; m--) {
#ifndef __APPLE__
		struct timespec ts;

		clock_gettime(t->g->arg, &ts);
#endif
		t->count++;
	}
}

/* Workload: one gettimeofday() call per iteration. */
void
test_gettimeofday(struct targ *t)
{
	int64_t m;
	struct timeval now;

	for (m = t->g->m_cycles; m > 0; m--) {
		gettimeofday(&now, NULL);
		t->count++;
	}
}

/*
 * Workload: getppid(), the simplest real syscall to benchmark —
 * unlike getpid(), glibc does not cache its result, so every call
 * actually enters the kernel.
 */
void
test_getpid(struct targ *t)
{
	int64_t m;

	for (m = t->g->m_cycles; m > 0; m--) {
		getppid();
		t->count++;
	}
}


#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

/*
 * Copy l bytes from _src to _dst using unrolled 64-bit loads/stores,
 * 64 bytes per loop pass; copies of 1 KB or more fall back to bcopy().
 *
 * NOTE(review): the loop rounds l UP to a multiple of 64, so it may
 * read and write up to 63 bytes past the end of either buffer, and it
 * assumes both pointers are 8-byte aligned.  This is intentional for
 * netmap-style packet buffers, which guarantee that slack; generic
 * callers must not use it on tightly-sized buffers.
 */
static void
fast_bcopy(void *_src, void *_dst, int l)
{
	uint64_t *src = _src;
	uint64_t *dst = _dst;
	if (unlikely(l >= 1024)) {
		bcopy(src, dst, l);
		return;
	}
	for (; likely(l > 0); l-=64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}

// XXX if you want to make sure there is no inlining...
// static void (*fp)(void *_src, void *_dst, int l) = fast_bcopy;

#define HU	0x3ffff	/* slot-index mask: 0x40000 rotating copy destinations */
/* NOTE(review): this is roughly 1 GB of BSS (0x40000 * sizeof(struct
 * glob_arg)); only touched pages are faulted in, but verify on small
 * machines. The rotation defeats cache reuse across iterations. */
static struct glob_arg huge[HU+1];

/* Workload: copy 'arg' bytes (capped at sizeof(struct glob_arg)) from
 * the shared state into a rotating huge[] slot with fast_bcopy(). */
void
test_fastcopy(struct targ *t)
{
	int64_t m;
	int len = min(t->g->arg, (int)sizeof(struct glob_arg));

	D("fast copying %d bytes", len);
	for (m = 0; m < t->g->m_cycles; m++) {
		fast_bcopy(t->g, (void *)&huge[m & HU], len);
		t->count += 1;
	}
}

/* Workload: libc bcopy() of 'len' bytes into a rotating huge[] slot. */
void
test_bcopy(struct targ *t)
{
	int64_t m;
	int len = min(t->g->arg, (int)sizeof(struct glob_arg));

	D("bcopying %d bytes", len);
	for (m = 0; m < t->g->m_cycles; m++) {
		bcopy(t->g, (void *)&huge[m & HU], len);
		t->count += 1;
	}
}

/*
 * Workload: __builtin_memcpy() of 'len' bytes from the shared state
 * into a rotating huge[] slot, mirroring test_memcpy()/test_bcopy().
 */
void
test_builtin_memcpy(struct targ *t)
{
        int64_t m;
	int len = t->g->arg;

	if (len > (int)sizeof(struct glob_arg))
		len = sizeof(struct glob_arg);
	D("__builtin_memcpy %d bytes", len);
        for (m = 0; m < t->g->m_cycles; m++) {
		/*
		 * memcpy takes (dst, src): the original had the arguments
		 * swapped, copying huge[] over *t->g and trashing the
		 * shared test state (including m_cycles) on the first
		 * pass — likely skewing the reported numbers.
		 */
		__builtin_memcpy((void *)&huge[m & HU], t->g, len);
		t->count += 1;
        }
}

/* Workload: libc memcpy() of 'len' bytes into a rotating huge[] slot. */
void
test_memcpy(struct targ *t)
{
	int64_t m;
	int len = min(t->g->arg, (int)sizeof(struct glob_arg));

	D("memcopying %d bytes", len);
	for (m = 0; m < t->g->m_cycles; m++) {
		memcpy((void *)&huge[m & HU], t->g, len);
		t->count += 1;
	}
}

/* One benchmark: entry point, CLI name, report divisor, default cycles. */
struct entry {
	void (*fn)(struct targ *);	/* workload body */
	char *name;		/* -m selector string */
	uint64_t scale;		/* rate divisor for reporting (1, 1000 or 1e6) */
	uint64_t m_cycles;	/* default iteration count when -n is absent */
};
/* Table of selectable workloads, terminated by a NULL entry. */
struct entry tests[] = {
	{ test_sel, "select", 1, 1000 },
	{ test_poll, "poll", 1, 1000 },
	{ test_usleep, "usleep", 1, 1000 },
	{ test_time, "time", 1, 1000 },
	{ test_gettimeofday, "gettimeofday", 1, 1000000 },
	{ test_getpid, "getpid", 1, 1000000 },
	{ test_bcopy, "bcopy", 1000, 100000000 },
	{ test_builtin_memcpy, "__builtin_memcpy", 1000, 100000000 },
	{ test_memcpy, "memcpy", 1000, 100000000 },
	{ test_fastcopy, "fastcopy", 1000, 100000000 },
	{ test_add, "add", ONE_MILLION, 100000000 },
	{ test_nop, "nop", ONE_MILLION, 100000000 },
	{ test_atomic_add, "atomic-add", ONE_MILLION, 100000000 },
	{ test_cli, "cli", ONE_MILLION, 100000000 },
	{ test_rdtsc, "rdtsc", ONE_MILLION, 100000000 },	// unserialized
	{ test_rdtsc1, "rdtsc1", ONE_MILLION, 100000000 },	// serialized (cpuid fence)
	{ test_atomic_cmpset, "cmpset", ONE_MILLION, 100000000 },
	{ NULL, NULL, 0, 0 }	/* sentinel */
};

/* Print the option summary and the list of available tests, then exit. */
static void
usage(void)
{
	const char *cmd = "test";
	int idx;

	fprintf(stderr,
		"Usage:\n"
		"%s arguments\n"
		"\t-m name		test name\n"
		"\t-n cycles		(millions) of cycles\n"
		"\t-l arg		bytes, usec, ... \n"
		"\t-t threads		total threads\n"
		"\t-c cores		cores to use\n"
		"\t-a n			force affinity every n cores\n"
		"\t-A n			cache contention every n bytes\n"
		"\t-w report_ms		milliseconds between reports\n"
		"",
		cmd);
	fprintf(stderr, "Available tests:\n");
	for (idx = 0; tests[idx].name != NULL; idx++)
		fprintf(stderr, "%12s\n", tests[idx].name);

	exit(0);
}

/*
 * Parse a number with an optional k/K, m/M, g/G, t/T decimal suffix
 * (e.g. "4k" -> 4000); base is auto-detected ("0x..." works).
 * Characters after the first non-digit are ignored.
 */
static int64_t
getnum(const char *s)
{
	int64_t n;
	char *e;

	/* strtoll, not strtol: long is 32 bits on ILP32 targets, which
	 * would truncate large -n arguments before scaling */
	n = strtoll(s, &e, 0);
	switch (e ? *e : '\0')  {
	case 'k':
	case 'K':
		return n*1000;
	case 'm':
	case 'M':
		return n*1000*1000;
	case 'g':
	case 'G':
		return n*1000*1000*1000;
	case 't':
	case 'T':
		return n*1000*1000*1000*1000;
	default:
		return n;
	}
}

struct glob_arg g;
/*
 * Entry point: parse the command line, resolve the requested test in
 * tests[], spawn the worker threads (optionally pinned to cores),
 * then poll their counters and print a rate every 'report_interval'
 * milliseconds until every worker completes.
 */
int
main(int argc, char **argv)
{
	int i, ch, report_interval, affinity, align;

	ND("g has size %d", (int)sizeof(g));
	report_interval = 250;	/* ms between rate reports */
	affinity = 0;		/* 0: do not pin threads */
	align = 0;		/* byte stride between per-thread counters */

	bzero(&g, sizeof(g));

	g.privs = getprivs();
	g.nthreads = 1;
	g.cpus = 1;
	g.m_cycles = 0;

	while ( (ch = getopt(argc, argv, "A:a:m:n:w:c:t:vl:")) != -1) {
		switch(ch) {
		default:
			D("bad option %c %s", ch, optarg);
			usage();
			break;
		case 'A':	/* counter stride in bytes (cache contention) */
			align = atoi(optarg);
			break;
		case 'a':	/* force affinity every n cores */
			affinity = atoi(optarg);
			break;
		case 'n':	/* cycles */
			g.m_cycles = getnum(optarg);
			break;
		case 'w':	/* report interval, ms */
			report_interval = atoi(optarg);
			break;
		case 'c':	/* cores to use */
			g.cpus = atoi(optarg);
			break;
		case 't':	/* total threads */
			g.nthreads = atoi(optarg);
			break;
		case 'm':	/* test name */
			g.test_name = optarg;
			break;
		case 'l':	/* test argument: bytes, usec, ... */
			g.arg = getnum(optarg);
			break;

		case 'v':
			verbose++;
			break;
		}
	}
	argc -= optind;
	argv += optind;
	/* also accept the test name as the first positional argument */
	if (!g.test_name && argc > 0)
		g.test_name = argv[0];

	/* resolve the test name against the tests[] table */
	if (g.test_name) {
		for (i = 0; tests[i].name; i++) {
			if (!strcmp(g.test_name, tests[i].name)) {
				g.fn = tests[i].fn;
				g.scale = tests[i].scale;
				if (g.m_cycles == 0)
					g.m_cycles = tests[i].m_cycles;
				if (g.scale == ONE_MILLION)
					g.scale_name = "M";
				else if (g.scale == 1000)
					g.scale_name = "K";
				else {
					g.scale = 1;
					g.scale_name = "";
				}
				break;
			}
		}
	}
	if (!g.fn) {
		D("%s", "missing/unknown test name");
		usage();
	}
	i = system_ncpus();
	if (g.cpus < 0 || g.cpus > i) {
		D("%d cpus is too high, have only %d cpus", g.cpus, i);
		usage();
	}
	if (g.cpus == 0)
		g.cpus = i;	/* 0 means "use all cores" */
	if (g.nthreads < 1) {
		D("bad nthreads %d, using 1", g.nthreads);
		g.nthreads = 1;
	}
	/*
	 * Clamp 'align' (per-thread byte stride into g.v.ctr) so the last
	 * thread's counter still lands inside the array.  The previous
	 * expression "sizeof(ctr) / nthreads * sizeof(ctr[0])" multiplied
	 * by the element size instead of staying in bytes, allowing
	 * out-of-bounds counter pointers on strided multi-thread runs.
	 */
	i = sizeof(g.v.ctr) / g.nthreads;
	if (align < 0 || align > i) {
		D("bad align %d, max is %d", align, i);
		align = i;
	}

	/* Install ^C handler. */
	global_nthreads = g.nthreads;
	signal(SIGINT, sigint_h);

	ta = calloc(g.nthreads, sizeof(*ta));
	if (ta == NULL) {	/* previously unchecked */
		D("cannot allocate %d thread descriptors", g.nthreads);
		return (1);
	}
	/*
	 * Now create the desired number of threads, each one
	 * using a single descriptor.
	 */
	D("start %d threads on %d cores", g.nthreads, g.cpus);
	for (i = 0; i < g.nthreads; i++) {
		struct targ *t = &ta[i];
		bzero(t, sizeof(*t));
		t->g = &g;
		t->me = i;
		t->glob_ctr = &g.v.ctr[(i*align)/sizeof(g.v.ctr[0])];
		D("thread %d ptr %p", i, t->glob_ctr);
		t->affinity = affinity ? (affinity*i) % g.cpus : -1;
		/* pthread_create() returns an error number, not -1 */
		if (pthread_create(&t->thread, NULL, td_body, t) != 0) {
			D("Unable to create thread %d", i);
			t->completed = 1;
		}
	}
	/* the main reporting loop */

    {
	uint64_t my_count = 0, prev = 0;
	uint64_t count = 0;
	double delta_t;
	struct timeval tic, toc;

	gettimeofday(&toc, NULL);
	for (;;) {
		struct timeval now, delta;
		uint64_t pps;
		int done = 0;

		/* sleep report_interval ms, using select() as a portable timer */
		delta.tv_sec = report_interval/1000;
		delta.tv_usec = (report_interval%1000)*1000;
		select(0, NULL, NULL, NULL, &delta);
		gettimeofday(&now, NULL);
		timersub(&now, &toc, &toc);	/* toc now holds the elapsed interval */
		my_count = 0;
		for (i = 0; i < g.nthreads; i++) {
			my_count += ta[i].count;
			if (ta[i].completed)
				done++;
		}
		pps = toc.tv_sec* ONE_MILLION + toc.tv_usec;
		if (pps < 10000) {
			/* interval too short to report; restore the baseline
			 * timestamp that timersub() overwrote above, which the
			 * original left holding a tiny delta */
			toc = now;
			continue;
		}
		pps = (my_count - prev)*ONE_MILLION / pps;
		D("%" PRIu64 " %scycles/s scale %" PRIu64 " in %dus", pps/g.scale,
			g.scale_name, g.scale, (int)(toc.tv_sec* ONE_MILLION + toc.tv_usec));
		prev = my_count;
		toc = now;	/* baseline for the next interval */
		if (done == g.nthreads)
			break;
	}
	D("total %" PRIu64 " cycles", prev);

	timerclear(&tic);
	timerclear(&toc);
	for (i = 0; i < g.nthreads; i++) {
		pthread_join(ta[i].thread, NULL);

		if (ta[i].completed == 0)
			continue;

		/*
		 * Collect the threads' output and compute the overall
		 * wall-clock span (earliest start to latest end).
		 */
		count += ta[i].count;
		if (!timerisset(&tic) || timercmp(&ta[i].tic, &tic, <))
			tic = ta[i].tic;
		if (!timerisset(&toc) || timercmp(&ta[i].toc, &toc, >))
			toc = ta[i].toc;
	}

	/* print output. */
	timersub(&toc, &tic, &toc);
	delta_t = toc.tv_sec + 1e-6* toc.tv_usec;
	D("total %8.6f seconds", delta_t);
    }

	return (0);
}
/* end of file */
Received on Wed Jan 23 2013 - 15:33:02 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:40:34 UTC