/*
 * lat_ctx.c - context switch timer 
 *
 * usage: lat_ctx [-s size] #procs [#procs....]
 *
 * Copyright (c) 1994 Larry McVoy.  Distributed under the FSF GPL with
 * additional restriction that results may published only if
 * (1) the benchmark is unmodified, and
 * (2) the version in the sccsid below is included in the report.
 * Support for this development by Sun Microsystems is gratefully acknowledged.
 */
/*
 * Version identification string (RCS Id keyword).  The header comment
 * above requires this version to be included when results are reported.
 */
char	*id = "$Id: lat_ctx.c,v 1.8 1997/02/13 03:19:17 lm Exp lm $\n";

#include "timing.h"
#include "bench.h"
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>

/* Number of timed runs per process count; main() reports the smallest. */
#define	CTX_TRIES	4
/* Scratch file that backs the mapping created for the -s option. */
#define	DATAFILE	"/tmp/lat_ctx.tmp"

/* Size in bytes of the per-process data array (0 unless -s is given). */
int	process_size = 0;
/* Pointer to an array that big; summed by sumit() on every token pass. */
int	*data;
/* Child pids, indexed by ring position; ctx() fills slots 1..procs-1. */
int	pids[100];

int	caches();
int	pipe_cost();
int	overhead();	/* declared only; no definition appears in this file */
int	sumit();
int	ctx(int overhead, int writes, int procs);
void	killem(int procs);
void	doit(int p[100][2], int rd, int wr);

int
main(int ac, char **av)
{
	int	i;
	int	tries;
	int	result;
	int	min;
	int	overhead;
	int	writes;

	if (ac < 2) {
usage:		printf("Usage: %s [-s kbytes] processes [processes ...]\n",
		    av[0]);
		exit(1);
	}

	/*
	 * If they specified a context size, get it.
	 */
	if (!strcmp(av[1], "-s")) {
		int	fd;

		if (ac < 3) {
			goto usage;
		}
		process_size = atoi(av[2]) * 1024;
		unlink(DATAFILE);
		fd = open(DATAFILE, O_RDWR|O_CREAT, 0666);
		data = (int *)malloc(process_size);
		write(fd, data, process_size);
		free(data);
		data = mmap(0, process_size,
		    PROT_READ, MAP_SHARED, fd, 0);
		close(fd);
		if ((int)data == -1) {
			perror("mmap");
			goto usage;
		}
		ac -= 2;
		av += 2;
	}

	overhead = pipe_cost();

	fprintf(stderr, "\n\"size=%dk ovr=%d\n", process_size/1024, overhead);

	for (i = 1; i < ac; ++i) {
		/*
		 * OK, assume that the worst clock is 10msec.
		 * We want 10 times that.
		 */
		writes = 100;
		do {
			writes <<= 1;
			result = ctx(overhead, writes, 10);
		} while (((result + overhead) * writes) < 100000);

		/*
		 * Now make each subtest take about a second.
		 */
		writes *= 10;

		min = 0x7fffffff;
		for (tries = 0; tries < CTX_TRIES; ++tries) {
			result = ctx(overhead, writes, atoi(av[i]));
			if (min > result) {
				min = result;
			}
		}
	    	fprintf(stderr, "%d %d\n", atoi(av[i]), min);
	}
	return (0);
}

/*
 * Time a token being passed around a ring of processes joined by pipes.
 *
 * One pipe is created per ring member.  Children 1..procs-1 are forked
 * and each runs doit(), reading from the previous member's pipe and
 * writing to its own.  This process closes the ring: it writes into
 * pipe 0 and reads from pipe procs-1.  Every member calls sumit()
 * between its read and its write.
 *
 *   overhead - per-trip cost to subtract from the result
 *   writes   - nominal number of pipe trips to time (the ring is
 *              circled writes / procs times)
 *   procs    - number of ring members including this process;
 *              must be in 1..100
 *
 * Returns (elapsed / writes) - overhead, where elapsed is the value
 * reported by stop(0,0) (presumably microseconds, matching pipe_cost()).
 * Exits with status 1 on bad arguments or on any pipe/fork failure.
 *
 * NOTE(review): when procs does not divide writes, fewer than `writes`
 * trips are actually made but the elapsed time is still divided by
 * `writes`; left as is so results stay comparable.
 */
int
ctx(int overhead, int writes, int procs)
{
	int	p[100][2];
	int	max_procs = (int)(sizeof(p) / sizeof(p[0]));
	int	msg = 0, i;
	int	elapsed;
	int	sum;

	/*
	 * p[] and pids[] hold 100 entries, and both procs and writes are
	 * used as divisors below.
	 */
	if (procs < 1 || procs > max_procs || writes < 1) {
		fprintf(stderr, "ctx: bad arguments (procs=%d, writes=%d)\n",
		    procs, writes);
		exit(1);
	}

	/*
	 * Get a bunch of pipes.
	 */
	for (i = 0; i < procs; ++i) {
		if (pipe(p[i]) == -1) {
			perror("pipe");
			exit(1);
		}
	}

	/*
	 * Use the pipes as a ring, and fork off a bunch of processes
	 * to pass the byte through their part of the ring.
	 * The parent ignores SIGTERM; children restore the default
	 * disposition in doit().
	 */
	signal(SIGTERM, SIG_IGN);
	for (i = 1; i < procs; ++i) {
		pids[i] = fork();
		if (pids[i] == -1) {
			perror("fork");
			/*
			 * Only children 1..i-1 exist.  Later slots may still
			 * hold pids from an earlier run, so do not signal
			 * them, and do not carry on as if we were a child.
			 */
			killem(i);
			exit(1);
		}
		if (pids[i] == 0) {	/* child */
			doit(p, i-1, i);
			/* NOTREACHED */
			exit(1);
		}
	}

	/*
	 * Go once around the loop to make sure that everyone is ready and
	 * to get the token in the pipeline.
	 */
	if (write(p[0][1], &msg, sizeof(msg)) != sizeof(msg) ||
	    read(p[procs-1][0], &msg, sizeof(msg)) != sizeof(msg) ||
	    write(p[0][1], &msg, sizeof(msg)) != sizeof(msg)) {
		perror("write/read/write on pipe");
		killem(procs);	/* don't leave children blocked on the ring */
		exit(1);
	}
	bzero(data, process_size);	/* make sure we have our own copy */

	/*
	 * Main process - all others should be ready to roll, time the
	 * loop.
	 */
	start(0);
	for (i = writes / procs; i--; ) {
		if (read(p[procs-1][0], &msg, sizeof(msg)) != sizeof(msg)) {
			perror("read/write on pipe");
			killem(procs);
			exit(1);
		}
		sum = sumit();
		if (write(p[0][1], &msg, sizeof(msg)) != sizeof(msg)) {
			perror("read/write on pipe");
			killem(procs);
			exit(1);
		}
	}
	elapsed = stop(0,0);

	/*
	 * Close the pipes and kill the children.
	 */
	killem(procs);
	for (i = 0; i < procs; ++i) {
		close(p[i][0]);
		close(p[i][1]);
		if (i > 0) {
			wait(0);	/* reap one child per forked member */
		}
	}

	/*
	 * We know the overhead cost of each pipe trip, but we did it
	 * write times.
	 */
	return ((elapsed / writes) - overhead);
}

void
killem(int procs)
{
	int	i;

	for (i = 1; i < procs; ++i) {
		if (pids[i] > 0) {
			kill(pids[i], SIGTERM);
		}
	}
}

/*
 * Body of a child ring member; never returns.
 *
 * Restores the default SIGTERM action, zeroes the data array, then loops
 * forever: read one int from the read end of pipe `rd`, sum the data
 * array, and write the int to the write end of pipe `wr`.  A short or
 * failed read/write ends the process with status 1.
 */
void
doit(int p[100][2], int rd, int wr)
{
	int	in = p[rd][0];
	int	out = p[wr][1];
	int	token, total;

	signal(SIGTERM, SIG_DFL);
	bzero(data, process_size);	/* make sure we have our own copy */

	while (1) {
		if (read(in, &token, sizeof(token)) != sizeof(token)) {
			break;
		}
		total = sumit();
		if (write(out, &token, sizeof(token)) != sizeof(token)) {
			break;
		}
	}

	/* Only reached when one of the pipe operations above failed. */
	perror("read/write on pipe");
	exit(1);
}

/*
 * Measure the overhead that ctx() subtracts from its results: the cost of
 * one write plus one read on a pipe within a single process, plus one
 * sumit() call, with no context switch involved.
 *
 * Up to 20 pipes are opened and used round-robin.  The timed region is
 * bracketed by LOOP_FIRST/LOOP_LAST from timing.h; presumably they repeat
 * the enclosed statements and leave the elapsed time in `i` and the
 * iteration count in `N` (confirm against timing.h).  The original note
 * here says the test is run several times and the smallest overhead is
 * taken as valid, on the reasoning that larger values probably included
 * other system activity; any such policy would live in those macros.
 *
 * The cost returned is the cost of going through one pipe once in usecs.
 * Exits with status 1 if no pipe can be created or a pipe operation fails.
 */
int
pipe_cost()
{
	int	p[100][2];
	int	msg = 0, sum, i, n, k, N;	/* msg's bytes get written */

	/*
	 * Get a bunch of pipes.
	 */
	n = 0;
	while (n < 20 && pipe(p[n]) != -1)
		n++;
	if (n == 0) {
		/* Without this the code below would use garbage descriptors. */
		perror("pipe");
		exit(1);
	}

	/*
	 * Measure the overhead of passing a byte around the ring.
	 * This first write leaves one message queued in pipe 0 for the
	 * whole run: every iteration below writes one and reads one on
	 * the same pipe.
	 */
	k = 0;
	if (write(p[k][1], &msg, sizeof(msg)) != sizeof(msg)) {
		perror("read/write on pipe");
		exit(1);
	}
	LOOP_FIRST(N, i, ENOUGH);
	if (write(p[k][1], &msg, sizeof(msg)) != sizeof(msg)) {
		perror("read/write on pipe");
		exit(1);
	}
	if (read(p[k][0], &msg, sizeof(msg)) != sizeof(msg)) {
		perror("read/write on pipe");
		exit(1);
	}
	if (++k == n) {
		k = 0;
	}
	sum = sumit();
	LOOP_LAST(N, i, ENOUGH);
	for (k = 0; k < n; ++k) {
		close(p[k][0]);
		close(p[k][1]);
	}
	/* printf("pipe cost=%d %d %d\n", i/N,i,N); */
	return (i / N);
}

/*
 * Sum the global data array and return the total.
 *
 * The array is read in fully unrolled chunks of 512 ints (HALFK); a tail
 * shorter than 512 ints is not read.  Callers store the result but do
 * not use it: the point is to touch the process's data between pipe
 * operations.
 *
 * The loop runs while at least one whole chunk remains (i >= 512).  The
 * earlier "i > 512" test skipped the last full chunk, so a 2K array was
 * never read at all.  The count is computed in signed arithmetic so that
 * a zero or negative process_size reads nothing.
 */
int
sumit()
{
	int	i, sum = 0;
	int	*d = data;

#define	TEN	sum+=d[0]+d[1]+d[2]+d[3]+d[4]+d[5]+d[6]+d[7]+d[8]+d[9];d+=10;
#define	FIFTY	TEN TEN TEN TEN TEN
#define	HUNDRED	FIFTY FIFTY
#define	HALFK	HUNDRED HUNDRED HUNDRED HUNDRED HUNDRED TEN sum+=*d++;sum+=*d++;

	/* HALFK consumes 5*100 + 10 + 2 = 512 ints and advances d past them. */
	for (i = process_size / (int)sizeof(int); i >= 512; i -= 512) {
		HALFK
	}
	return (sum);
}

/* Size in bytes of the buffer walked by the (disabled) cache flusher. */
#define	SIZE	(1024 * 1024)

/*
 * This is a huge unrolled loop that is supposed to blow the instruction
 * and the data caches in an attempt to get more reproducible numbers.
 * It sort of works.
 *
 * The flushing code is currently compiled out (#if 0).  In that
 * configuration the function does nothing and returns 0, so that a caller
 * using the result never reads an indeterminate value.  When enabled it
 * returns the sum of the bytes it walked.
 */
int
caches()
{
#if 0
	int	i, sum = 0;
	char	*d = (char *)malloc(SIZE);
	char	*save = d;

	bzero(d, SIZE);

#define	TEN	sum+=d[0]+d[1]+d[2]+d[3]+d[4]+d[5]+d[6]+d[7]+d[8]+d[9];d+=10;
#define	FIFTY	TEN TEN TEN TEN TEN
#define	HUNDRED	FIFTY FIFTY
#define	HALFK	HUNDRED HUNDRED HUNDRED HUNDRED HUNDRED TEN sum+=*d++;sum+=*d++;
#define	KILO	HALFK HALFK

#if 0
	for (i = SIZE; i > 8192; i -= 8192) {
		KILO KILO KILO KILO
		KILO KILO KILO KILO
	}
#else
	for (i = SIZE; i > 1024; i -= 1024) {
		KILO 
	}
#endif
	free(save);
	return (sum);
#else
	/* Flusher disabled: still return a defined value. */
	return (0);
#endif
}
