/* $Header: /usr/cvsroot/libutf/utf.c,v 1.1.1.1 2006/05/26 20:14:56 agc Exp $ */

/*
 * Copyright © 1996-2006 Alistair Crooks.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <config.h>

#include <stdio.h>

#ifdef HAVE_STRING_H
#include <string.h>
#endif

#ifdef HAVE_STDARG_H
#include <stdarg.h>
#endif

#include "ure.h"
#include "utf.h"

/*
 * number of bytes runetochar() produces for rune `r': 1 for values in
 * 0x0001-0x007f, 3 for 0x0800 and above, and 2 for everything else
 * (which includes 0 and any negative value).  `r' is evaluated more than
 * once, so it must not have side effects.
 */
#define RUNELEN(r) (((r) > 0 && (r) <= 0x007f) ? 1 : ((r) >= 0x0800) ? 3 : 2)

/* smaller of `a' and `b'; each argument may be evaluated twice */
#ifndef MIN
#define MIN(a, b)	(((a) < (b)) ? (a) : (b))
#endif /* !MIN */

/*
 * encode the Rune at `rp' as a UTF sequence stored at `cp'.  The result
 * is the number of bytes written (1, 2 or 3), or -1 should RUNELEN ever
 * report any other length.
 */
int
runetochar(char *cp, Rune *rp)
{
	int	nbytes;

	nbytes = RUNELEN(*rp);
	if (nbytes == 1) {
		/* 7-bit value: stored unchanged */
		cp[0] = (unsigned char)(*rp & 0x7f);
	} else if (nbytes == 2) {
		/* 110xxxxx 10xxxxxx: top 5 bits, then low 6 bits */
		cp[0] = (0xc0 | ((*rp & 0x07c0) >> 6));
		cp[1] = (0x80 | (*rp & 0x003f));
	} else if (nbytes == 3) {
		/* 1110xxxx 10xxxxxx 10xxxxxx: 4 + 6 + 6 bits */
		cp[0] = 0xe0 | ((*rp & 0xf000) >> 12);
		cp[1] = (0x80 | ((*rp & 0x0fc0) >> 6));
		cp[2] = (0x80 | (*rp & 0x003f));
	} else {
		/* can't happen */
		nbytes = -1;
	}
	return nbytes;
}

/*
 * decode the UTF sequence at `cp' into `*rp' and return the number of
 * bytes it occupied (1, 2 or 3).  A NULL `cp', or bytes that do not form
 * a 1-, 2- or 3-byte sequence, store Runeerror and report a length of 1.
 */
int
chartorune(Rune *rp, char *cp)
{
	if (cp == (char *) NULL) {
		*rp = Runeerror;
		return 1;
	}
	/* 0xxxxxxx: single byte */
	if ((cp[0] & 0x80) == 0) {
		*rp = (cp[0] & 0x7f);
		return 1;
	}
	/* 110xxxxx followed by one continuation byte */
	if ((cp[0] & 0xe0) == 0xc0 && (cp[1] & 0xc0) == 0x80) {
		*rp = ((cp[0] & 0x1f) << 6) | (cp[1] & 0x3f);
		return 2;
	}
	/* 1110xxxx followed by two continuation bytes */
	if ((cp[0] & 0xf0) == 0xe0 && (cp[1] & 0xc0) == 0x80 && (cp[2] & 0xc0) == 0x80) {
		*rp = ((cp[0] & 0x0f) << 12) | ((cp[1] & 0x3f) << 6) | (cp[2] & 0x3f);
		return 3;
	}
	/* `rune error' - hand back the error rune, length 1 */
	*rp = Runeerror;
	return 1;
}

/* report how many bytes the UTF encoding of Rune `r' occupies (see RUNELEN) */
int
runelen(long r)
{
	int	nbytes;

	nbytes = RUNELEN(r);
	return nbytes;
}

/*
 * return 1 if the first `n' bytes of `cp' hold at least one complete,
 * well-formed UTF sequence (1, 2 or 3 bytes long), and 0 otherwise.
 *
 * Only bytes in cp[0 .. n-1] are examined, so the buffer need not be
 * NUL-terminated.  Any `n' at least as large as the sequence length is
 * accepted; the buffer may hold further data after the first rune.
 */
int
fullrune(char *cp, int n)
{
	int	need;
	int	i;

	if (cp == (char *) NULL || n <= 0) {
		return 0;
	}
	/* the lead byte alone tells us how long the sequence has to be */
	if ((cp[0] & 0x80) == 0) {
		need = 1;
	} else if ((cp[0] & 0xe0) == 0xc0) {
		need = 2;
	} else if ((cp[0] & 0xf0) == 0xe0) {
		need = 3;
	} else {
		/* stray continuation byte, or a lead byte we do not decode */
		return 0;
	}
	if (n < need) {
		return 0;
	}
	/* every trailing byte must look like 10xxxxxx */
	for (i = 1 ; i < need ; i++) {
		if ((cp[i] & 0xc0) != 0x80) {
			return 0;
		}
	}
	return 1;
}

/* count the runes in UTF string `s', not including the terminating zero rune */
int
utflen(char *s)
{
	Rune	rune;
	int	count;
	int	step;

	/* decode one rune per pass; stop as soon as the zero rune shows up */
	for (count = 0 ; (step = chartorune(&rune, s), rune != 0) ; s += step) {
		count++;
	}
	return count;
}

/*
 * return the number of runes in the first `n' bytes of UTF string `s'.
 * Counting stops early at a zero rune.  A sequence whose decoded length
 * would run past the `n'-byte limit is not counted.
 *
 * The remaining byte budget is reduced by the encoded length of each
 * rune, so multi-byte runes cannot push the scan beyond `n' bytes.  Note
 * that chartorune() may still look at up to two bytes after its start
 * position while validating a multi-byte sequence.
 */
int
utfnlen(char *s, int n)
{
	Rune	r;
	int	rc;
	int	i;

	for (rc = 0 ; n > 0 ; n -= i) {
		i = chartorune(&r, s);
		if (r == 0 || i > n) {
			break;
		}
		s += i;
		rc++;
	}
	return rc;
}

/* count the bytes in UTF string `s' that precede its terminating zero rune */
int
utfbytes(char *s)
{
	Rune	rune;
	char	*end;
	int	step;

	end = s;
	/* walk rune by rune until the zero rune; `end' stops right on it */
	while ((step = chartorune(&rune, end)), rune != 0) {
		end += step;
	}
	return end - s;
}

/*
 * locate the first rune equal to `r' in UTF string `cp'.  Returns a
 * pointer to the start of its encoding, or NULL if the zero rune is
 * reached first.  The match is tested before the end-of-string check,
 * so looking for rune 0 yields a pointer to the terminator.
 */
char *
utfrune(char *cp, long r)
{
	Rune	cur;
	int	step;

	while ((step = chartorune(&cur, cp)), cur != r) {
		if (cur == 0) {
			return (char *) NULL;
		}
		cp += step;
	}
	return cp;
}

/*
 * locate the last rune equal to `r' in UTF string `cp'.  Returns a
 * pointer to the start of its encoding, or NULL when `r' never occurs.
 * The terminating zero rune takes part in the comparison.
 */
char *
utfrrune(char *cp, long r)
{
	Rune	cur;
	char	*found;
	int	step;

	for (found = (char *) NULL ; ; cp += step) {
		step = chartorune(&cur, cp);
		if (cur == r) {
			/* remember the most recent hit and keep scanning */
			found = cp;
		}
		if (cur == 0) {
			break;
		}
	}
	return found;
}

/*
 * return a pointer to the first occurrence of UTF string `little' in
 * UTF string `big', or NULL if there is none.  An empty `little' matches
 * at the very start of `big'.
 */
char *
utfutf(char *big, char *little)
{
	Rune	first;
	Rune	skip;
	char	*cp;
	int	bytes;

	bytes = utfbytes(little);
	if (bytes == 0) {
		return big;
	}
	(void) chartorune(&first, little);
	/*
	 * jump from one occurrence of the first rune to the next.  After a
	 * failed comparison, step over the rune at `cp' so that utfrune()
	 * does not find the same position again.
	 */
	for (cp = big ; (cp = utfrune(cp, first)) != (char *) NULL ; cp += chartorune(&skip, cp)) {
		/* strncmp stops at the end of `big', so it never reads past it */
		if (strncmp(cp, little, (size_t) bytes) == 0) {
			return cp;
		}
	}
	return (char *) NULL;
}

/*
 * compare two UTF strings rune by rune.  Returns 0 when they are equal.
 * Otherwise the result is the rune taken from `s2' minus the rune taken
 * from `s1' at the first position where they differ (note the operand
 * order).
 */
int
utfcmp(char *s1, char *s2)
{
	Rune	a;
	Rune	b;

	for (;;) {
		s1 += chartorune(&a, s1);
		s2 += chartorune(&b, s2);
		/* stop on the first mismatch or when either string ends */
		if (a != b || a == 0 || b == 0) {
			break;
		}
	}
	return b - a;
}

/*
 * compare at most `rc' runes of two UTF strings.  Returns 0 when the
 * first `rc' runes agree, when both strings end before that, or when
 * `rc' is not positive.  Otherwise the result is the rune from `s2'
 * minus the rune from `s1' at the first difference, the same operand
 * order as utfcmp().
 */
int
utfncmp(char *s1, char *s2, int rc)
{
	Rune	r1;
	Rune	r2;

	/* the budget is checked before decoding, so exactly `rc' runes are compared */
	while (rc-- > 0) {
		s1 += chartorune(&r1, s1);
		s2 += chartorune(&r2, s2);
		if (r1 != r2) {
			return r2 - r1;
		}
		if (r1 == 0) {
			/* both strings ended together */
			break;
		}
	}
	return 0;
}

/*
 * measure the leading part of `s1' made up only of runes that occur in
 * set `s2'.  Returns its length in bytes and stores the number of runes
 * it contains in `*rc'.
 */
int
utfspan(char *s1, char *s2, int *rc)
{
	Rune	rune;
	char	*end;
	int	step;

	*rc = 0;
	for (end = s1 ; ; end += step, (*rc)++) {
		step = chartorune(&rune, end);
		if (rune == 0) {
			break;
		}
		if (utfrune(s2, rune) == (char *) NULL) {
			/* first rune that is not a member of the set */
			break;
		}
	}
	return end - s1;
}

/*
 * measure the leading part of `s1' made up only of runes that do NOT
 * occur in set `s2'.  Returns its length in bytes and stores the number
 * of runes it contains in `*rc'.
 */
int
utfcspan(char *s1, char *s2, int *rc)
{
	Rune	rune;
	char	*end;
	int	step;

	*rc = 0;
	for (end = s1 ; ; end += step, (*rc)++) {
		step = chartorune(&rune, end);
		if (rune == 0) {
			break;
		}
		if (utfrune(s2, rune) != (char *) NULL) {
			/* first rune that is a member of the set */
			break;
		}
	}
	return end - s1;
}

/*
 * decode the rune that ends immediately before `s' into `*rp' and return
 * the length chartorune() reports for it.  The bytes before `s' are read
 * without any lower bound; the caller has to make sure they exist.
 */
int
priorrune(Rune *rp, char *s)
{
	int	back;

	if ((s[-1] & 0x80) == 0) {
		/* plain 7-bit byte */
		back = 1;
	} else if ((s[-2] & 0xe0) == 0xc0) {
		/* two bytes back is a 2-byte lead */
		back = 2;
	} else {
		/* otherwise assume a 3-byte sequence */
		back = 3;
	}
	return chartorune(rp, s - back);
}

/*
 * functionally equivalent to strpbrk for utf strings: return a pointer
 * to the first rune in `s' that also occurs in `charset', or NULL if `s'
 * contains none of them.
 *
 * The scan walks `s' and tests each rune for membership, so the result
 * is the earliest position in `s', whatever the order of `charset'.
 */
char *
utffindrune(char *s, char *charset)
{
	Rune	r;
	int	i;

	for (;;) {
		i = chartorune(&r, s);
		if (r == 0) {
			return (char *) NULL;
		}
		if (utfrune(charset, r) != (char *) NULL) {
			return s;
		}
		s += i;
	}
}

/*
 * compare the first `n' runes of array `rp' against UTF string `up'.
 * Returns 0 when they agree.  Otherwise the result is the rune decoded
 * from `up' minus the array rune at the first difference.
 *
 * The comparison also stops at the zero rune that terminates `up', so
 * the string is never decoded beyond its end, even when the array holds
 * a zero at the same position.
 */
int
runeutfncmp(Rune *rp, char *up, int n)
{
	Rune	r;
	int	diff;
	int	rc;

	for (diff = rc = 0 ; rc < n ; rc++) {
		up += chartorune(&r, up);
		if ((diff = r - rp[rc]) != 0 || r == 0) {
			break;
		}
	}
	return diff;
}

#ifndef HAVE_MEMMOVE
/*
 * fallback memmove for systems without one: copy `nbytes' bytes from
 * `src' to `dst' and return `dst'.  The copy direction is chosen so
 * that overlapping regions are handled correctly.  A count of zero or
 * less copies nothing.
 */
static char *
memmove(char *dst, char *src, int nbytes)
{
	int	i;

	if (dst >= src && dst <= &src[nbytes]) {
		/* dst starts inside the source: copy from the last byte down */
		for (i = nbytes ; i-- > 0 ; ) {
			dst[i] = src[i];
		}
	} else {
		/* no harmful overlap: plain front-to-back copy */
		for (i = 0 ; i < nbytes ; i++) {
			dst[i] = src[i];
		}
	}
	return dst;
}
#endif

/*
 * copy UTF string `src', including its terminating NUL byte, to `dst'
 * and return the value memmove() returns.  The regions may overlap;
 * `dst' must have room for utfbytes(src) + 1 bytes.
 */
char *
utfcpy(char *dst, char *src)
{
	/* the extra byte makes sure we get the null byte */
	return memmove(dst, src, utfbytes(src) + 1);
}

/*
 * copy at most `nbytes' bytes of UTF string `src' to `dst' and return
 * `dst'.  A terminating NUL byte is stored only when the whole of `src'
 * is shorter than `nbytes'; a truncated copy is left unterminated.
 * Nothing is written when `nbytes' is zero or negative.
 */
char *
utfncpy(char *dst, char *src, int nbytes)
{
	int	len;

	if (nbytes <= 0) {
		/* never hand a negative count on to memmove() */
		return dst;
	}
	len = utfbytes(src);
	(void) memmove(dst, src, MIN(len, nbytes));
	/* terminate after the move so an overlapping source is copied intact */
	if (len < nbytes) {
		dst[len] = 0;
	}
	return dst;
}

/*
 * copy UTF string `src' into the `nbytes'-byte buffer `dst' and return
 * `dst'.  In the manner of strlcpy, the result is always NUL-terminated
 * when `nbytes' is positive: at most nbytes - 1 bytes of `src' are
 * copied.  When `src' has to be cut short, the cut is moved back to the
 * start of a sequence so that no partial multi-byte rune is left at the
 * end.  Nothing is written when `nbytes' is zero or negative.
 */
char *
utflcpy(char *dst, char *src, int nbytes)
{
	int	len;

	if (nbytes <= 0) {
		return dst;
	}
	len = utfbytes(src);
	if (len >= nbytes) {
		/* leave room for the terminator */
		len = nbytes - 1;
		/* a 10xxxxxx byte at the cut means we are inside a rune */
		while (len > 0 && (src[len] & 0xc0) == 0x80) {
			len--;
		}
	}
	(void) memmove(dst, src, len);
	dst[len] = 0x0;
	return dst;
}

/*
 * append UTF string `append', terminator included, to the end of UTF
 * string `src' and return `src'.  No size check is made.
 */
char *
utfcat(char *src, char *append)
{
	char	*tail;

	/* `tail' is where the terminator of `src' currently sits */
	tail = &src[utfbytes(src)];
	(void) utfcpy(tail, append);
	return src;
}

/*
 * append UTF string `append' to UTF string `src', which lives in a
 * buffer of `slen' bytes in total, and return `src'.
 *
 * The result is always left NUL-terminated.  When `append' does not fit
 * completely it is cut short, and the cut is moved back to the start of
 * a sequence so that no partial multi-byte rune is left at the end.
 * `src' is returned untouched when the buffer has no spare room.
 */
char *
utfncat(char *src, char *append, int slen)
{
	int	room;
	int	alen;
	int	len;

	len = utfbytes(src);
	room = slen - len;
	if (room <= 1) {
		/* only the existing terminator fits (or slen is too small) */
		return src;
	}
	alen = utfbytes(append);
	if (alen >= room) {
		/* keep one byte back for the terminator */
		alen = room - 1;
		/* a 10xxxxxx byte at the cut means we are inside a rune */
		while (alen > 0 && (append[alen] & 0xc0) == 0x80) {
			alen--;
		}
	}
	(void) memmove(&src[len], append, alen);
	src[len + alen] = 0;
	return src;
}

/*
 * a particularly dumb implementation of snprintf - just does simple
 * %[sdcxo], plus %% for a literal percent sign.
 *
 * `buf' receives at most `size' bytes including the terminating NUL.
 * Numeric conversions accept an optional leading `0' flag, a decimal
 * field width and an `l' or `h' length modifier.  A conversion letter
 * that is not recognised is formatted as `d' and left in the format, so
 * it is then copied literally.  `%s' with a NULL argument prints
 * "(null)".
 *
 * Returns the number of bytes placed in `buf' before termination.  When
 * the output was truncated this equals `size', and the final byte has
 * been replaced by the NUL.  Returns 0 without touching `buf' when
 * `buf' is NULL or `size' is not positive.
 */
int
utf_snprintf(char *buf, int size, char *fmt, ...)
{
	va_list		vp;
	unsigned long	uval;
	long		sval;
	char		lfmt[8];
	char		lbuf[BUFSIZ];
	char		*lfp;
	char		*fp;
	char		*bp;
	char		*s;
	char		conv;
	int		width;
	int		zero;
	int		done;
	long		i;

	if (buf == (char *) NULL || size <= 0) {
		/* no room even for the terminator */
		return 0;
	}
	va_start(vp, fmt);
	for (bp = buf, fp = fmt, done = 0 ; bp - buf < size && !done ; ) {
		switch(*fp) {
		case '%':
			switch(*++fp) {
			case 0:
				/* format ends in a lone '%': nothing to convert */
				done = 1;
				break;
			case '%':
				*bp++ = *fp++;
				break;
			case 'c':
				*bp++ = (char) va_arg(vp, int);
				fp++;
				break;
			case 's':
				if ((s = (char *) va_arg(vp, char *)) == (char *) NULL) {
					s = "(null)";
				}
				for (i = utfbytes(s); bp - buf < size && i-- > 0 ; ) {
					*bp++ = *s++;
				}
				fp++;
				break;
			default:
				/* optional `0' flag and field width */
				zero = (*fp == '0');
				for (width = 0 ; *fp >= '0' && *fp <= '9' ; fp++) {
					if (width < (int) sizeof(lbuf)) {
						width = (width * 10) + (*fp - '0');
					}
				}
				/* clamp so the formatted number always fits in lbuf */
				if (width > (int) sizeof(lbuf) - 1) {
					width = (int) sizeof(lbuf) - 1;
				}
				/*
				 * fetch the argument at its promoted type, then
				 * widen it to long; the unsigned copy keeps the
				 * original width for %x and %o.
				 */
				if (*fp == 'l') {
					fp++;
					sval = va_arg(vp, long);
					uval = (unsigned long) sval;
				} else if (*fp == 'h') {
					fp++;
					sval = (short) va_arg(vp, int);
					uval = (unsigned short) sval;
				} else {
					sval = va_arg(vp, int);
					uval = (unsigned int) sval;
				}
				if (*fp == 'd' || *fp == 'x' || *fp == 'o') {
					conv = *fp++;
				} else {
					conv = 'd';
				}
				/* build "%[0]*l<conv>"; the width is passed as an argument */
				lfp = lfmt;
				*lfp++ = '%';
				if (zero) {
					*lfp++ = '0';
				}
				*lfp++ = '*';
				*lfp++ = 'l';
				*lfp++ = conv;
				*lfp = 0;
				if (conv == 'd') {
					(void) sprintf(lbuf, lfmt, width, sval);
				} else {
					(void) sprintf(lbuf, lfmt, width, uval);
				}
				for (s = lbuf ; bp - buf < size && *s != 0 ; ) {
					*bp++ = *s++;
				}
			}
			break;
		case 0:
			done = 1;
			break;
		default:
			*bp++ = *fp++;
		}
	}
	if (bp - buf < size) {
		*bp = 0;
	} else {
		/* output filled the buffer: give up the last byte for the NUL */
		buf[size - 1] = 0;
	}
	va_end(vp);
	return bp - buf;
}
