/* AbiSource Program Utilities
 * Copyright (C) 1998 AbiSource, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 */

#include <stdio.h>

#include <stdlib.h>
#include <string.h>

#ifdef __QNXNTO__
#include <strings.h>
#endif

#include <math.h>
#include <ctype.h>

#include "ut_types.h"
#include "ut_misc.h"
#include "ut_assert.h"
#include "ut_string.h"
#include "ut_debugmsg.h"
#include "ut_growbuf.h"
#include <fribidi.h>
#include "ut_mbtowc.h"
#include "ut_wctomb.h"

#include "ut_string_class.h"

#include "xap_EncodingManager.h"

#define UT_STRING_CPP
#include "ut_case.h"
#undef  UT_STRING_CPP

// really simple way to determine if something is a hyperlink
// probably not 100% correct, but better than !stricmp(http://), ...
bool UT_isUrl ( const char * szName )
{
  UT_return_val_if_fail(szName, false);

  int len = strlen ( szName );
  int mailto_len = strlen ( "mailto:" );

  if ( NULL != strstr ( szName, "://") ) // dumb check, but probably true
    return true;
  else if ( ( len >= mailto_len ) &&
	    !UT_XML_strnicmp ( "mailto:", szName, mailto_len ) )
    return true;
  else
    return false;
}

/*
 * This is cut & pasted from glib 1.3 (c) RedHat
 * We need this to convert UTF-32 to UTF-8
 */
int
unichar_to_utf8 (int c, unsigned char *outbuf)
{
  size_t len = 0;
  int first;
  int i;

  if (c < 0x80)
    {
      first = 0;
      len = 1;
    }
  else if (c < 0x800)
    {
      first = 0xc0;
      len = 2;
    }
  else if (c < 0x10000)
    {
      first = 0xe0;
      len = 3;
    }
   else if (c < 0x200000)
    {
      first = 0xf0;
      len = 4;
    }
  else if (c < 0x4000000)
    {
      first = 0xf8;
      len = 5;
    }
  else
    {
      first = 0xfc;
      len = 6;
    }

  if (outbuf)
    {
      for (i = len - 1; i > 0; --i)
	{
	  outbuf[i] = (c & 0x3f) | 0x80;
	  c >>= 6;
	}
      outbuf[0] = c | first;
    }

  return len;
}

UT_uint32 UT_pointerArrayLength(void ** array)
{
	if (! (array && *array))
		return 0;

	UT_uint32 i = 0;
	while (array[i])
		i++;

	return i;
}

////////////////////////////////////////////////////////////////////////
//
//  8-bit string (char)
//
//  String is built of 8-bit units (bytes)
//  Encoding could be any single-byte or multi-byte encoding
//
////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////
// char * UT_catPathname(const char * szPath, const char * szFile);
// is defined in platform-specific code.
//////////////////////////////////////////////////////////////////

char * UT_strdup(const char * szSource)
{
  UT_return_val_if_fail(szSource, NULL);

	int len = strlen(szSource)+1;
	if(char * ret = static_cast<char *>(UT_calloc(len, sizeof(char))))
		return(static_cast<char *>(memcpy(ret, szSource, len)));
	else
		return(NULL);
}

UT_sint32 UT_stricmp(const char * s1, const char * s2)
{
 UT_return_val_if_fail(s1, 1);
 UT_return_val_if_fail(s2, -1);

#if defined(HAVE_STRCASECMP)

  return strcasecmp(s1,s2);

#elif defined(HAVE_STRICMP)
  
  return stricmp(s1,s2);

#else

  // Lifted from glibc.  Looks better (in a constant-factor sort of way)
  // than what we had before.  Ideally this should be per-platform.
  const unsigned char *p1 = reinterpret_cast<const unsigned char *>(s1);
  const unsigned char *p2 = reinterpret_cast<const unsigned char *>(s2);
  unsigned char c1, c2;
  
  if (s1 == s2)
	  return 0;
  
  do
  {
	  c1 = tolower (*p1++);
	  c2 = tolower (*p2++);
	  if (c1 == '\0')
		  break;
  }
  while (c1 == c2);
  
  return c1 - c2;
#endif /* HAVE_STRCASECMP || HAVE_STRICMP */
}

// should really be a size_t, but that might break compilation on weird
// platforms.  I don't know.
UT_sint32 UT_strnicmp(const char *s1, const char *s2, int n)
{
  UT_return_val_if_fail(s1, 1);
  UT_return_val_if_fail(s2, -1);

	const unsigned char *p1 = reinterpret_cast<const unsigned char *>(s1);
	const unsigned char *p2 = reinterpret_cast<const unsigned char *>(s2);
	unsigned char c1, c2;

	if (p1 == p2 || n == 0)
		return 0;

	do
    {
		c1 = tolower (*p1++);
		c2 = tolower (*p2++);
		if (c1 == '\0' || c1 != c2)
			return c1 - c2;
    } while (--n > 0);

	return c1 - c2;
}

bool UT_cloneString(char *& rszDest, const char * szSource)
{
	if (szSource && *szSource)
	{
		UT_uint32 length = strlen(szSource) + 1;
		rszDest = static_cast<char *>(malloc(length));

		if (!rszDest)
			return false;

		memmove(rszDest, szSource, length);
	}
	else
		rszDest = NULL;
	
	return true;
}

bool UT_replaceString(char *& rszDest, const char * szSource)
{
	if (rszDest)
		free(rszDest);
	rszDest = NULL;

	return UT_cloneString(rszDest,szSource);
}

// convert each character in a string to ASCII uppercase
char * UT_upperString(char * string)
{
	if (!string)
		return 0;

	for (char * ch = string; *ch != 0; ch++)
		*ch = toupper(*ch);

	return string;
}

// convert each character in a string to ASCII lowercase
char * UT_lowerString(char * string)
{
	if (!string)
		return 0;

	for (char * ch = string; *ch != 0; ch++)
		*ch = tolower(*ch);

	return string;
}

////////////////////////////////////////////////////////////////////////
//
//  XML string (XML_Char)
//
//  String is built of 8-bit units (bytes)
//
////////////////////////////////////////////////////////////////////////

UT_uint32 UT_XML_strlen(const XML_Char * sz)
{
	if (!sz || !*sz)
		return 0;

	UT_uint32 k = 0;
	while (sz[k])
		k++;

	return k;
}

// Is this function implemented somewhere else?

bool UT_XML_cloneList(XML_Char **& rszDest, const XML_Char ** szSource)
{
	if (!szSource)
		return true;

	XML_Char ** newmemory = (XML_Char **)
		UT_calloc(UT_pointerArrayLength(reinterpret_cast<void **>(const_cast<XML_Char **>(szSource))) + 1, sizeof(XML_Char *));

	if (newmemory == NULL)
		return false;

	memcpy(static_cast<void *>(newmemory), static_cast<const void *>(szSource),
		   UT_pointerArrayLength(reinterpret_cast<void **>(const_cast<XML_Char **>(szSource)) ) * sizeof(XML_Char *));

	rszDest = newmemory;

	return true;
}

bool UT_XML_replaceList(XML_Char **& rszDest, const XML_Char ** szSource)
{
	FREEP(rszDest);

	return UT_XML_cloneList(rszDest, szSource);
}

bool UT_XML_cloneString(XML_Char *& rszDest, const XML_Char * szSource)
{
	UT_uint32 length = UT_XML_strlen(szSource) + 1;
	rszDest = static_cast<XML_Char *>(UT_calloc(length,sizeof(XML_Char)));
	if (!rszDest)
		return false;
	memmove(rszDest,szSource,length*sizeof(XML_Char));
	return true;
}

UT_sint32 UT_XML_stricmp(const XML_Char * sz1, const XML_Char * sz2)
{
	UT_ASSERT(sizeof(char) == sizeof(XML_Char));
	return UT_stricmp(static_cast<const char*>(sz1),static_cast<const char*>(sz2));
}

UT_sint32 UT_XML_strnicmp(const XML_Char * sz1, const XML_Char * sz2, const UT_uint32 n)
{
	UT_ASSERT(sizeof(char) == sizeof(XML_Char));
	return UT_strnicmp(static_cast<const char*>(sz1),static_cast<const char*>(sz2),n);
}

UT_sint32 UT_XML_strcmp(const XML_Char * sz1, const XML_Char * sz2)
{
	UT_ASSERT(sizeof(char) == sizeof(XML_Char));
	return strcmp(static_cast<const char*>(sz1),static_cast<const char*>(sz2));
}

bool UT_XML_cloneNoAmpersands(XML_Char *& rszDest, const XML_Char * szSource)
{
	if (szSource == NULL)
		return false;

	UT_uint32 length = UT_XML_strlen(szSource) + 1;
	rszDest = static_cast<XML_Char *>(UT_calloc(length, sizeof(XML_Char)));

	if (!rszDest)
		return false;

	const XML_Char * o = szSource;
	XML_Char * n = rszDest;
	while (*o != 0)
	{
		if (*o != '&')
		{
			*n = *o;
			n++;
		}
		o++;
	}

	return true;
}

/* This uses the clone no ampersands but dumps into a static buffer */
XML_Char *UT_XML_transNoAmpersands(const XML_Char * szSource)
{
	static XML_Char *rszDestBuffer = NULL;
	static UT_uint32 iDestBufferLength = 0;

	if (szSource == NULL)
		return NULL;

	UT_uint32 length = UT_XML_strlen(szSource) + 1;
	if (length > iDestBufferLength) {
		if (rszDestBuffer && iDestBufferLength) {
			free(rszDestBuffer);
		}
		iDestBufferLength = 0;
		rszDestBuffer = static_cast<XML_Char *>(UT_calloc(length, sizeof(XML_Char)));

		if (!rszDestBuffer)
			return NULL;

		iDestBufferLength = length;
	}
	memset(rszDestBuffer, 0, iDestBufferLength);

	const XML_Char * o = szSource;
	XML_Char * n = rszDestBuffer;
	while (*o != 0)
	{
		if (*o != '&')
		{
			*n = *o;
			n++;
		}
		o++;
	}

	return rszDestBuffer;
}


// TODO : put a better strncpy here; resolve to platform version if available

UT_uint32 UT_XML_strncpy(XML_Char * szDest, UT_uint32 nLen, const XML_Char * szSource)
{
	if (!szSource)
		return 0;

	UT_ASSERT(szDest);

	UT_uint32 i = 0;

	while (i < nLen)
	{
		szDest[i] = szSource[i];

		// if we just wrote NULL, return
		if (szDest[i] == 0)
			return i;

		i++;
	}

	return i;
}

UT_UCSChar UT_decodeUTF8char(const XML_Char * p, UT_uint32 len)
{
	UT_UCSChar ucs, ucs1, ucs2, ucs3;

	switch (len)
	{
	case 2:
		ucs1 = static_cast<UT_UCSChar>(p[0] & 0x1f);
		ucs2 = static_cast<UT_UCSChar>(p[1] & 0x3f);
		ucs  = (ucs1 << 6) | ucs2;
		return ucs;

	case 3:
		ucs1 = static_cast<UT_UCSChar>(p[0] & 0x0f);
		ucs2 = static_cast<UT_UCSChar>(p[1] & 0x3f);
		ucs3 = static_cast<UT_UCSChar>(p[2] & 0x3f);
		ucs  = (ucs1 << 12) | (ucs2 << 6) | ucs3;
		return ucs;

	default:
		UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
		return 0;
	}
}

void UT_decodeUTF8string(const XML_Char * pString, UT_uint32 len, UT_GrowBuf * pResult)
{
	// decode the given string [ p[0]...p[len] ] and append to the given growbuf.

	UT_ASSERT(sizeof(XML_Char) == sizeof(UT_Byte));
	const UT_Byte * p = reinterpret_cast<const UT_Byte *>(pString);	// XML_Char is signed...

	int bytesInSequence = 0;
	int bytesExpectedInSequence = 0;
	XML_Char buf[5];

	for (UT_uint32 k=0; k<len; k++)
	{
		if (p[k] < 0x80)						// plain us-ascii part of latin-1
		{
			UT_ASSERT(bytesInSequence == 0);
			UT_UCSChar c = p[k];
			pResult->append(reinterpret_cast<UT_GrowBufElement *>(&c),1);
		}
		else if ((p[k] & 0xf0) == 0xf0)			// lead byte in 4-byte surrogate pair
		{
			// surrogate pairs are defined in section 3.7 of the
			// unicode standard version 2.0 as an extension
			// mechanism for rare characters in future extensions
			// of the unicode standard.
			UT_ASSERT(bytesInSequence == 0);
			UT_ASSERT(UT_NOT_IMPLEMENTED);
		}
		else if ((p[k] & 0xe0) == 0xe0)			// lead byte in 3-byte sequence
		{
			UT_ASSERT(bytesInSequence == 0);
			bytesExpectedInSequence = 3;
			buf[bytesInSequence++] = p[k];
		}
		else if ((p[k] & 0xc0) == 0xc0)			// lead byte in 2-byte sequence
		{
			UT_ASSERT(bytesInSequence == 0);
			bytesExpectedInSequence = 2;
			buf[bytesInSequence++] = p[k];
		}
		else if ((p[k] & 0x80) == 0x80)			// trailing byte in multi-byte sequence
		{
			UT_ASSERT(bytesInSequence > 0);
			buf[bytesInSequence++] = p[k];
			if (bytesInSequence == bytesExpectedInSequence)		// final byte in multi-byte sequence
			{
				UT_UCSChar c = UT_decodeUTF8char(buf,bytesInSequence);
				pResult->append(reinterpret_cast<UT_GrowBufElement *>(&c),1);
				bytesInSequence = 0;
				bytesExpectedInSequence = 0;
			}
		}
	}
}

/*
  The following code is from the GNU C library, version 2.0.6.
  It has been reformatted and tweaked to do Unicode strstrs.
  All this licensing stuff is kinda ugly, but I didn't want
  to risk merging the licensing for fear I might break some law.
*/

/* Copyright (C) 1994, 1996 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the GNU C Library; see the file COPYING.LIB.  If not,
   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

////////////////////////////////////////////////////////////////////////
//
//  UCS-2 string (UT_UCS2Char)
//
//  String is built of 16-bit units (words)
//
//  TODO: Is this really UCS-2 or UTF-16?
//  TODO:  meaning, does it support surrogates or is it intended to
//  TODO:  support them at any time in the future?
//  TODO: Correctly, UCS-2 does not support surrogates and UTF-16 does.
//  TODO: BUT Microsoft calls their native Unicode encoding UCS-2
//  TODO:  while it supports surrogates and is thus really UTF-16.
//  TODO: Surrogates are Unicode characters with codepoints above
//  TODO:  65535 which cannot therefore fit into a 2-byte word.
//  TODO: This means that TRUE UCS-2 is a single-word encoding and
//  TODO:  UTF-16 is a multi-word encoding.
//
//  NOTE: We shouldn't actually need 16-bit strings anymore since
//  NOTE:  AbiWord is now fully converted to using 32-bit Unicode
//  NOTE:  internally. The only possible needs for this is for
//  NOTE:  Windows GUI, filesystem and API functions where applicable;
//  NOTE:  and perhaps some file formats or external libraries
//
////////////////////////////////////////////////////////////////////////

// Don't ifdef out strlen since it's used by the MSWord importer...

// TODO is this really UCS-2 or UTF-16?
// TODO and are we using strlen for the number of 16-bit words
// TODO or the number of characters?
// TODO Because UTF-16 characters are sometimes expressed as 2 words

UT_uint32 UT_UCS2_strlen(const UT_UCS2Char * string)
{
	UT_uint32 i;

	for(i = 0; *string != 0; string++, i++)
		;

	return i;
}

#ifdef ENABLE_UCS2_STRINGS
/*
 * My personal strstr() implementation that beats most other algorithms.
 * Until someone tells me otherwise, I assume that this is the
 * fastest implementation of strstr() in C.
 * I deliberately chose not to comment it.  You should have at least
 * as much fun trying to understand it, as I had to write it :-).
 *
 * Stephen R. van den Berg, berg@pool.informatik.rwth-aachen.de */

UT_UCS2Char * UT_UCS2_strstr(const UT_UCS2Char * phaystack, const UT_UCS2Char * pneedle)
{
	register const UT_UCS2Char *haystack, *needle;
	register UT_UCS2Char b, c;

	haystack = phaystack;
	needle = pneedle;

	b = *needle;
	if (b != '\0')
    {
		haystack--;                               /* possible ANSI violation */
		do
        {
			c = *++haystack;
			if (c == '\0')
				goto ret0;
        }
		while (c != b);

		c = *++needle;
		if (c == '\0')
			goto foundneedle;
		++needle;
		goto jin;

		for (;;)
        {
			register UT_UCS2Char a;
			register const UT_UCS2Char *rhaystack, *rneedle;

			do
            {
				a = *++haystack;
				if (a == '\0')
					goto ret0;
				if (a == b)
					break;
				a = *++haystack;
				if (a == '\0')
					goto ret0;
			shloop: ; // need a statement here for EGCS 1.1.1 to accept it
			}
			while (a != b);

		jin:	a = *++haystack;
			if (a == '\0')
				goto ret0;

			if (a != c)
				goto shloop;

			rhaystack = haystack-- + 1;
			rneedle = needle;
			a = *rneedle;

			if (*rhaystack == a)
				do
				{
					if (a == '\0')
						goto foundneedle;
					++rhaystack;
					a = *++needle;
					if (*rhaystack != a)
						break;
					if (a == '\0')
						goto foundneedle;
					++rhaystack;
					a = *++needle;
				}
				while (*rhaystack == a);

			needle = rneedle;             /* took the register-poor approach */

			if (a == '\0')
				break;
        }
    }
 foundneedle:
	return static_cast<UT_UCS2Char *>(haystack);
 ret0:
	return 0;
}

UT_sint32 UT_UCS2_strcmp(const UT_UCS2Char* left, const UT_UCS2Char* right)
{
	UT_ASSERT(left);
	UT_ASSERT(right);

	while (*left && *right)
	{
		if (*left < *right)
		{
			return -1;
		}

		if (*left > *right)
		{
			return 1;
		}

		left++;
		right++;
	}

	if (*left)
	{
		return -1;
	}
	else if (*right)
	{
		return 1;
	}
	else
	{
		return 0;
	}
}

/*
  Latin-1 Unicode case-insensitive string comparison and casing done by
  Pierre Sarrazin <ps@cam.org>.
*/

/**
 * Convert a given character to uppercase
 */
UT_UCS2Char UT_UCS2_toupper(UT_UCS2Char c)
{
        if (c < 128) // in ASCII range
	  return toupper(c);

	if (XAP_EncodingManager::get_instance()->single_case())
		return c;
	/*let's trust libc! -- does not seem to work :(*/
    case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, NrElements(case_table),sizeof(case_entry),s_cmp_case));
    if(!letter || letter->type == 1)
        return c;
    return letter->other;
}


/*	Converts the given character to lowercase if it is an uppercase letter.
	Returns it unchanged if it is not.
	This function created by Pierre Sarrazin 1999-02-06
*/

UT_UCS2Char UT_UCS2_tolower(UT_UCS2Char c)
{
	if (c < 128)
		return tolower(c);
	if (XAP_EncodingManager::get_instance()->single_case())
		return c;
	/*let's trust libc!*/
    case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, NrElements(case_table),sizeof(case_entry),s_cmp_case));
    if(!letter || letter->type == 0)
        return c;
    return letter->other;
}


/*	Characters are converted to lowercase (if applicable) when they
	are read from the needle or the haystack. See UT_UCS_tolower().
	This function created by Pierre Sarrazin 1999-02-06
*/

UT_UCS2Char * UT_UCS2_stristr(const UT_UCS2Char * phaystack, const UT_UCS2Char * pneedle)
{
	register const UT_UCS2Char *haystack, *needle;
	register UT_UCS2Char b, c;

	haystack = phaystack;
	needle = pneedle;

	b = UT_UCS2_tolower(*needle);
	if (b != '\0')
    {
		haystack--;                               /* possible ANSI violation */
		do
        {
			c = UT_UCS2_tolower(*++haystack);
			if (c == '\0')
				goto ret0;
        }
		while (c != b);

		c = UT_UCS2_tolower(*++needle);
		if (c == '\0')
			goto foundneedle;
		++needle;
		goto jin;

		for (;;)
        {
			register UT_UCS2Char a;
			register const UT_UCS2Char *rhaystack, *rneedle;

			do
            {
				a = UT_UCS2_tolower(*++haystack);
				if (a == '\0')
					goto ret0;
				if (a == b)
					break;
				a = UT_UCS2_tolower(*++haystack);
				if (a == '\0')
					goto ret0;
			shloop: ; // need a statement here for EGCS 1.1.1 to accept it
			}
			while (a != b);

		jin:	a = UT_UCS2_tolower(*++haystack);
			if (a == '\0')
				goto ret0;

			if (a != c)
				goto shloop;

			rhaystack = haystack-- + 1;
			rneedle = needle;
			a = UT_UCS2_tolower(*rneedle);

			if (UT_UCS2_tolower(*rhaystack) == a)
				do
				{
					if (a == '\0')
						goto foundneedle;
					++rhaystack;
					a = UT_UCS2_tolower(*++needle);
					if (UT_UCS2_tolower(*rhaystack) != a)
						break;
					if (a == '\0')
						goto foundneedle;
					++rhaystack;
					a = UT_UCS2_tolower(*++needle);
				}
				while (UT_UCS2_tolower(*rhaystack) == a);

			needle = rneedle;             /* took the register-poor approach */

			if (a == '\0')
				break;
        }
    }
 foundneedle:
	return static_cast<UT_UCS2Char *>(haystack);
 ret0:
	return 0;
}
/****************************************************************************/

UT_UCS2Char * UT_UCS2_strcpy(UT_UCS2Char * dest, const UT_UCS2Char * src)
{
	UT_ASSERT(dest);
	UT_ASSERT(src);

	UT_UCS2Char * d = dest;
	UT_UCS2Char * s = static_cast<UT_UCS2Char *>(src);

	while (*s != 0)
		*d++ = *s++;
	*d = 0;

	return dest;
}

UT_UCS2Char * UT_UCS2_strcpy_char(UT_UCS2Char * dest, const char * src)
{
	UT_ASSERT(dest);
	UT_ASSERT(src);

	UT_UCS2Char * d 		= dest;
	unsigned char * s	= static_cast<unsigned char *>(src);

	static UT_UCS2_mbtowc m(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
	UT_UCS2Char wc;

	while (*s != 0)
	  {
		if(m.mbtowc(wc,*s))*d++=wc;
		s++;
	  }
	*d = 0;

	return dest;
}

char * UT_UCS2_strcpy_to_char(char * dest, const UT_UCS2Char * src)
{
	UT_ASSERT(dest);
	UT_ASSERT(src);

	UT_ASSERT_NOT_REACHED();

	return NULL;
}

bool UT_UCS2_cloneString(UT_UCS2Char ** dest, const UT_UCS2Char * src)
{
	UT_uint32 length = UT_UCS2_strlen(src) + 1;
	*dest = static_cast<UT_UCS2Char *>(UT_calloc(length,sizeof(UT_UCS2Char)));
	if (!*dest)
		return false;
	memmove(*dest,src,length*sizeof(UT_UCS2Char));

	return true;
}

bool UT_UCS2_cloneString_char(UT_UCS2Char ** dest, const char * src)
{
  UT_ASSERT_NOT_REACHED();
  return false;
}

bool UT_UCS2_isupper(UT_UCS2Char c)
{
	if(c < 127)
		return isupper(c)!=0;

    case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, NrElements(case_table),sizeof(case_entry),s_cmp_case));
    if(letter && letter->type == 1)
        return true;
    return false;
};

bool UT_UCS2_islower(UT_UCS2Char c)
{
	if(c < 127)
		return islower(c)!=0;

    case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, NrElements(case_table),sizeof(case_entry),s_cmp_case));
    if(!letter || letter->type == 0)
        return true;
    return false;
};

bool UT_UCS2_isspace(UT_UCS2Char c)
{
	// the whitespace table is small, so use linear search
	for (UT_uint32 i = 0; i < NrElements(whitespace_table); i++)
	{
		if(whitespace_table[i].high < c)
			continue;
		if(whitespace_table[i].low <= c)
			return true;
		// if we got here, then low > c
		return false;
	}
	return false;
};

bool UT_UCS2_isalpha(UT_UCS2Char c)
{
    FriBidiCharType type = fribidi_get_type(c);
    return (FRIBIDI_IS_LETTER(type) != 0);
};

bool UT_UCS2_isSentenceSeparator(UT_UCS2Char c)
{
	switch(c)
	{
	case '?': // fall-through
	case '!': // fall-through
	case '.':
		return true;

	default:
		return false;
	}
}

/* copies exactly n-chars from src to dest; NB! does not check for 00 i src
*/
UT_UCS2Char * UT_UCS2_strncpy(UT_UCS2Char * dest, const UT_UCS2Char * src, UT_uint32 n)
{
	UT_ASSERT(dest);
	UT_ASSERT(src);

	UT_UCS2Char * d = dest;
	UT_UCS2Char * s = static_cast<UT_UCS2Char *>(src);

	for (; d < static_cast<UT_UCS2Char *>(dest) + n;)
		*d++ = *s++;
	*d = '\0';

	return dest;
}


/* reverses str of len n; used by BiDi which always knows the len of string to process
   thus we can save ourselves searching for the 00 */
UT_UCS2Char * UT_UCS2_strnrev(UT_UCS2Char * src, UT_uint32 n)
{
    UT_UCS2Char t;
    UT_uint32 i;

    for(i = 0; i < n/2; i++)
    {
        t = *(src + i);
        *(src + i) = *(src + n - i - 1); //-1 so that we do not move the 00
        *(src + n - i - 1) = t;
    }
    return src;
}

#endif


////////////////////////////////////////////////////////////////////////
//
//  UCS string (UT_UCSChar)
//
//  String is built of units based on UT_UCSChar, which used to be
//   UT_UCS2Char and is now UT_UCS4Char
//
////////////////////////////////////////////////////////////////////////

#if 1 // i didn't get a chance to test this -- jeff
XML_Char* UT_encodeUTF8char(UT_UCSChar cIn)
{
	// convert the given unicode character into a UTF-8 sequence
	// return pointer to static buffer.

	static XML_Char sBuf[10];

	memset(sBuf,0,sizeof(sBuf));
	if (cIn < 0x0080)
	{
		sBuf[0] = static_cast<XML_Char>(cIn);
	}
	else if (cIn < 0x0800)
	{
		// 110xxxxx 10xxxxxx
		sBuf[0] = 0x00c0 | ((cIn >> 6) & 0x001f);
		sBuf[1] = 0x0080 | (    cIn    & 0x003f);
	}
	else
	{
		// 1110xxxx 10xxxxxx 10xxxxxx
		sBuf[0] = 0x00e0 | ((cIn >> 12) & 0x000f);
		sBuf[1] = 0x0080 | ((cIn >>  6) & 0x003f);
		sBuf[2] = 0x0080 | (    cIn     & 0x003f);
	}

	return sBuf;
}
#endif // --jeff

bool UT_isSmartQuotableCharacter(UT_UCSChar c)
{
	// TODO:  this is anglo-centric; really need a locale argument or
	// TODO:  something to get smart quote rules for the rest of the world
	bool result;
	switch (c)
	{
	case '"':
	case '`':
	case '\'':
		result = true;
		break;
	default:
		result = false;
		break;
	}
	return (result);
}

bool UT_isSmartQuotedCharacter(UT_UCSChar c)
{
	// TODO:  this is anglo-centric; really need a locale argument or
	// TODO:  something to get smart quote rules for the rest of the world
	bool result;
	switch (c)
	{
	case UCS_LQUOTE:
	case UCS_RQUOTE:
	case UCS_LDBLQUOTE:
	case UCS_RDBLQUOTE:
		result = true;
		break;
	default:
		result = false;
		break;
	}
	return (result);
}

////////////////////////////////////////////////////////////////////////
//
//  UCS-4 string
//
//  String is built of 32-bit units (longs)
//
//  NOTE: Ambiguity between UCS-2 and UTF-16 above makes no difference
//  NOTE:  in the case of UCS-4 and UTF-32 since they really are
//  NOTE:  identical
//
////////////////////////////////////////////////////////////////////////

bool UT_UCS4_isupper(UT_UCS4Char c)
{
	if(c < 127)
		return isupper(c)!=0;

    case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, NrElements(case_table),sizeof(case_entry),s_cmp_case));
    if(letter && letter->type == 1)
        return true;
    return false;
}

bool UT_UCS4_islower(UT_UCS4Char c)
{
	if(c < 127)
		return islower(c)!=0;

    case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, NrElements(case_table),sizeof(case_entry),s_cmp_case));
    if(!letter || letter->type == 0)
        return true;
    return false;
}

bool UT_UCS4_isspace(UT_UCS4Char c)
{
	// the whitespace table is small, so use linear search
	for (UT_uint32 i = 0; i < NrElements(whitespace_table); i++)
	{
		if(whitespace_table[i].high < c)
			continue;
		if(whitespace_table[i].low <= c)
			return true;
		// if we got here, then low > c
		return false;
	}
	return false;
}

bool UT_UCS4_isalpha(UT_UCS4Char c)
{
    FriBidiCharType type = fribidi_get_type(c);
    return (FRIBIDI_IS_LETTER(type) != 0);
}

bool UT_UCS4_isSentenceSeparator(UT_UCS4Char c)
{
	switch(c)
	{
	case '?': // fall-through
	case '!': // fall-through
	case '.':
		return true;

	default:
		return false;
	}
}
/* copies exactly n-chars from src to dest; NB! does not check for 00 i src
*/
UT_UCS4Char * UT_UCS4_strncpy(UT_UCS4Char * dest, const UT_UCS4Char * src, UT_uint32 n)
{
	UT_ASSERT(dest);
	UT_ASSERT(src);

	UT_UCSChar * d = dest;
	const UT_UCSChar * s = static_cast<const UT_UCS4Char *>(src);

	for (; d < static_cast<UT_UCS4Char *>(dest) + n;)
		*d++ = *s++;
	*d = '\0';

	return dest;
}


/* reverses str of len n; used by BiDi which always knows the len of string to process
   thus we can save ourselves searching for the 00 */
UT_UCS4Char * UT_UCS4_strnrev(UT_UCS4Char * src, UT_uint32 n)
{
    UT_UCS4Char t;
    UT_uint32 i;

    for(i = 0; i < n/2; i++)
    {
        t = *(src + i);
        *(src + i) = *(src + n - i - 1); //-1 so that we do not move the 00
        *(src + n - i - 1) = t;
    }
    return src;
}


UT_UCS4Char * UT_UCS4_strstr(const UT_UCS4Char * phaystack, const UT_UCS4Char * pneedle)
{
	register const UT_UCS4Char *haystack, *needle;
	register UT_UCS4Char b, c;

	haystack = static_cast<const UT_UCS4Char *>(phaystack);
	needle = static_cast<const UT_UCS4Char *>(pneedle);

	b = *needle;
	if (b != '\0')
    {
		haystack--;                               /* possible ANSI violation */
		do
        {
			c = *++haystack;
			if (c == '\0')
				goto ret0;
        }
		while (c != b);

		c = *++needle;
		if (c == '\0')
			goto foundneedle;
		++needle;
		goto jin;

		for (;;)
        {
			register UT_UCS4Char a;
			register const UT_UCS4Char *rhaystack, *rneedle;

			do
            {
				a = *++haystack;
				if (a == '\0')
					goto ret0;
				if (a == b)
					break;
				a = *++haystack;
				if (a == '\0')
					goto ret0;
			shloop: ; // need a statement here for EGCS 1.1.1 to accept it
			}
			while (a != b);

		jin:	a = *++haystack;
			if (a == '\0')
				goto ret0;

			if (a != c)
				goto shloop;

			rhaystack = haystack-- + 1;
			rneedle = needle;
			a = *rneedle;

			if (*rhaystack == a)
				do
				{
					if (a == '\0')
						goto foundneedle;
					++rhaystack;
					a = *++needle;
					if (*rhaystack != a)
						break;
					if (a == '\0')
						goto foundneedle;
					++rhaystack;
					a = *++needle;
				}
				while (*rhaystack == a);

			needle = rneedle;             /* took the register-poor approach */

			if (a == '\0')
				break;
        }
    }
 foundneedle:
	return const_cast<UT_UCS4Char *>(haystack);
 ret0:
	return 0;
}

UT_sint32 UT_UCS4_strcmp(const UT_UCS4Char* left, const UT_UCS4Char* right)
{
	UT_ASSERT(left);
	UT_ASSERT(right);

	while (*left && *right)
	{
		if (*left < *right)
		{
			return -1;
		}

		if (*left > *right)
		{
			return 1;
		}

		left++;
		right++;
	}

	if (*left)
	{
		return -1;
	}
	else if (*right)
	{
		return 1;
	}
	else
	{
		return 0;
	}
}

/*
  Latin-1 Unicode case-insensitive string comparison and casing done by
  Pierre Sarrazin <ps@cam.org>.
*/

/**
 * Convert a given character to uppercase
 */
UT_UCS4Char UT_UCS4_toupper(UT_UCS4Char c)
{
        if (c < 128) // in ASCII range
	  return toupper(c);

	if (XAP_EncodingManager::get_instance()->single_case())
		return c;
	/*let's trust libc! -- does not seem to work :(*/
    case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, NrElements(case_table),sizeof(case_entry),s_cmp_case));
    if(!letter || letter->type == 1)
        return c;
    return letter->other;
}


/*	Converts the given character to lowercase if it is an uppercase letter.
	Returns it unchanged if it is not.
	This function created by Pierre Sarrazin 1999-02-06
*/

UT_UCS4Char UT_UCS4_tolower(UT_UCS4Char c)
{
	if (c < 128)
		return tolower(c);

	if (XAP_EncodingManager::get_instance()->single_case())
		return c;
	/*let's trust libc!*/
    case_entry * letter = static_cast<case_entry *>(bsearch(&c, &case_table, NrElements(case_table),sizeof(case_entry),s_cmp_case));
    if(!letter || letter->type == 0)
        return c;
    return letter->other;
}


/*	Characters are converted to lowercase (if applicable) when they
	are read from the needle or the haystack. See UT_UCS_tolower().
	This function created by Pierre Sarrazin 1999-02-06
*/

UT_UCS4Char * UT_UCS4_stristr(const UT_UCS4Char * phaystack, const UT_UCS4Char * pneedle)
{
	register const UT_UCS4Char *haystack, *needle;
	register UT_UCS4Char b, c;

	haystack = static_cast<const UT_UCS4Char *>(phaystack);
	needle = static_cast<const UT_UCS4Char *>(pneedle);

	b = UT_UCS4_tolower(*needle);
	if (b != '\0')
    {
		haystack--;                               /* possible ANSI violation */
		do
        {
			c = UT_UCS4_tolower(*++haystack);
			if (c == '\0')
				goto ret0;
        }
		while (c != b);

		c = UT_UCS4_tolower(*++needle);
		if (c == '\0')
			goto foundneedle;
		++needle;
		goto jin;

		for (;;)
        {
			register UT_UCS4Char a;
			register const UT_UCS4Char *rhaystack, *rneedle;

			do
            {
				a = UT_UCS4_tolower(*++haystack);
				if (a == '\0')
					goto ret0;
				if (a == b)
					break;
				a = UT_UCS4_tolower(*++haystack);
				if (a == '\0')
					goto ret0;
			shloop: ; // need a statement here for EGCS 1.1.1 to accept it
			}
			while (a != b);

		jin:	a = UT_UCS4_tolower(*++haystack);
			if (a == '\0')
				goto ret0;

			if (a != c)
				goto shloop;

			rhaystack = haystack-- + 1;
			rneedle = needle;
			a = UT_UCS4_tolower(*rneedle);

			if (UT_UCS4_tolower(*rhaystack) == a)
				do
				{
					if (a == '\0')
						goto foundneedle;
					++rhaystack;
					a = UT_UCS4_tolower(*++needle);
					if (UT_UCS4_tolower(*rhaystack) != a)
						break;
					if (a == '\0')
						goto foundneedle;
					++rhaystack;
					a = UT_UCS4_tolower(*++needle);
				}
				while (UT_UCS4_tolower(*rhaystack) == a);

			needle = rneedle;             /* took the register-poor approach */

			if (a == '\0')
				break;
        }
    }
 foundneedle:
	return const_cast<UT_UCS4Char *>(haystack);
 ret0:
	return 0;
}
/****************************************************************************/

UT_uint32 UT_UCS4_strlen(const UT_UCS4Char * string)
{
	UT_uint32 i;

	for(i = 0; *string != 0; string++, i++)
		;

	return i;
}

UT_UCS4Char * UT_UCS4_strcpy(UT_UCS4Char * dest, const UT_UCS4Char * src)
{
	UT_ASSERT(dest);
	UT_ASSERT(src);

	UT_UCS4Char * d = dest;
	const UT_UCS4Char * s = static_cast<const UT_UCS4Char *>(src);

	while (*s != 0)
		*d++ = *s++;
	*d = 0;

	return dest;
}

// TODO shouldn't all of the 'char *' strings be 'unsigned char *' strings ??

UT_UCS4Char * UT_UCS4_strcpy_char(UT_UCS4Char * dest, const char * src)
{
	UT_ASSERT(dest);
	UT_ASSERT(src);

	UT_UCS4Char * d 		= dest;
	const char * s	= static_cast<const char *>(src);

	static UT_UCS4_mbtowc m(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
	UT_UCS4Char wc;

	while (*s != 0)
	  {
		if(m.mbtowc(wc,*s))*d++=wc;
		s++;
	  }
	*d = 0;

	return dest;
}

UT_UCS4Char * UT_UCS4_strcpy_utf8_char(UT_UCS4Char * dest, const char * src)
{
	// FIXME: This could be more efficient than it is, on the other
	// hand, it should be correct

	UT_ASSERT(dest);
	UT_ASSERT(src);

	UT_UCS4String ucs4str(src); // constructs a string from UTF-8 by default
	dest = UT_UCS4_strcpy(dest, ucs4str.ucs4_str());

	return dest;
}


char * UT_UCS4_strcpy_to_char(char * dest, const UT_UCS4Char * src)
{
	UT_ASSERT(dest);
	UT_ASSERT(src);

	char * 			d = dest;
	const UT_UCS4Char * 	s = static_cast<const UT_UCS4Char *>(src);

	UT_Wctomb w(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());

	while (*s != 0)
	  {
		int length;
		w.wctomb_or_fallback(d,length,*s++);
		d+=length;
	  }
	*d = 0;

	return dest;
}

char * UT_UCS4_strncpy_to_char(char * dest, const UT_UCS4Char * src, int n)
{
	UT_ASSERT(dest);
	UT_ASSERT(src);

	char * 			d = dest;
	const UT_UCS4Char * 	s = static_cast<const UT_UCS4Char *>(src);

	UT_Wctomb w(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());

	while (*s != 0 && n > 0)
	  {
		int length;
		w.wctomb_or_fallback(d,length,*s++, n);
		d+=length;
		n-=length;
	  }
	*d = 0;

	return dest;
}

bool UT_UCS4_cloneString(UT_UCS4Char ** dest, const UT_UCS4Char * src)
{
	UT_uint32 length = UT_UCS4_strlen(src) + 1;
	*dest = static_cast<UT_UCS4Char *>(UT_calloc(length,sizeof(UT_UCS4Char)));
	if (!*dest)
		return false;
	memmove(*dest,src,length*sizeof(UT_UCS4Char));

	return true;
}

bool UT_UCS4_cloneString_char(UT_UCS4Char ** dest, const char * src)
{
  UT_uint32 length = strlen(src) + 1;
  *dest = static_cast<UT_UCS4Char *>(UT_calloc(length,sizeof(UT_UCS4Char)));
  if (!*dest)
    return false;
  UT_UCS4_strcpy_char(*dest, src);
  
  return true;
}


/*
 this one prints floating point value but using dot as fractional separator
 independent of the current locale's settings.
*/
const char* std_size_string(float f)
{
  static char string[10];
  int i=static_cast<int>(f);
  if(f-i<0.1)
	sprintf(string, "%d", i);
  else {
          int fr = int(10*(f-i));
	sprintf(string,"%d.%d", i,fr);
  };
  return string;
}