// ut_string_class.cpp

// A simple string class for use where templates are not
// allowed.
//
// Copyright (C) 2001 Mike Nordell <tamlin@algonet.se>
// Copyright (C) 2002 Tomas Frydrych <tomas@frydrych.uklinux.net>
// Copyright (C) 2002 Dom Lachowicz <cinamod@hotmail.com>
// 
// This class is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
// 
// This class is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  
// 02111-1307, USA.
//
#include <stdio.h>
#include <stdlib.h>				// size_t
#include <string.h>				// strcmp
#include <locale.h>
#include <ctype.h>
#include <stdarg.h>

#include "ut_string.h"
#include "ut_string_class.h"
#include "ut_stringbuf.h"
#include "ut_debugmsg.h"		// UT_DEBUGMSG
#include "ut_iconv.h"
#include "ut_assert.h"			// UT_ASSERT
#include "ut_mbtowc.h"
#include "ut_bytebuf.h"

//
// This string class is intended to meet the following requirements.
//
// - It shall not use templates.
// - It shall not provide a sorting order [1].
// - It shall allow dated compilers to use it [2].
// - It shall work with non-conforming library implementations.
// - It shall not use reference counting since that is 1) not
//   platform independent (the need for some kind of locking mechanism)
//   and 2) in a multi threaded environment every single string would
//   still have to be copied, where the ref-counting would be useless
//   and finally 3) locking would slow us down.
//
// [1] It's impossible to get a sorting order other than plain strcmp
//     without adding locale information. This would make this class
//     unacceptably large, and it would still be close to impossible
//     to make it "right". Note that there is however a non-member
//     operator< to make it possible to put a UT_String in a STL
//     (std C++ library) container. It _only_ provides strcmp ordering.
//
// [2] This is somewhat arbitrary, but it basically means you should
//     be able to use it with an old compiler.
//

//////////////////////////////////////////////////////////////////

// Shared zero-length strings.  The c_str()/utf8_str()/ucs4_str() accessors
// below return these instead of the buffer pointer when a string is empty.
static const char pszEmpty[] = { 0 };

// NOTE(review): ucs2Empty is not referenced anywhere in the visible part of
// this file - confirm whether it is still needed.
static const UT_UCS2Char ucs2Empty[] = { 0 };
static const UT_UCS4Char ucs4Empty[] = { 0 };

////////////////////////////////////////////////////////////////////////
//
//  8-bit string
//
//  String is built of 8-bit units (bytes)
//  Encoding could be any single-byte or multi-byte encoding
//
////////////////////////////////////////////////////////////////////////

// Default constructor: the new string owns a freshly default-constructed
// buffer object.
UT_String::UT_String()
	: pimpl(new UT_Stringbuf)
{
}

// Construct from a character buffer.  A length of 0 means "measure sz with
// strlen"; a null sz combined with n == 0 requests a length of 0.
UT_String::UT_String(const char* sz, size_t n)
	: pimpl(new UT_Stringbuf(sz, (n != 0) ? n : ((sz != 0) ? strlen(sz) : 0)))
{
}

// Copy constructor: the buffer object is duplicated, never shared.
UT_String::UT_String(const UT_String& rhs)
	: pimpl(new UT_Stringbuf(*rhs.pimpl))
{
}

// Destructor: releases the buffer object owned by this string.
UT_String::~UT_String()
{
	delete pimpl;
}


//////////////////////////////////////////////////////////////////
// accessors

// Length of the string as reported by the underlying buffer.
size_t UT_String::size() const
{
	const size_t nLength = pimpl->size();
	return nLength;
}

// True when the underlying buffer reports itself as empty.
bool UT_String::empty() const
{
	const bool bIsEmpty = pimpl->empty();
	return bIsEmpty;
}

// Empties the string.  Declared const, yet it modifies the buffer that
// pimpl points to (only the pointer itself is left untouched).
void UT_String::clear() const
{
	pimpl->clear();
}

// Return a copy of at most nChars units starting at index iStart.
//
// An empty string is returned when iStart is at or past the end, or when
// nChars is 0.  A request that runs past the end is clamped to the
// available tail.
UT_String UT_String::substr(size_t iStart, size_t nChars) const
{
	const size_t nSize = pimpl->size();

	if (iStart >= nSize || !nChars) {
		return UT_String();
	}

	// iStart < nSize here, so this subtraction cannot wrap.  Comparing
	// nChars against the remainder (rather than testing iStart + nChars)
	// keeps the clamp effective even for huge nChars values, where the
	// addition would overflow and let an over-long length through.
	const size_t nAvail = nSize - iStart;
	if (nChars > nAvail) {
		nChars = nAvail;
	}

	return UT_String(pimpl->data() + iStart, nChars);
}

// Read-only C string view.  An empty string yields the shared static empty
// string rather than the buffer's own data pointer.
const char* UT_String::c_str() const
{
	if (pimpl->size())
		return pimpl->data();
	return pszEmpty;
}

//////////////////////////////////////////////////////////////////
// mutators

// Copy assignment.  Self-assignment is detected and skipped.
UT_String& UT_String::operator=(const UT_String& rhs)
{
	if (this == &rhs)
		return *this;

	*pimpl = *rhs.pimpl;
	return *this;
}

// Assign from a C string.  A null pointer is treated as a request to empty
// the string.
UT_String& UT_String::operator=(const char* rhs)
{
	if (rhs)
	{
		const size_t nLen = strlen(rhs);
		pimpl->assign(rhs, nLen);
	}
	else
	{
		pimpl->clear ();
	}
	return *this;
}

// Append another string.  When a string is appended to itself the source
// is copied first, so the append never reads from the buffer it is
// modifying.
UT_String& UT_String::operator+=(const UT_String& rhs)
{
	if (this == &rhs) {
		UT_Stringbuf snapshot(*rhs.pimpl);
		pimpl->append(snapshot);
		return *this;
	}

	pimpl->append(*rhs.pimpl);
	return *this;
}

// TODO What encoding do these functions think the
// TODO  right-hand character is in?  Same as the left-hand side?
// TODO  ASCII?  ISO-8859-1?  System encoding?
// TODO  any old 8-bit single-byte or multibyte encoding?

// Append a NUL-terminated C string.  A null rhs is rejected by
// UT_return_val_if_fail and the string is returned unchanged.
UT_String& UT_String::operator+=(const char* rhs)
{
	UT_return_val_if_fail(rhs, *this);

	const size_t nLen = strlen(rhs);
	pimpl->append(rhs, nLen);
	return *this;
}

// Append a single 8-bit unit.
UT_String& UT_String::operator+=(char rhs)
{
	// rhs is a by-value parameter, so its address is a valid 1-byte source.
	pimpl->append(&rhs, 1);
	return *this;
}

// Exchange contents with rhs by swapping the two buffer pointers; no
// character data is copied.
void UT_String::swap(UT_String& rhs)
{
	UT_Stringbuf* const pMine = pimpl;
	UT_Stringbuf* const pTheirs = rhs.pimpl;

	pimpl = pTheirs;
	rhs.pimpl = pMine;
}

//////////////////////////////////////////////////////////////////
// End of class members, start of free functions
//////////////////////////////////////////////////////////////////

// Return the index of the first occurrence of ch in st, or (size_t)-1 if
// ch does not occur.
size_t UT_String_findCh(const UT_String &st, char ch)
{
  size_t iPos = 0;
  while (iPos < st.size())
    {
      if (st[iPos] == ch)
	return iPos;
      iPos++;
    }
  return (size_t)-1;
}

// Return the index of the last occurrence of ch in st, or (size_t)-1 if ch
// does not occur.
//
// The loop counter runs from size() down to 1 and the element examined is
// i - 1, so every index from size()-1 down to 0 is checked exactly once.
// (The previous version examined st[i], which skipped index 0 and looked
// at the position one past the last character instead.)
size_t UT_String_findRCh(const UT_String &st, char ch)
{
  for (size_t i = st.size() ; i > 0; i--)
    if (st[i - 1] == ch)
      return i - 1;
  return static_cast<size_t>(-1);
}

/*!
 * Estimate an upper bound, in bytes and including the terminating NUL, for
 * the output of formatting \a format with \a args.
 *
 * The format string is scanned once.  Literal characters count as one byte
 * each; every conversion consumes its argument from \a args and adds a
 * fixed allowance (or strlen() for %s).  Explicit field widths and
 * precisions, numeric or given via '*', are added on top.
 *
 * \a args is consumed by this function; callers that need the arguments
 * again must work from a copy.
 *
 * NOTE(review): "%ls" (wide string) is still measured with strlen(), and
 * the C99 length modifiers z/j/t are ignored - confirm no caller uses them.
 */
static UT_uint32
UT_printf_string_upper_bound (const char* format,
			      va_list      args)
{
  UT_uint32 len = 1;

  while (*format)
    {
      bool long_int = false;
      bool extra_long = false;
      char c;

      c = *format++;

      if (c == '%')
	{
	  bool done = false;

	  while (*format && !done)
	    {
	      switch (*format++)
		{
		  char *string_arg;
		  int star_arg;

		case '*':
		  /* A negative '*' width means "left-justify in |width|
		   * columns", so its magnitude is what pads the output.
		   * The negation is done in unsigned arithmetic so that
		   * INT_MIN cannot overflow.
		   */
		  star_arg = va_arg (args, int);
		  if (star_arg < 0)
		    len += 0u - static_cast<UT_uint32>(star_arg);
		  else
		    len += static_cast<UT_uint32>(star_arg);
		  break;
		case '1':
		case '2':
		case '3':
		case '4':
		case '5':
		case '6':
		case '7':
		case '8':
		case '9':
		  /* add specified format length, since it might exceed the
		   * size we assume it to have.
		   */
		  format -= 1;
		  len += strtol (format, const_cast<char**>(&format), 10);
		  break;
		case 'h':
		  /* ignore short int flag, since all args have at least the
		   * same size as an int
		   */
		  break;
		case 'l':
		  if (long_int)
		    extra_long = true; /* linux specific */
		  else
		    long_int = true;
		  break;
		case 'q':
		case 'L':
		  long_int = true;
		  extra_long = true;
		  break;
		case 's':
		  string_arg = va_arg (args, char *);
		  if (string_arg)
		    len += strlen (string_arg);
		  else
		    {
		      /* add enough padding to hold "(null)" identifier */
		      len += 16;
		    }
		  done = true;
		  break;
		case 'd':
		case 'i':
		case 'o':
		case 'u':
		case 'x':
		case 'X':
		  /* Consume exactly the type the caller passed, otherwise
		   * every later argument is read from the wrong place.
		   */
		  if (extra_long)
		    static_cast<void>(va_arg(args, long long));
		  else if (long_int)
		    static_cast<void>(va_arg(args, long));
		  else
		    static_cast<void>(va_arg(args, int));
		  len += extra_long ? 64 : 32;
		  done = true;
		  break;
		case 'D':
		case 'O':
		case 'U':
		  static_cast<void>(va_arg(args, long));
		  len += 32;
		  done = true;
		  break;
		case 'f':
		  /* %f prints every integer digit of the value.
		   * NOTE(review): the allowances assume IEEE-754 double
		   * (about 309 digits) and a long double whose largest
		   * value has about 4933 digits - confirm for exotic
		   * targets.
		   */
		  if (extra_long)
		    {
		      static_cast<void>(va_arg(args, long double));
		      len += 4944;
		    }
		  else
		    {
		      static_cast<void>(va_arg(args, double));
		      len += 320;
		    }
		  done = true;
		  break;
		case 'e':
		case 'E':
		case 'g':
		case 'G':
		  if (extra_long)
		    static_cast<void>(va_arg(args, long double));
		  else
		    static_cast<void>(va_arg(args, double));
		  len += extra_long ? 64 : 32;
		  done = true;
		  break;
		case 'c':
		  static_cast<void>(va_arg(args, int));
		  len += 1;
		  done = true;
		  break;
		case 'p':
		case 'n':
		  static_cast<void>(va_arg(args, void*));
		  len += 32;
		  done = true;
		  break;
		case '%':
		  len += 1;
		  done = true;
		  break;
		default:
		  /* ignore unknown/invalid flags */
		  break;
		}
	    }
	}
      else
	len += 1;
    }

  return len;
}

// VA_COPY(dst, src): duplicate a va_list so the same arguments can be
// walked more than once.
//
// Exactly one definition is selected.  GNU-compatible compilers always use
// __va_copy; the other branches are the fallbacks for compilers that do
// not define __GNUC__.  (Previously a fallback could be defined first and
// then redefined with a different body for GCC, which is a macro
// redefinition; the definition that ends up in effect is the same.)
#if !defined (VA_COPY)
#  if defined (__GNUC__)
#  define VA_COPY(ap1,ap2)     __va_copy((ap1),(ap2))
#  elif defined(__s390__) || defined(__x86_64__)
#  define VA_COPY(ap1, ap2)	  (*(ap1) = *(ap2))
#  elif defined (VA_COPY_AS_ARRAY)
#  define VA_COPY(ap1, ap2)	  memmove ((ap1), (ap2), sizeof (va_list))
#  else /* va_list is a pointer */
#  define VA_COPY(ap1, ap2)	  ((ap1) = (ap2))
#  endif /* va_list is a pointer */
#endif /* !VA_COPY */

/*!
 * printf-style formatting into \a inStr, replacing its previous contents.
 *
 * \param inStr   string that receives the formatted text
 * \param format  printf format string
 * \param args1   arguments for \a format; consumed by this call
 * \return \a inStr
 *
 * The buffer is first sized from UT_printf_string_upper_bound().  The
 * write itself is bounded by vsnprintf(), so an estimate that turns out
 * too small can no longer overrun the buffer: in that case the buffer is
 * reallocated to the exact size vsnprintf() reported and the formatting
 * is repeated.  If vsnprintf() reports an error, \a inStr becomes empty.
 */
UT_String& UT_String_vprintf (UT_String & inStr, const char *format,
			      va_list      args1)
{
  // Keep an untouched copy: args1 is consumed by the size estimate, and
  // each formatting attempt needs its own fresh va_list.
  va_list argsPristine;
  VA_COPY (argsPristine, args1);

  size_t nCapacity = UT_printf_string_upper_bound (format, args1);
  char *buffer = new char [ nCapacity ];

  va_list argsTry;
  VA_COPY (argsTry, argsPristine);
  int nWritten = vsnprintf (buffer, nCapacity, format, argsTry);
  va_end (argsTry);

  if (nWritten >= 0 && static_cast<size_t>(nWritten) >= nCapacity)
    {
      // Output was truncated; nWritten is the length actually required.
      delete [] buffer;
      nCapacity = static_cast<size_t>(nWritten) + 1;
      buffer = new char [ nCapacity ];

      VA_COPY (argsTry, argsPristine);
      nWritten = vsnprintf (buffer, nCapacity, format, argsTry);
      va_end (argsTry);
    }
  va_end (argsPristine);

  if (nWritten < 0)
    {
      // Formatting failed; do not hand an indeterminate buffer on.
      delete [] buffer;
      buffer = new char [ 1 ];
      buffer[0] = '\0';
    }

  inStr = buffer;

  delete [] buffer;

  return inStr;
}

// Overload taking the format as a UT_String; forwards to the const char*
// version using the format's C string.
UT_String& UT_String_vprintf (UT_String & inStr, const UT_String & format,
			va_list      args1)
{
  const char * const szFormat = format.c_str();
  return UT_String_vprintf (inStr, szFormat, args1);
}

// Variadic front end: formats inFormat with the trailing arguments into
// inStr (replacing its contents) and returns inStr.
UT_String& UT_String_sprintf(UT_String & inStr, const char * inFormat, ...)
{
  va_list ap;

  va_start (ap, inFormat);
  UT_String_vprintf (inStr, inFormat, ap);
  va_end (ap);

  return inStr;
}

// Variadic front end returning the formatted text as a new string.
UT_String UT_String_sprintf(const char * inFormat, ...)
{
  UT_String sResult ("");
  va_list ap;

  va_start (ap, inFormat);
  UT_String_vprintf (sResult, inFormat, ap);
  va_end (ap);

  return sResult;
}

// va_list front end returning the formatted text as a new string.
UT_String UT_String_vprintf(const char * inFormat, va_list args1)
{
  UT_String sResult ("");
  UT_String_vprintf (sResult, inFormat, args1);
  return sResult;
}

// Same as the const char* overload, with the format given as a UT_String.
UT_String UT_String_vprintf(const UT_String & inFormat, va_list args1)
{
  UT_String sResult ("");
  UT_String_vprintf (sResult, inFormat, args1);
  return sResult;
}

/*!
 * Assuming a string of standard abiword properties eg. "fred:nerk; table-width:1.0in; table-height:10.in"
 * Return the value of the property sProp, or an empty string if it is not present.
 * The result is returned by value; there is nothing for the caller to free.
 *
 * NOTE(review): the property is located with a plain strstr() for
 * "<sProp>:", so a name that is the tail of another name (e.g. "width"
 * inside "table-width") matches too - confirm whether callers rely on this.
 */
UT_String UT_String_getPropVal(const UT_String & sPropertyString, const UT_String & sProp)
{
	// The search key is the property name immediately followed by ':'.
	UT_String sKey(sProp);
	sKey += ":";

	const char * const szKey = sKey.c_str();
	const char * const szAll = sPropertyString.c_str();
	const char * const szHit = strstr(szAll, szKey);
	if (!szHit)
		return UT_String();

	// Offset of the first character after "name:".
	UT_sint32 iValStart = static_cast<UT_sint32>(szHit - szAll);
	iValStart += strlen(szKey);

	// Offset one past the last character of the value.
	UT_sint32 iValEnd = 0;
	const char * szDelim = strchr(szHit, ';');
	if (!szDelim)
	{
		// Last property in the string: the value runs to the end, minus
		// any trailing spaces.
		iValEnd = strlen(szAll);
		while (iValEnd > 0 && szAll[iValEnd - 1] == ' ')
			--iValEnd;
	}
	else
	{
		// Step back from the ';' over any ';' and ' ' characters so the
		// pointer rests on the last character of the value.
		while (*szDelim == ';' || *szDelim == ' ')
			--szDelim;
		iValEnd = static_cast<UT_sint32>(szDelim - szAll) + 1;
	}

	return UT_String(sPropertyString.substr(iValStart, (iValEnd - iValStart)));
}
/*!
 * Assuming a string of standard abiword properties eg. "fred:nerk; table-width:1.0in; table-height:10.in"
 * Add another property string, updating previously defined properties with
 * values in the new string.
 *
 * sNewProp is walked as a sequence of "name:value" items separated by ';'.
 * Each item is applied through UT_String_setProperty.  Processing stops at
 * the first item without a ':', or whose name or value is empty.
 *
 * NOTE(review): spaces following a ';' are not stripped before the name is
 * taken, so a name may begin with a space - confirm this is intended.
 */
void UT_String_addPropertyString(UT_String & sPropertyString, const UT_String & sNewProp)
{
	const UT_sint32 iTotal = static_cast<UT_sint32>(sNewProp.size());
	UT_sint32 iPos = 0;
	UT_String sName;
	UT_String sValue;
	UT_String sRest;

	while (iPos < iTotal)
	{
		// Name: everything from iPos up to the next ':'.
		sRest = sNewProp.substr(iPos, iTotal - iPos);
		const char * szRest = sRest.c_str();
		const char * szMark = strstr(szRest, ":");
		if (!szMark)
			break;
		sName = sNewProp.substr(iPos, szMark - szRest);
		iPos += szMark - szRest + 1;

		// Value: up to the next ';', or to the end for the final item.
		sRest = sNewProp.substr(iPos, iTotal - iPos);
		szRest = sRest.c_str();
		szMark = strstr(szRest, ";");
		const bool bLastItem = (szMark == NULL);
		if (bLastItem)
		{
			sValue = sNewProp.substr(iPos, iTotal - iPos);
		}
		else
		{
			sValue = sNewProp.substr(iPos, szMark - szRest);
			iPos += szMark - szRest + 1;
		}

		if ((sName.size() == 0) || (sValue.size() == 0))
			break;

		UT_String_setProperty(sPropertyString, sName, sValue);

		if (bLastItem)
			break;
	}
}

/*!
 * Assuming a string of standard abiword properties eg. "fred:nerk; table-width:1.0in; table-height:10.in"
 * Set property sProp to sVal.  Any existing entry for sProp is removed
 * first, then "sProp:sVal" is appended at the end, preceded by "; " when
 * other properties remain.
 */
void UT_String_setProperty(UT_String & sPropertyString, const UT_String & sProp, const UT_String & sVal)
{
	// Drop the old entry, if there is one.
	UT_String_removeProperty(sPropertyString, sProp);

	// A separator is only needed when something is left in front.
	const bool bNeedSeparator = (sPropertyString.size() > 0);
	if (bNeedSeparator)
		sPropertyString += "; ";

	sPropertyString += sProp;
	sPropertyString += ":";
	sPropertyString += sVal;
}

/*!
 * Assuming a string of standard abiword properties eg. "fred:nerk; table-width:1.0in; table-height:10.in"
 * Remove the property sProp and its value from the string of properties.
 * The string is left unchanged when "<sProp>:" does not occur in it.
 *
 * NOTE(review): the property is located with a plain strstr() for
 * "<sProp>:", so a name that is the tail of another name matches too -
 * confirm whether callers rely on this.
 */
void UT_String_removeProperty(UT_String & sPropertyString, const UT_String & sProp)
{
	UT_String sKey ( sProp );
	sKey += ":";

	const char * const szAll = sPropertyString.c_str();
	const char * const szHit = strstr(szAll, sKey.c_str());
	if (!szHit)
		return;		// not present, nothing to do

	// Text in front of the match.
	const UT_sint32 iHit = static_cast<UT_sint32>(szHit - szAll);
	UT_String sHead = sPropertyString.substr(0, iHit);

	// Walk back over the trailing ';' and ' ' that separated the removed
	// property from its predecessor.
	UT_sint32 iLast = static_cast<UT_sint32>(sHead.size()) - 1;
	while (iLast >= 0 && (sHead[iLast] == ';' || sHead[iLast] == ' '))
		--iLast;

	UT_String sResult;
	if (iLast > 0)
		sResult = sHead.substr(0, iLast + 1);

	// Anything after the removed property's ';' is kept.
	const char * szNext = strchr(szHit, ';');
	if (szNext)
	{
		while (*szNext == ';' || *szNext == ' ')
			++szNext;

		const UT_sint32 iTail = static_cast<UT_sint32>(szNext - szAll);
		const UT_sint32 nTail = sPropertyString.size() - iTail;
		if (sResult.size() > 0)
			sResult += "; ";
		sResult += sPropertyString.substr(iTail, nTail);
	}

	sPropertyString = sResult;
}

//////////////////////////////////////////////////////////////////
// Helpers

// Equality by strcmp on the two C strings.
bool operator==(const UT_String& s1, const UT_String& s2)
{
	const int iCmp = strcmp(s1.c_str(), s2.c_str());
	return iCmp == 0;
}

// Equality against a C string, by strcmp.
//
// A null s2 compares unequal to every string instead of being handed to
// strcmp.  This matches the UT_UTF8String overload of the same operator in
// this file.
bool operator==(const UT_String& s1, const char* s2)
{
	return s2 ? (strcmp(s1.c_str(), s2) == 0) : false;
}

// Mirror overload: delegates to operator==(UT_String, const char*).
bool operator==(const char* s1, const UT_String& s2)
{
	const bool bSame = (s2 == s1);
	return bSame;
}

// Inequality, defined as the negation of operator==.
bool operator!=(const UT_String& s1, const UT_String& s2)
{
	if (s1 == s2)
		return false;
	return true;
}

// Inequality against a C string, defined as the negation of operator==.
bool operator!=(const UT_String& s1, const char*  s2)
{
	if (s1 == s2)
		return false;
	return true;
}

// Mirror overload: negation of operator==(UT_String, const char*).
bool operator!=(const char* s1, const UT_String& s2)
{
	if (s2 == s1)
		return false;
	return true;
}

// Ordering by strcmp only, provided so UT_String can be used as a key in
// ordered containers.
bool operator<(const UT_String& s1, const UT_String& s2)
{
	const int iCmp = strcmp(s1.c_str(), s2.c_str());
	return iCmp < 0;
}

// Concatenation: returns a new string holding s1 followed by s2.
UT_String operator+(const UT_String& s1, const UT_String& s2)
{
	UT_String sJoined(s1);
	sJoined += s2;
	return sJoined;
}

// Read-only element access.
//
// Index size() yields '\0'.  An index beyond size() is a caller error and
// trips the assertion; it also yields '\0' rather than reading outside the
// buffer when the assertion is compiled out.
char UT_String::operator[](size_t iPos) const
{
	UT_ASSERT(iPos <= size());
	if (iPos >= size())
		return '\0';
	return pimpl->data()[iPos];
}

// Writable element access.  The assertion allows indices up to and
// including size(); nothing else is checked here.
char& UT_String::operator[](size_t iPos)
{
	UT_ASSERT(iPos <= size());
	char * const pData = pimpl->data();
	return pData[iPos];
}


// Hash of a UT_String: the const char* hash applied to its C string.
UT_uint32 hashcode(const UT_String& string)
{
	// from glib
	const char * const szText = string.c_str();
	return hashcode(szText);
}

// Hash of a NUL-terminated string.  The first character seeds the hash and
// every later character c updates it as h = h*31 + c, written here as
// (h << 5) - h + c.  A null pointer (rejected by UT_return_val_if_fail)
// and the empty string both hash to 0.
UT_uint32 hashcode(const char *p)
{
	// from glib
	UT_return_val_if_fail(p,0);

	UT_uint32 uHash = (UT_uint32)*p;
	if (uHash == 0)
		return uHash;

	const char * pCur = p + 1;
	while (*pCur != '\0')
	{
		uHash = (uHash << 5) - uHash + *pCur;
		++pCur;
	}

	return uHash;
}

////////////////////////////////////////////////////////////////////////
//
//  UTF-8 string: encoding is *always* UTF-8
//
////////////////////////////////////////////////////////////////////////

// Identifier string returned by UT_UTF8String::GenericBaseID().
static const char * s_UTF8_GenericBaseID = "UT_UTF8String";

// Returns the file-level identifier string above.
const char * UT_UTF8String::GenericBaseID () const
{
	const char * const szId = s_UTF8_GenericBaseID;
	return szId;
}

// Default constructor: owns a freshly default-constructed UTF-8 buffer.
UT_UTF8String::UT_UTF8String ()
	: pimpl(new UT_UTF8Stringbuf)
{
}

// Construct from a character buffer; sz and n are passed straight to the
// buffer's constructor (n == 0 means the input is NUL-terminated).
UT_UTF8String::UT_UTF8String (const char * sz, size_t n /* == 0 => null-termination */)
	: pimpl(new UT_UTF8Stringbuf(sz,n))
{
}

// Construct from a NUL-terminated string in the given encoding by
// converting it to UTF-8 with UT_convert.
//
// A null str, or a conversion that yields no buffer, produces an empty
// string.  (Previously a null str was passed to strlen.)
UT_UTF8String::UT_UTF8String (const char *str, const char *encoding) :
	pimpl(0)
{
	char *pUTF8Buf = 0;

	if (str)
	{
		UT_uint32 iRead = 0;
		UT_uint32 iWritten = 0;
		pUTF8Buf = UT_convert(str,
				      strlen(str),
				      encoding,
				      "UTF-8",
				      &iRead,
				      &iWritten);
	}

	if (pUTF8Buf)
		pimpl = new UT_UTF8Stringbuf(pUTF8Buf);
	else
		pimpl = new UT_UTF8Stringbuf;

	// The buffer has copied the text; release the conversion result.
	FREEP(pUTF8Buf);
}


// Copy constructor: the buffer object is duplicated, never shared.
UT_UTF8String::UT_UTF8String (const UT_UTF8String & rhs)
	: pimpl(new UT_UTF8Stringbuf(*rhs.pimpl))
{
}

// Construct from a UCS-4 string.  Nothing is appended when rhs is empty.
UT_UTF8String::UT_UTF8String (const UT_UCS4String & rhs)
	: pimpl(new UT_UTF8Stringbuf)
{
	if (rhs.size ())
	{
		appendUCS4 (rhs.ucs4_str (), rhs.size ());
	}
}

// Construct from a UCS-4 buffer; sz and n are handed to appendUCS4 as
// given.
UT_UTF8String::UT_UTF8String (const UT_UCS4Char * sz, size_t n)
	: pimpl(new UT_UTF8Stringbuf)
{
	appendUCS4 (sz, n);
}

// Destructor: releases the buffer object owned by this string.
UT_UTF8String::~UT_UTF8String ()
{
	delete pimpl;
}

// Returns the buffer's utf8Length() (see byteLength() for the byte count).
size_t UT_UTF8String::size () const
{
	const size_t nLength = pimpl->utf8Length ();
	return nLength;
}

// Returns the buffer's byteLength().
size_t UT_UTF8String::byteLength () const
{
	const size_t nBytes = pimpl->byteLength ();
	return nBytes;
}

// Debug aid: writes the string through UT_DEBUGMSG in pieces of at most 60
// bytes.  Compiles to an empty function unless DEBUG is set.  The split is
// by byte, so a piece may end in the middle of a multi-byte sequence.
void UT_UTF8String::dump (void) const
{
#if DEBUG
	char chunk[120];
	const char * pCur = utf8_str();
	while(pCur && *pCur)
	{
		// Copy up to 60 bytes, stopping early at the terminator.
		UT_sint32 n = 0;
		while((n < 60) && (*pCur != 0))
		{
			chunk[n] = *pCur;
			n++;
			pCur++;
		}
		chunk[n] = 0;
		UT_DEBUGMSG(("%s \n",chunk));
	}
#endif
}
// True when the underlying buffer reports itself as empty.
bool UT_UTF8String::empty () const
{
	const bool bIsEmpty = pimpl->empty ();
	return bIsEmpty;
}

// Empties the string.  Declared const, yet it modifies the buffer that
// pimpl points to (only the pointer itself is left untouched).
void UT_UTF8String::clear () const
{
	pimpl->clear ();
}

// Assign from a C string.  Assigning a null pointer empties the string.
UT_UTF8String &	UT_UTF8String::operator=(const char * rhs)
{
	if (rhs)
	{
		pimpl->assign (rhs);
	}
	else
	{
		// treat null string assignment as a clear
		pimpl->clear();
	}

	return *this;
}

// Copy assignment.  Self-assignment is detected and skipped.
UT_UTF8String &	UT_UTF8String::operator=(const UT_UTF8String & rhs)
{
	if (this == &rhs)
		return *this;

	*pimpl = *rhs.pimpl;
	return *this;
}


// Append a single UCS-4 character.
UT_UTF8String &	UT_UTF8String::operator+=(const UT_UCS4Char            rhs)
{
	const UT_UCS4Char * const pOne = &rhs;
	pimpl->appendUCS4 (pOne, 1);
	return *this;
}


// Append a C string.  A null rhs is rejected by UT_return_val_if_fail and
// the string is returned unchanged.
UT_UTF8String &	UT_UTF8String::operator+=(const char * rhs)
{
	UT_return_val_if_fail(rhs, *this);

	pimpl->append (rhs);

	return *this;
}

// Append another UTF-8 string.
//
// When a string is appended to itself the source is copied first, so the
// append never reads from the buffer it is modifying.  UT_String and
// UT_UCS4String take the same precaution in their operator+=.
UT_UTF8String &	UT_UTF8String::operator+=(const UT_UTF8String & rhs)
{
	if (this != &rhs) {
		pimpl->append (*rhs.pimpl);
	} else {
		UT_UTF8Stringbuf t(*rhs.pimpl);
		pimpl->append (t);
	}
	return *this;
}

// Read-only C string view.  An empty string yields the shared static empty
// string rather than the buffer's own data pointer.
const char * UT_UTF8String::utf8_str () const
{
	if (pimpl->utf8Length ())
		return pimpl->data();
	return pszEmpty;
}

void UT_UTF8String::assign (const char * sz, size_t n /* == 0 => null-termination */)
{
	pimpl->assign (sz, n);
}

void UT_UTF8String::append (const char * sz, size_t n /* == 0 => null-termination */)
{
	pimpl->append (sz, n);
}

void UT_UTF8String::appendBuf (const UT_ByteBuf & buf, UT_UCS4_mbtowc & converter)
{
	UT_uint32 i;
	UT_UCS4Char wc;
	const UT_Byte *ptr = buf.getPointer(0);
	
	for (i = 0; i < buf.getLength(); i++) {
		converter.mbtowc(wc, static_cast<char>(ptr[i])); 
		pimpl->appendUCS4(&wc, 1);
	}
}

void UT_UTF8String::appendUCS4 (const UT_UCS4Char * sz, size_t n /* == 0 => null-termination */)
{
	pimpl->appendUCS4 (sz, n);
}

void UT_UTF8String::appendUCS2 (const UT_UCS2Char * sz, size_t n /* == 0 => null-termination */)
{
	pimpl->appendUCS2 (sz, n);
}

/* Replaces <str1> with <str2> in the current string; the work is done by
 * the buffer's escape().  Returns this string.
 */
const UT_UTF8String & UT_UTF8String::escape (const UT_UTF8String & str1, const UT_UTF8String & str2)
{
	UT_UTF8Stringbuf * const pBuf = pimpl;
	pBuf->escape (str1, str2);
	return *this;
}

/* Escapes '<', '>' & '&' in the current string; the work is done by the
 * buffer's escapeXML().  Returns this string.
 */
const UT_UTF8String & UT_UTF8String::escapeXML ()
{
	UT_UTF8Stringbuf * const pBuf = pimpl;
	pBuf->escapeXML ();
	return *this;
}

/* Translates the current string to MIME "quoted-printable" format; the
 * work is done by the buffer's escapeMIME().  Returns this string.
 */
const UT_UTF8String & UT_UTF8String::escapeMIME ()
{
	UT_UTF8Stringbuf * const pBuf = pimpl;
	pBuf->escapeMIME ();
	return *this;
}

// Lower-case the string in place.  The buffer's lowerCase() hands back a
// new buffer, which replaces the current one; when it returns null, or the
// string has no bytes, nothing changes.
const UT_UTF8String & UT_UTF8String::lowerCase ()
{
	if (byteLength() == 0)
		return *this;

	UT_UTF8Stringbuf * const pLowered = pimpl->lowerCase ();
	if (!pLowered)
		return *this;

	delete pimpl;
	pimpl = pLowered;
	return *this;
}


// Return a copy of at most nChars units starting at offset iStart.
//
// An empty string is returned when iStart is at or past the end, or when
// nChars is 0.  A request that runs past the end is clamped.
//
// NOTE(review): the bounds come from utf8Length(), while iStart is applied
// as a plain offset into data() - confirm whether both are meant to be
// byte counts or character counts for multi-byte text.
UT_UTF8String  UT_UTF8String::substr(size_t iStart, size_t nChars) const
{
	const size_t nSize = pimpl->utf8Length ();

	if (iStart >= nSize || !nChars) {
		return UT_UTF8String();
	}

	// iStart < nSize here, so this subtraction cannot wrap.  Comparing
	// nChars against the remainder (rather than testing iStart + nChars)
	// keeps the clamp effective even for huge nChars values, where the
	// addition would overflow and let an over-long length through.
	const size_t nAvail = nSize - iStart;
	if (nChars > nAvail) {
		nChars = nAvail;
	}

	return UT_UTF8String(pimpl->data() + iStart, nChars);
}

///////////////////////////////////////////////////////////////////////////
//
// Martin's property string functions for UT_UTF8Strings.....
//
///////////////////////////////////////////////////////////////////////////

/*!
 * Assuming a string of standard abiword properties eg. "fred:nerk; table-width:1.0in; table-height:10.in"
 * Return the value of the property sProp, or an empty string if it is not present.
 * The result is returned by value; there is nothing for the caller to free.
 *
 * NOTE(review): the property is located with a plain strstr() for
 * "<sProp>:", so a name that is the tail of another name (e.g. "width"
 * inside "table-width") matches too - confirm whether callers rely on this.
 */
UT_UTF8String UT_UTF8String_getPropVal(const UT_UTF8String & sPropertyString, const UT_UTF8String & sProp)
{
	// The search key is the property name immediately followed by ':'.
	UT_UTF8String sKey(sProp);
	sKey += ":";

	const char * const szKey = sKey.utf8_str();
	const char * const szAll = sPropertyString.utf8_str();
	const char * const szHit = strstr(szAll, szKey);
	if (!szHit)
		return UT_UTF8String();

	// Offset of the first character after "name:".
	UT_sint32 iValStart = static_cast<UT_sint32>(szHit - szAll);
	iValStart += strlen(szKey);

	// Offset one past the last character of the value.
	UT_sint32 iValEnd = 0;
	const char * szDelim = strchr(szHit, ';');
	if (!szDelim)
	{
		// Last property in the string: the value runs to the end, minus
		// any trailing spaces.
		iValEnd = strlen(szAll);
		while (iValEnd > 0 && szAll[iValEnd - 1] == ' ')
			--iValEnd;
	}
	else
	{
		// Step back from the ';' over any ';' and ' ' characters so the
		// pointer rests on the last character of the value.
		while (*szDelim == ';' || *szDelim == ' ')
			--szDelim;
		iValEnd = static_cast<UT_sint32>(szDelim - szAll) + 1;
	}

	return UT_UTF8String(sPropertyString.substr(iValStart, (iValEnd - iValStart)));
}
/*!
 * Assuming a string of standard abiword properties eg. "fred:nerk; table-width:1.0in; table-height:10.in"
 * Add another property string, updating previously defined properties with
 * values in the new string.
 *
 * sNewProp is walked as a sequence of "name:value" items separated by ';'.
 * Each item is applied through UT_UTF8String_setProperty.  Processing
 * stops at the first item without a ':', or whose name or value is empty.
 *
 * NOTE(review): the loop bound is sNewProp.size() while positions are
 * advanced by byte distances - confirm this is right for multi-byte text.
 */
void UT_UTF8String_addPropertyString(UT_UTF8String & sPropertyString, const UT_UTF8String & sNewProp)
{
	const UT_sint32 iTotal = static_cast<UT_sint32>(sNewProp.size());
	UT_sint32 iPos = 0;
	UT_UTF8String sName;
	UT_UTF8String sValue;
	UT_UTF8String sRest;

	while (iPos < iTotal)
	{
		// Name: everything from iPos up to the next ':'.
		sRest = sNewProp.substr(iPos, iTotal - iPos);
		const char * szRest = sRest.utf8_str();
		const char * szMark = strstr(szRest, ":");
		if (!szMark)
			break;
		sName = sNewProp.substr(iPos, szMark - szRest);
		iPos += szMark - szRest + 1;

		// Value: up to the next ';', or to the end for the final item.
		sRest = sNewProp.substr(iPos, iTotal - iPos);
		szRest = sRest.utf8_str();
		szMark = strstr(szRest, ";");
		const bool bLastItem = (szMark == NULL);
		if (bLastItem)
		{
			sValue = sNewProp.substr(iPos, iTotal - iPos);
		}
		else
		{
			sValue = sNewProp.substr(iPos, szMark - szRest);
			iPos += szMark - szRest + 1;
		}

		if ((sName.size() == 0) || (sValue.size() == 0))
			break;

		UT_UTF8String_setProperty(sPropertyString, sName, sValue);

		if (bLastItem)
			break;
	}
}

/*!
 * Assuming a string of standard abiword properties eg. "fred:nerk; table-width:1.0in; table-height:10.in"
 * Set property sProp to sVal.  Any existing entry for sProp is removed
 * first, then "sProp:sVal" is appended at the end, preceded by "; " when
 * other properties remain.
 */
void UT_UTF8String_setProperty(UT_UTF8String & sPropertyString, const UT_UTF8String & sProp, const UT_UTF8String & sVal)
{
	// Drop the old entry, if there is one.
	UT_UTF8String_removeProperty(sPropertyString, sProp);

	// A separator is only needed when something is left in front.
	const bool bNeedSeparator = (sPropertyString.size() > 0);
	if (bNeedSeparator)
		sPropertyString += "; ";

	sPropertyString += sProp;
	sPropertyString += ":";
	sPropertyString += sVal;
}

/*!
 * Assuming a string of standard abiword properties eg. "fred:nerk; table-width:1.0in; table-height:10.in"
 * Remove the property sProp and its value from the string of properties.
 * The string is left unchanged when "<sProp>:" does not occur in it.
 *
 * NOTE(review): the property is located with a plain strstr() for
 * "<sProp>:", so a name that is the tail of another name matches too -
 * confirm whether callers rely on this.
 */
void UT_UTF8String_removeProperty(UT_UTF8String & sPropertyString, const UT_UTF8String & sProp)
{
//
// Warning, warning!!! lots of brutal const casts and assignments into
// strings to handle utf8 encoding.
//
	// Search key is the property name immediately followed by ':'.
	UT_UTF8String sWork ( sProp );
	sWork += ":";
	const char * szWork = sWork.utf8_str();
	const char * szProps = sPropertyString.utf8_str();
	const char * szLoc = strstr(szProps,szWork);
	if(szLoc == NULL)
	{
//
// Not here, do nothing
		return ;
	}
//
// Found it, Get left part.
//
	// locLeft is the byte offset of the match within szProps.
	UT_sint32 locLeft = static_cast<UT_sint32>(reinterpret_cast<size_t>(szLoc) - reinterpret_cast<size_t>(szProps));
	UT_UTF8String sLeft;
	if(locLeft == 0)
	{
		sLeft.clear();
	}
	else
	{
		// Copy the whole property string, cut the copy off at the match by
		// writing a terminator into its buffer, then assign the shortened
		// text to sLeft.
		UT_UTF8String sTmp =  sPropertyString;
		char * szTmp = const_cast<char *>(sTmp.utf8_str());
		szTmp[locLeft] = 0; 
		sLeft = szTmp;
	}
	char * szLeft = const_cast<char *>(sLeft.utf8_str());
	// Step to the last byte in front of the match (-1 when the match is at
	// the very start).
	locLeft--;
	if(locLeft > 0)
	{
//
// If this element is the last item in the properties there is no "; ".
//
// Remove trailing ';' and ' '
//
		while(locLeft >= 0 && (szLeft[locLeft] == ';' || szLeft[locLeft] == ' '))
		{
			locLeft--;
		}
	}
	UT_UTF8String sNew;
	if(locLeft > 0)
	{
		// Terminate sLeft's buffer just after the last kept byte and take
		// that as the start of the result.
		szLeft[locLeft+1] = 0;
		sNew = szLeft;
	}
	else
	{
		sNew.clear();
	}
//
// Look for ";" to get right part
//
	const char * szDelim = strchr(szLoc,';');
	if(szDelim == NULL)
	{
//
// No properties after this, just assign and return
//
		sPropertyString = sNew;
	}
	else
	{
//
// Just slice off the properties and tack them onto the pre-existing sNew
//
		// Skip the separator characters following the removed property.
		while(*szDelim == ';' || *szDelim == ' ')
		{
			szDelim++;
		}
		UT_UTF8String sRight = szDelim;
		if(sNew.size() > 0)
		{
			sNew += "; ";
		}
		sNew += sRight;
		sPropertyString = sNew;
	}
}

/////////////////////////////////////////////////////////////////////////////

// Convert to a UCS-4 string.  Characters are pulled one at a time with
// UT_UCS4Stringbuf::UTF8_to_UCS4 until it returns 0.
UT_UCS4String UT_UTF8String::ucs4_str ()
{
	UT_UCS4String sResult;

	const char * pBytes = pimpl->data ();
	size_t nLeft = pimpl->byteLength ();

	for (UT_UCS4Char wc = UT_UCS4Stringbuf::UTF8_to_UCS4 (pBytes, nLeft);
	     wc != 0;
	     wc = UT_UCS4Stringbuf::UTF8_to_UCS4 (pBytes, nLeft))
	{
		sResult += wc;
	}
	return sResult;
}

// Equality by strcmp on the two UTF-8 C strings.
bool operator==(const UT_UTF8String& s1, const UT_UTF8String& s2)
{
	const int iCmp = strcmp(s1.utf8_str(), s2.utf8_str());
	return iCmp == 0;
}

// Inequality by strcmp on the two UTF-8 C strings.
bool operator!=(const UT_UTF8String& s1, const UT_UTF8String& s2)
{
	const int iCmp = strcmp(s1.utf8_str(), s2.utf8_str());
	return iCmp != 0;
}

// Equality against a C string.  A null s2 never compares equal.
bool operator==(const UT_UTF8String& s1, const char * s2)
{
	if (!s2)
		return false;
	return strcmp(s1.utf8_str(), s2) == 0;
}

// Inequality against a C string.  A null s2 always compares unequal.
bool operator!=(const UT_UTF8String& s1, const char * s2)
{
	if (!s2)
		return true;
	return strcmp(s1.utf8_str(), s2) != 0;
}

// Concatenation: returns a new string holding s1 followed by s2.
UT_UTF8String operator+(const UT_UTF8String & s1, const UT_UTF8String & s2)
{
	UT_UTF8String sJoined(s1);
	sJoined += s2;
	return sJoined;
}

// printf-style formatting returning a UT_UTF8String.  The text is first
// formatted into a UT_String and then used to construct the result.
UT_UTF8String UT_UTF8String_sprintf(const char * inFormat, ...)
{
  UT_String sFormatted ("");
  va_list ap;

  va_start (ap, inFormat);
  UT_String_vprintf (sFormatted, inFormat, ap);
  va_end (ap);

  // create & return a validated UTF-8 string based on the input
  return UT_UTF8String(sFormatted.c_str());
}

// printf-style formatting into inStr, replacing its contents.  The text is
// first formatted into a UT_String and then assigned to inStr.
UT_UTF8String & UT_UTF8String_sprintf(UT_UTF8String & inStr, const char * inFormat, ...)
{
  UT_String sFormatted ("");
  va_list ap;

  va_start (ap, inFormat);
  UT_String_vprintf (sFormatted, inFormat, ap);
  va_end (ap);

  // create a validated UTF-8 string based on the input
  inStr = sFormatted.c_str();
  return inStr;
}

////////////////////////////////////////////////////////////////////////
//
//  UCS-4 string
//
//  String is built of 32-bit units (longs)
//
//  NOTE: Ambiguity between UCS-2 and UTF-16 above makes no difference
//  NOTE:  in the case of UCS-4 and UTF-32 since they really are
//  NOTE:  identical
//
////////////////////////////////////////////////////////////////////////

// Default constructor: owns a freshly default-constructed UCS-4 buffer.
UT_UCS4String::UT_UCS4String()
	: pimpl(new UT_UCS4Stringbuf)
{
}

// Construct from a UCS-4 buffer.  A length of 0 means "measure sz with
// UT_UCS4_strlen"; a null sz combined with n == 0 requests a length of 0.
UT_UCS4String::UT_UCS4String(const UT_UCS4Char* sz, size_t n)
	: pimpl(new UT_UCS4Stringbuf(sz, (n != 0) ? n : ((sz) ? UT_UCS4_strlen(sz) : 0)))
{
}

// Copy constructor: the buffer object is duplicated, never shared.
UT_UCS4String::UT_UCS4String(const UT_UCS4String& rhs)
	: pimpl(new UT_UCS4Stringbuf(*rhs.pimpl))
{
}

/* Construct from a string in UTF-8 format.
 * With bytelength == 0 the input is measured with strlen, and a null
 * pointer yields an empty string.  Characters are appended until
 * UTF8_to_UCS4 returns 0.
 */
UT_UCS4String::UT_UCS4String(const char * utf8_str, size_t bytelength /* 0 == zero-terminate */)
	: pimpl(new UT_UCS4Stringbuf)
{
	if (bytelength == 0) {
		if (utf8_str == 0)
			return;
		bytelength = strlen (utf8_str);
	}

	UT_UCS4Char wc;
	while ((wc = UT_UCS4Stringbuf::UTF8_to_UCS4 (utf8_str, bytelength)) != 0) {
		pimpl->append (&wc, 1);
	}
}

/* construct from a string in UTF-8 format
 * if (strip_whitespace == true) replace all white space sequences with a single UCS_SPACE
 * if (strip_whitespace != true) replace CR-LF & CR by LF
 * non-breaking spaces (&nbsp; UCS_NBSP) are not white space; see UT_UCS4_isspace()
 *
 * NOTE(review): this comment used to give UCS_NBSP as 0x0a, which is the
 * value of LF; the Unicode no-break space is U+00A0 - confirm against the
 * definition of UCS_NBSP.
 */
UT_UCS4String::UT_UCS4String(const char * utf8_str, size_t bytelength /* 0 == zero-terminate */, bool strip_whitespace)
:	pimpl(new UT_UCS4Stringbuf)
{
	if (bytelength == 0) {
		if (utf8_str == 0) return;
		bytelength = strlen (utf8_str);
	}
	// ucs4a is the character being decided on, ucs4b the one after it.
	UT_UCS4Char ucs4a = UT_UCS4Stringbuf::UTF8_to_UCS4 (utf8_str, bytelength);
	while (true) {
		if (ucs4a == 0) break; // end-of-string
		UT_UCS4Char ucs4b = UT_UCS4Stringbuf::UTF8_to_UCS4 (utf8_str, bytelength);
		if (UT_UCS4_isspace (ucs4a)) {
			if (strip_whitespace) {
				// Inside a run of white space nothing is emitted and
				// ucs4a stays as it is; one UCS_SPACE is written once
				// the following character is not white space.
				if (!UT_UCS4_isspace (ucs4b)) {
					ucs4a = UCS_SPACE;
					pimpl->append (&ucs4a, 1);
					ucs4a = ucs4b;
				}
			} else if (ucs4a == UCS_CR) {
				if (ucs4b == UCS_LF) {
					// CR-LF: drop the CR; the LF is handled on the
					// next pass.
					ucs4a = ucs4b;
				} else {
					// lone CR: emit LF in its place
					ucs4a = UCS_LF;
					pimpl->append (&ucs4a, 1);
					ucs4a = ucs4b;
				}
			} else {
				pimpl->append (&ucs4a, 1);
				ucs4a = ucs4b;
			}
		} else {
			pimpl->append (&ucs4a, 1);
			ucs4a = ucs4b;
		}
	}
}

// Destructor: releases the buffer object owned by this string.
UT_UCS4String::~UT_UCS4String()
{
	delete pimpl;
}


//////////////////////////////////////////////////////////////////
// accessors

// Length of the string as reported by the underlying buffer.
size_t UT_UCS4String::size() const
{
	const size_t nLength = pimpl->size();
	return nLength;
}

// True when the underlying buffer reports itself as empty.
bool UT_UCS4String::empty() const
{
	const bool bIsEmpty = pimpl->empty();
	return bIsEmpty;
}

// Empties the string.  Declared const, yet it modifies the buffer that
// pimpl points to (only the pointer itself is left untouched).
void UT_UCS4String::clear() const
{
	pimpl->clear();
}

// Return a copy of at most nChars characters starting at index iStart.
//
// An empty string is returned when iStart is at or past the end, or when
// nChars is 0.  A request that runs past the end is clamped to the
// available tail.
UT_UCS4String UT_UCS4String::substr(size_t iStart, size_t nChars) const
{
	const size_t nSize = pimpl->size();

	if (iStart >= nSize || !nChars) {
		return UT_UCS4String();
	}

	// iStart < nSize here, so this subtraction cannot wrap.  Comparing
	// nChars against the remainder (rather than testing iStart + nChars)
	// keeps the clamp effective even for huge nChars values, where the
	// addition would overflow and let an over-long length through.
	const size_t nAvail = nSize - iStart;
	if (nChars > nAvail) {
		nChars = nAvail;
	}

	return UT_UCS4String(pimpl->data() + iStart, nChars);
}

// Read-only UCS-4 view.  An empty string yields the shared static empty
// string rather than the buffer's own data pointer.
const UT_UCS4Char* UT_UCS4String::ucs4_str() const
{
	if (pimpl->size())
		return pimpl->data();
	return ucs4Empty;
}

// UTF-8 view obtained from the buffer's utf8_data(); an empty string
// yields the shared static empty string instead.
const char* UT_UCS4String::utf8_str()
{
	if (pimpl->size())
		return pimpl->utf8_data();
	return pszEmpty;
}

//////////////////////////////////////////////////////////////////
// mutators

// Copy assignment.  Self-assignment is detected and skipped.
UT_UCS4String& UT_UCS4String::operator=(const UT_UCS4String& rhs)
{
	if (this == &rhs)
		return *this;

	*pimpl = *rhs.pimpl;
	return *this;
}

// Assign from a 0-terminated UCS-4 string.  A null rhs is rejected by
// UT_return_val_if_fail and the string is returned unchanged.
UT_UCS4String& UT_UCS4String::operator=(const UT_UCS4Char* rhs)
{
	UT_return_val_if_fail(rhs, *this);

	const size_t nLen = UT_UCS4_strlen(rhs);
	pimpl->assign(rhs, nLen);
	return *this;
}

// Append another string.  When a string is appended to itself the source
// is copied first, so the append never reads from the buffer it is
// modifying.
UT_UCS4String& UT_UCS4String::operator+=(const UT_UCS4String& rhs)
{
	if (this == &rhs) {
		UT_UCS4Stringbuf snapshot(*rhs.pimpl);
		pimpl->append(snapshot);
		return *this;
	}

	pimpl->append(*rhs.pimpl);
	return *this;
}

// Append a 0-terminated UCS-4 string.  A null rhs is rejected by
// UT_return_val_if_fail and the string is returned unchanged.
UT_UCS4String& UT_UCS4String::operator+=(const UT_UCS4Char* rhs)
{
	UT_return_val_if_fail(rhs, *this);

	const size_t nLen = UT_UCS4_strlen(rhs);
	pimpl->append(rhs, nLen);
	return *this;
}

// Append a single UCS-4 character.
UT_UCS4String& UT_UCS4String::operator+=(UT_UCS4Char rhs)
{
	// rhs is a by-value parameter, so its address is a valid source of
	// exactly one character.
	pimpl->append(&rhs, 1);
	return *this;
}

// TODO What encoding do these functions think the 8-bit
// TODO  character is in?  ASCII?  ISO-8859-1?  System encoding?
// TODO  any old 8-bit single-byte or multibyte encoding?

// Append one 8-bit character by forwarding to the unsigned char overload.
UT_UCS4String& UT_UCS4String::operator+=(char rhs)
{
  const unsigned char uc = static_cast<unsigned char>(rhs);
  return this->operator+=(uc);
}

// Append one 8-bit character.  The byte is placed in a two-element C
// string, widened with UT_UCS4_strcpy_char, and the first resulting
// character is appended.
UT_UCS4String& UT_UCS4String::operator+=(unsigned char rhs)
{
	char szNarrow[2];
	szNarrow[0] = static_cast<char>(rhs);
	szNarrow[1] = 0;

	UT_UCS4Char szWide[2];
	UT_UCS4_strcpy_char (szWide, szNarrow);

	pimpl->append(szWide, 1);
	return *this;
}

// Exchange contents with rhs by swapping the two buffer pointers; no
// character data is copied.
void UT_UCS4String::swap(UT_UCS4String& rhs)
{
	UT_UCS4Stringbuf* const pMine = pimpl;
	UT_UCS4Stringbuf* const pTheirs = rhs.pimpl;

	pimpl = pTheirs;
	rhs.pimpl = pMine;
}


//////////////////////////////////////////////////////////////////
// End of class members, start of free functions
//////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////
// Helpers

// Equality by UT_UCS4_strcmp on the two UCS-4 strings.
bool operator==(const UT_UCS4String& s1, const UT_UCS4String& s2)
{
	if (UT_UCS4_strcmp(s1.ucs4_str(), s2.ucs4_str()) == 0)
		return true;
	return false;
}

// Equality against a raw UCS-4 string, by UT_UCS4_strcmp.  s2 is passed to
// UT_UCS4_strcmp as given.
bool operator==(const UT_UCS4String& s1, const UT_UCS4Char* s2)
{
	if (UT_UCS4_strcmp(s1.ucs4_str(), s2) == 0)
		return true;
	return false;
}

// Mirror overload: delegates to operator==(UT_UCS4String, const UT_UCS4Char*).
bool operator==(const UT_UCS4Char* s1, const UT_UCS4String& s2)
{
	const bool bSame = (s2 == s1);
	return bSame;
}

// Inequality, defined as the negation of operator==.
bool operator!=(const UT_UCS4String& s1, const UT_UCS4String& s2)
{
	if (s1 == s2)
		return false;
	return true;
}

// Inequality against a raw UCS-4 string, defined as the negation of
// operator==.
bool operator!=(const UT_UCS4String& s1, const UT_UCS4Char*  s2)
{
	if (s1 == s2)
		return false;
	return true;
}

// Mirror overload: negation of operator==(UT_UCS4String, const UT_UCS4Char*).
bool operator!=(const UT_UCS4Char* s1, const UT_UCS4String& s2)
{
	if (s2 == s1)
		return false;
	return true;
}

// Ordering by UT_UCS4_strcmp only, provided so UT_UCS4String can be used
// as a key in ordered containers.
bool operator<(const UT_UCS4String& s1, const UT_UCS4String& s2)
{
	if (UT_UCS4_strcmp(s1.ucs4_str(), s2.ucs4_str()) < 0)
		return true;
	return false;
}

// Concatenation: returns a new string holding s1 followed by s2.
UT_UCS4String operator+(const UT_UCS4String& s1, const UT_UCS4String& s2)
{
	UT_UCS4String sJoined(s1);
	sJoined += s2;
	return sJoined;
}

// Read-only element access.
//
// Index size() yields 0.  An index beyond size() is a caller error and
// trips the assertion; it also yields 0 rather than reading outside the
// buffer when the assertion is compiled out.
UT_UCS4Char UT_UCS4String::operator[](size_t iPos) const
{
	UT_ASSERT(iPos <= size());
	if (iPos >= size())
		return '\0';
	return pimpl->data()[iPos];
}

// Writable element access.  The assertion allows indices up to and
// including size(); nothing else is checked here.
UT_UCS4Char& UT_UCS4String::operator[](size_t iPos)
{
	UT_ASSERT(iPos <= size());
	UT_UCS4Char * const pData = pimpl->data();
	return pData[iPos];
}