/* AbiWord
 * Copyright (C) 1998 AbiSource, Inc.
 * 
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  
 * 02111-1307, USA.
 */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_LIBXML2
#include <glib.h>
#endif

#include "ut_types.h"
#include "ut_assert.h"
#include "ut_debugmsg.h"
#include "ut_string.h"
#include "ie_imp_XML.h"
#include "ie_types.h"
#include "pd_Document.h"
#include "ut_bytebuf.h"

#include "xap_EncodingManager.h"
/*****************************************************************/	
/*****************************************************************/	

#define X_TestParseState(ps)	((m_parseState==(ps)))

#define X_VerifyParseState(ps)	do {  if (!(X_TestParseState(ps)))			\
									  {  m_error = UT_IE_BOGUSDOCUMENT;	\
										 return; } } while (0)

#define X_CheckDocument(b)		do {  if (!(b))								\
									  {  m_error = UT_IE_BOGUSDOCUMENT;	\
										 return; } } while (0)

#define X_CheckError(v)			do {  if (!(v))								\
									  {  m_error = UT_ERROR;			\
										 return; } } while (0)

#define	X_EatIfAlreadyError()	do {  if (m_error) return; } while (0)

/*****************************************************************
******************************************************************
** C-style callback functions that we register with the XML parser
******************************************************************
*****************************************************************/

#ifndef HAVE_LIBXML2
static void startElement(void *userData, const XML_Char *name, const XML_Char **atts)
{
	IE_Imp_XML* pDocReader = (IE_Imp_XML*) userData;
	pDocReader->_startElement(name, atts);
}

static void endElement(void *userData, const XML_Char *name)
{
	IE_Imp_XML* pDocReader = (IE_Imp_XML*) userData;
	pDocReader->_endElement(name);
}

static void charData(void* userData, const XML_Char *s, int len)
{
	IE_Imp_XML* pDocReader = (IE_Imp_XML*) userData;
	pDocReader->_charData(s, len);
}
#endif /* HAVE_LIBXML2 */

/*****************************************************************/
/*****************************************************************/

UT_Bool IE_Imp_XML::_openFile(const char * szFilename) 
{
    m_fp = fopen(szFilename, "r");
    return (m_fp != NULL);
}

UT_uint32 IE_Imp_XML::_readBytes(char * buf, UT_uint32 length) 
{
    return fread(buf, 1, length, m_fp);
}

void IE_Imp_XML::_closeFile(void) 
{
    if (m_fp) {
	fclose(m_fp);
    }
}

UT_Error IE_Imp_XML::importFile(const char * szFilename)
{
#ifdef HAVE_LIBXML2
	xmlDocPtr dok = xmlParseFile(szFilename);
	if (dok == NULL)
	  {
	    UT_DEBUGMSG(("Could not open and parse file %s\n",
			 szFilename));
	    m_error = UT_IE_FILENOTFOUND;
	  }
	else
	  {
	    xmlNodePtr node = xmlDocGetRootElement(dok);
	    _scannode(dok,node,0);
	    xmlFreeDoc(dok);
	    m_error = UT_OK;
	  }
#else
	XML_Parser parser = NULL;
	int done = 0;
	char buf[4096];

	if (!_openFile(szFilename))
	{
		UT_DEBUGMSG(("Could not open file %s\n",szFilename));
		m_error = UT_IE_FILENOTFOUND;
		goto Cleanup;
	}
	
	parser = XML_ParserCreate(NULL);
	XML_SetUserData(parser, this);
	XML_SetElementHandler(parser, startElement, endElement);
	XML_SetCharacterDataHandler(parser, charData);
	XML_SetUnknownEncodingHandler(parser,XAP_EncodingManager::XAP_XML_UnknownEncodingHandler,NULL);

	while (!done)
	{
		size_t len = _readBytes(buf, sizeof(buf));
		done = (len < sizeof(buf));

#if 1
        // TODO - remove this then not needed anymore. In ver 0.7.7 and erlier, AbiWord export inserted 
        // chars below 0x20. Most of these are invalid XML and can't be imported.
        // See bug #762.
        for( UT_uint32 n1 = 0; n1 < len; n1++ )
	        if( buf[n1] >= 0x00 && buf[n1] < 0x20 && buf[n1] != 0x09 && buf[n1] != 0x0a && buf[n1] != 0x0d )
		        buf[n1] = 0x0d;
#endif

		if (!XML_Parse(parser, buf, len, done)) 
		{
			UT_DEBUGMSG(("%s at line %d\n",
						 XML_ErrorString(XML_GetErrorCode(parser)),
						 XML_GetCurrentLineNumber(parser)));
			m_error = UT_IE_BOGUSDOCUMENT;
			goto Cleanup;
		}

		if (m_error)
		{
			UT_DEBUGMSG(("Problem reading document\n"));
			goto Cleanup;
		}
	} 
	
	m_error = UT_OK;

Cleanup:
	if (parser)
		XML_ParserFree(parser);
	_closeFile();
#endif /* HAVE_LIBXML2 */
	if(m_error ==  UT_IE_BOGUSDOCUMENT)
	  {
	    UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
	  }
	return m_error;
}

/*****************************************************************/
/*****************************************************************/

IE_Imp_XML::~IE_Imp_XML()
{
	FREEP(m_currentDataItemName);
	FREEP(m_currentDataItemMimeType);
}

IE_Imp_XML::IE_Imp_XML(PD_Document * pDocument, UT_Bool whiteSignificant)
	: IE_Imp(pDocument)
{
	m_error = UT_OK;
	m_parseState = _PS_Init;
	m_lenCharDataSeen = 0;
	m_lenCharDataExpected = 0;
	m_bSeenCR = UT_FALSE;
	m_bWhiteSignificant = whiteSignificant;
	m_bWasSpace = UT_FALSE;

	m_currentDataItemName = NULL;
	m_currentDataItemMimeType = NULL;
}

/*****************************************************************/
/*****************************************************************/

void IE_Imp_XML::_charData(const XML_Char *s, int len)
{
	// TODO XML_Char is defined in the xml parser
	// TODO as a 'char' not as a 'unsigned char'.
	// TODO does this cause any problems ??
	
	X_EatIfAlreadyError();				// xml parser keeps running until buffer consumed

	switch (m_parseState)
	{
	default:
		{
			xxx_UT_DEBUGMSG(("charData DISCARDED [length %d]\n",len));
			return;
		}
		
     case _PS_Field:
         {
            // discard contents of the field - force recalculation
            // this gives us a higher chance of correcting fields 
            // with the wrong values
            return;
        }
 	
	case _PS_Block:
		{
			UT_ASSERT(sizeof(XML_Char) == sizeof(UT_Byte));
			UT_ASSERT(sizeof(XML_Char) != sizeof(UT_UCSChar));

			// parse UTF-8 text and convert to Unicode.
			// also take care of some white-space issues:
			//    [] convert CRLF to SP.
			//    [] convert CR to SP.
			//    [] convert LF to SP.

			UT_Byte * ss = (UT_Byte *)s;
			UT_Byte currentChar;
			UT_UCSChar buf[1024];
			int bufLen = 0;

			for (int k=0; k<len; k++)
			{
				if (bufLen == NrElements(buf))		// pump it out in chunks
				{
					X_CheckError(m_pDocument->appendSpan(buf,bufLen));
					bufLen = 0;
				}

				currentChar = ss[k];

				if ((ss[k] < 0x80) && (m_lenCharDataSeen > 0))
				{
					// is it us-ascii and we are in a UTF-8
					// multi-byte sequence.  puke.
					X_CheckError(0);
				}

				if (currentChar == UCS_CR)
				{
					buf[bufLen++] = UCS_SPACE;		// substitute a SPACE
					m_bSeenCR = UT_TRUE;
					continue;
				}

				// only honor one space
				// if !m_bWhiteSignificant (XHTML, WML)
				// else just blissfully ignore everything
				// (ABW)
				if (!m_bWhiteSignificant)
				{
				  if(UT_UCS_isspace(currentChar))
				    {
				      if(!m_bWasSpace)
					{
					  buf[bufLen++] = UCS_SPACE;
					  m_bWasSpace = UT_TRUE;
					}
				      continue;
				    }
				  else
				    {
				      m_bWasSpace = UT_FALSE;
				    }
				}

				if (currentChar == UCS_LF)				// LF
				{
					if (!m_bSeenCR)					// if not immediately after a CR,
						buf[bufLen++] = UCS_SPACE;	// substitute a SPACE.  otherwise, eat.
					m_bSeenCR = UT_FALSE;
					continue;
				}
				
				m_bSeenCR = UT_FALSE;

				if (currentChar < 0x80)					// plain us-ascii part of latin-1
				{
					buf[bufLen++] = ss[k];			// copy as is.
				}
				else if ((currentChar & 0xf0) == 0xf0)	// lead byte in 4-byte surrogate pair
				{
					// surrogate pairs are defined in section 3.7 of the
					// unicode standard version 2.0 as an extension
					// mechanism for rare characters in future extensions
					// of the unicode standard.
					UT_ASSERT(m_lenCharDataSeen == 0);
					UT_ASSERT(UT_NOT_IMPLEMENTED);
				}
				else if ((currentChar & 0xe0) == 0xe0)		// lead byte in 3-byte sequence
				{
					UT_ASSERT(m_lenCharDataSeen == 0);
					m_lenCharDataExpected = 3;
					m_charDataSeen[m_lenCharDataSeen++] = currentChar;
				}
				else if ((currentChar & 0xc0) == 0xc0)		// lead byte in 2-byte sequence
				{
					UT_ASSERT(m_lenCharDataSeen == 0);
					m_lenCharDataExpected = 2;
					m_charDataSeen[m_lenCharDataSeen++] = currentChar;
				}
				else if ((currentChar & 0x80) == 0x80)		// trailing byte in multi-byte sequence
				{
					UT_ASSERT(m_lenCharDataSeen > 0);
					m_charDataSeen[m_lenCharDataSeen++] = currentChar;
					if (m_lenCharDataSeen == m_lenCharDataExpected)
					{
						buf[bufLen++] = UT_decodeUTF8char(m_charDataSeen,m_lenCharDataSeen);
						m_lenCharDataSeen = 0;
					}
				}
			}

			// flush out the last piece of a buffer

			if (bufLen > 0)
				X_CheckError(m_pDocument->appendSpan(buf,bufLen));
			return;
		}

	case _PS_DataItem:
		{
#define MyIsWhite(c)			(((c)==' ') || ((c)=='\t') || ((c)=='\n') || ((c)=='\r'))
			
		   
		   	if (m_currentDataItemEncoded)
		        {

			   	// DataItem data consists of Base64 encoded data with
			   	// white space added for readability.  strip out any
			   	// white space and put the rest in the ByteBuf.

			   	UT_ASSERT((sizeof(XML_Char) == sizeof(UT_Byte)));
			
			   	const UT_Byte * ss = (UT_Byte *)s;
			   	const UT_Byte * ssEnd = ss + len;
			   	while (ss < ssEnd)
			   	{
					while ((ss < ssEnd) && MyIsWhite(*ss))
						ss++;
					UT_uint32 k=0;
					while ((ss+k < ssEnd) && ( ! MyIsWhite(ss[k])))
						k++;
					if (k > 0)
						m_currentDataItem.ins(m_currentDataItem.getLength(),ss,k);

					ss += k;
			   	}

			   	return;
			}
		   	else
		        {
			   	m_currentDataItem.append((UT_Byte*)s, len);
			}
#undef MyIsWhite
		}
	}
}

/*****************************************************************/
/*****************************************************************/

UT_uint32 IE_Imp_XML::_getInlineDepth(void) const
{
	return m_stackFmtStartIndex.getDepth();
}

UT_Bool IE_Imp_XML::_pushInlineFmt(const XML_Char ** atts)
{
	UT_uint32 start = m_vecInlineFmt.getItemCount()+1;
	UT_uint32 k;

	for (k=0; (atts[k]); k++)
	{
		XML_Char * p;
		if (!UT_XML_cloneString(p,atts[k]))
			return UT_FALSE;
		if (m_vecInlineFmt.addItem(p)!=0)
			return UT_FALSE;
	}
	if (!m_stackFmtStartIndex.push((void*)start))
		return UT_FALSE;
	return UT_TRUE;
}

void IE_Imp_XML::_popInlineFmt(void)
{
	UT_uint32 start;
	if (!m_stackFmtStartIndex.pop((void **)&start))
		return;
	UT_uint32 k;
	UT_uint32 end = m_vecInlineFmt.getItemCount();
	for (k=end; k>=start; k--)
	{
		const XML_Char * p = (const XML_Char *)m_vecInlineFmt.getNthItem(k-1);
		m_vecInlineFmt.deleteNthItem(k-1);
		if (p)
			free((void *)p);
	}
}

const XML_Char * IE_Imp_XML::_getXMLPropValue(const XML_Char *name, 
					      const XML_Char ** atts)
{
  // find the 'name="value"' pair and return the "value".
  // ignore everything else

  // quick out
  if(!name || !atts)
    return NULL;

  for (const XML_Char ** a = atts; (*a); a++)
    if(a[0] && (UT_XML_stricmp(a[0],name) == 0))
      return a[1];
  
  return NULL;
}

void IE_Imp_XML::pasteFromBuffer(PD_DocumentRange * pDocRange,
				       unsigned char * pData, UT_uint32 lenData)
{
	UT_ASSERT(UT_NOT_IMPLEMENTED);
}

#ifdef HAVE_LIBXML2
void IE_Imp_XML::_scannode(xmlDocPtr dok, xmlNodePtr cur, int c)
{
  while (cur != NULL)
    {
      if (strcmp("text", (char*) cur->name) == 0)
	{
	  XML_Char* s = cur->content; // xmlNodeListGetString(dok, cur, 1);
	  _charData(s, strlen((char*) s));
	}
      else
	{
	  XML_Char *prop = NULL;
	  const XML_Char* props[3] = { NULL, NULL, NULL };
	  if (cur->properties)
	    {
	      props[0] = cur->properties->name;
	      props[1] = cur->properties->children->content;
	    }
	  _startElement(cur->name, props);
	  if (prop) g_free(prop);
	}
      _scannode(dok, cur->children, c + 1);
      if (strcmp("text", (char*) cur->name) != 0)
	_endElement(cur->name);
      cur = cur->next;
    }
}
#endif /* HAVE_LIBXML2 */