/* * AbiSource Program Utilities * Copyright (C) 2001 Dom Lachowicz * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 02111-1307, USA. */ // zipios++ stuff for handling ZIP files #include "zipios++/zipios-config.h" #include "zipios++/zipfile.h" #include "zipios++/meta-iostreams.h" #include "zipios++/fcollexceptions.h" #include "zipios++/zipinputstream.h" #include "zipios++/zipoutputstream.h" // abiword stuff #include "ut_xml.h" #include "ut_string.h" #include "ut_string_class.h" #include "ut_bytebuf.h" #include "xap_Module.h" #include "ie_imp.h" #include "pd_Document.h" #include "xap_EncodingManager.h" #include "ut_assert.h" #include "ut_debugmsg.h" using namespace zipios ; /*****************************************************************************/ /*****************************************************************************/ /*! * Class used to import OpenWriter documents */ class ABI_EXPORT IE_Imp_OpenWriter : public IE_Imp { public: IE_Imp_OpenWriter (PD_Document * pDocument); virtual ~IE_Imp_OpenWriter (); virtual UT_Error importFile(const char * szFilename); PD_Document * getDocument (); private: static UT_Error readStreamIntoByteBuf ( ZipFile & oo, const char * stream, UT_ByteBuf & bytebuf ); UT_Error _handleMetaStream ( UT_ByteBuf &stm ); UT_Error _handleSettingsStream ( UT_ByteBuf &stm ); UT_Error _handleStylesStream ( UT_ByteBuf &stm ); UT_Error _handleContentStream ( UT_ByteBuf &stm ); }; /*****************************************************************************/ /*****************************************************************************/ class ABI_EXPORT IE_Imp_OpenWriter_Sniffer : public IE_ImpSniffer { public: /*! * Recognize the well-known suffixes, if any */ virtual UT_Confidence_t recognizeSuffix (const char * szSuffix) { if (!UT_stricmp(szSuffix, ".sxw")) return UT_CONFIDENCE_PERFECT; return UT_CONFIDENCE_ZILCH; } /*! * Recognize the contents as best we can */ virtual UT_Confidence_t recognizeContents (const char * szBuf, UT_uint32 iNumbytes) { // BOGUS - will identify any zip file, also contains null characters static const char * magic = "PK"; UT_uint32 magic_len = strlen ( magic ); if (iNumbytes < magic_len) return UT_CONFIDENCE_ZILCH; if ( !strncmp (szBuf, magic, magic_len) ) return UT_CONFIDENCE_SOSO; return UT_CONFIDENCE_ZILCH; } /*! * Construct an importer for ourselves */ virtual UT_Error constructImporter (PD_Document * pDocument, IE_Imp ** ppie) { IE_Imp_OpenWriter * p = new IE_Imp_OpenWriter(pDocument); *ppie = p; return UT_OK; } /*! * Get the dialog labels */ bool getDlgLabels (const char ** szDesc, const char ** szSuffixList, IEFileType * ft) { *szDesc = "OpenWriter Documents (.sxw)"; *szSuffixList = "*.sxw"; *ft = getFileType(); return true; } }; /*****************************************************************************/ /*****************************************************************************/ /*! * Create a new OpenWriter importer object */ IE_Imp_OpenWriter::IE_Imp_OpenWriter (PD_Document * pDocument) : IE_Imp ( pDocument ) { } /*! * Destroy an OpenWriter importer object */ IE_Imp_OpenWriter::~IE_Imp_OpenWriter () { } /*! * Import the given file */ UT_Error IE_Imp_OpenWriter::importFile(const char * szFilename) { try { ZipFile oo ( szFilename ); UT_ByteBuf stm; UT_Error error = UT_OK; if ( readStreamIntoByteBuf ( oo, "meta.xml", stm ) == UT_OK ) error = _handleMetaStream ( stm ); if ( error != UT_OK ) return error; if ( readStreamIntoByteBuf ( oo, "settings.xml", stm ) == UT_OK ) error = _handleSettingsStream ( stm ); if ( error != UT_OK ) return error; if ( readStreamIntoByteBuf ( oo, "styles.xml", stm ) == UT_OK ) error = _handleStylesStream ( stm ); if ( error != UT_OK ) return error; if ( readStreamIntoByteBuf ( oo, "content.xml", stm ) == UT_OK ) error = _handleContentStream ( stm ); if ( error != UT_OK ) return error; return UT_OK; } catch (...) { return UT_ERROR; } } /*! * */ PD_Document * IE_Imp_OpenWriter::getDocument () { return getDoc (); } /*! * Static utility method to read a file/stream embedded inside of the * zipfile into the byte-buffer */ UT_Error IE_Imp_OpenWriter::readStreamIntoByteBuf ( ZipFile & oo, const char * stream, UT_ByteBuf & bytebuf ) { bytebuf.truncate (0); try { istream * pstream = oo.getInputStream (stream); if (!pstream) return false; char buf[4096]; UT_sint32 nread = 0; while ((nread = pstream->rdbuf()->sgetn(buf, sizeof(buf))) > 0) { bytebuf.append ( (const UT_Byte *)buf, nread ); } delete pstream; return UT_OK; } catch (...) { return UT_ERROR; } } /*****************************************************************************/ /*****************************************************************************/ /*! * Class whose responsibility is to turn UTF-8 strings into UCS-2 ones * This class should probably be moved into some more general * place inside of the AbiWord tree, because it's really quite useful * and duplicate code is used in a lot of importers/exporters and in other * places. Note: does not depend on a working iconv implementation */ class ABI_EXPORT UTF8_To_UCS2_Manager { private: XML_Char m_charDataSeen[4]; UT_uint32 m_lenCharDataSeen; UT_uint32 m_lenCharDataExpected; bool m_bSeenCR; public: UTF8_To_UCS2_Manager () : m_lenCharDataSeen(0), m_lenCharDataExpected(0), m_bSeenCR(false) { } ~UTF8_To_UCS2_Manager () { } /*! * Convert UCS2 to UTF8 */ UT_Error toutf8 (const UT_UCS2String &in, UT_String &out) { return toutf8 ((const UT_UCSChar *)in.ucs_str(), in.size(), out); } /*! * Convert UCS2 to UTF8 */ UT_Error toutf8 (const UT_UCSChar * data, UT_uint32 length, UT_String &out) { const UT_UCSChar * pData = 0; UT_ASSERT(sizeof(UT_Byte) == sizeof(char)); for (pData=data; (pData 0x007f) { if(XAP_EncodingManager::get_instance()->isUnicodeLocale() || (XAP_EncodingManager::get_instance()->try_nativeToU(0xa1) == 0xa1)) { XML_Char * pszUTF8 = UT_encodeUTF8char(*pData++); while (*pszUTF8) { out += (char)*pszUTF8; pszUTF8++; } } else { /* Try to convert to native encoding and if character fits into byte, output raw byte. This is somewhat essential for single-byte non-latin languages like russian or polish - since tools like grep and sed can be used then for these files without any problem. Networks and mail transfers are 8bit clean these days. - VH */ UT_UCSChar c = XAP_EncodingManager::get_instance()->try_UToNative(*pData); if (c==0 || c>255) { char localBuf[20]; char * plocal = localBuf; sprintf(localBuf,"&#x%x;",*pData++); out += plocal; } else { out += (char)c; pData++; } } } else { out += (char)*pData++; } break; } return UT_OK; } /*! * Convert UTF8 to UCS2 */ UT_Error toucs2 (const UT_String & in, UT_UCS2String & out) { return toucs2 ((const XML_Char *)in.c_str(), in.size(), out); } /*! * Convert UTF8 to UCS2 */ UT_Error toucs2 (const XML_Char *s, int len, UT_UCS2String &buf) { UT_ASSERT(sizeof(XML_Char) == sizeof(UT_Byte)); UT_ASSERT(sizeof(XML_Char) != sizeof(UT_UCSChar)); // parse UTF-8 text and convert to Unicode. // also take care of some white-space issues: // [] convert CRLF to SP. // [] convert CR to SP. // [] convert LF to SP. // ignored words processing doesn't care about the // white-space stuff, but it does no harm UT_Byte * ss = (UT_Byte *)s; UT_Byte currentChar; for (int k=0; k 0)) { // is it us-ascii and we are in a UTF-8 // multi-byte sequence. puke. UT_ASSERT(UT_SHOULD_NOT_HAPPEN); return UT_ERROR; } if (currentChar == UCS_CR) { buf += UCS_LF; m_bSeenCR = true; continue; } if (currentChar == UCS_LF) // LF { buf += UCS_LF; m_bSeenCR = false; continue; } m_bSeenCR = false; if (currentChar < 0x80) // plain us-ascii part of latin-1 { buf += ss[k]; // copy as is. } else if ((currentChar & 0xf0) == 0xf0) // lead byte in 4-byte surrogate pair { // surrogate pairs are defined in section 3.7 of the // unicode standard version 2.0 as an extension // mechanism for rare characters in future extensions // of the unicode standard. UT_ASSERT(m_lenCharDataSeen == 0); UT_ASSERT(UT_NOT_IMPLEMENTED); return UT_ERROR; } else if ((currentChar & 0xe0) == 0xe0) // lead byte in 3-byte sequence { UT_ASSERT(m_lenCharDataSeen == 0); m_lenCharDataExpected = 3; m_charDataSeen[m_lenCharDataSeen++] = currentChar; } else if ((currentChar & 0xc0) == 0xc0) // lead byte in 2-byte sequence { UT_ASSERT(m_lenCharDataSeen == 0); m_lenCharDataExpected = 2; m_charDataSeen[m_lenCharDataSeen++] = currentChar; } else if ((currentChar & 0x80) == 0x80) // trailing byte in multi-byte sequence { UT_ASSERT(m_lenCharDataSeen > 0); m_charDataSeen[m_lenCharDataSeen++] = currentChar; if (m_lenCharDataSeen == m_lenCharDataExpected) { buf += UT_decodeUTF8char(m_charDataSeen,m_lenCharDataSeen); m_lenCharDataSeen = 0; } } } return UT_OK; } }; /*****************************************************************************/ /*****************************************************************************/ /*! * Baseclass for all OpenWriter listeners, basically a shim class * to expose a GetDocument() and a GetImporter() method */ class ABI_EXPORT OpenWriter_Stream_Listener : public virtual UT_XML::Listener { private: IE_Imp_OpenWriter * m_pImporter; protected: inline IE_Imp_OpenWriter * getImporter () { return m_pImporter; } inline PD_Document * getDocument() { return m_pImporter->getDocument(); } public: OpenWriter_Stream_Listener ( IE_Imp_OpenWriter * importer ) : m_pImporter ( importer ) { } virtual ~OpenWriter_Stream_Listener () { } }; /*****************************************************************************/ /*****************************************************************************/ /*! * Class to handle meta-streams */ class ABI_EXPORT OpenWriter_MetaStream_Listener : public OpenWriter_Stream_Listener { public: OpenWriter_MetaStream_Listener ( IE_Imp_OpenWriter * importer ) : OpenWriter_Stream_Listener ( importer ) { } virtual ~OpenWriter_MetaStream_Listener () { } virtual void startElement (const XML_Char * name, const XML_Char ** atts) { } virtual void endElement (const XML_Char * name) { } virtual void charData (const XML_Char * buffer, int length) { } private: }; /*! * Handle the meta-stream */ UT_Error IE_Imp_OpenWriter::_handleMetaStream ( UT_ByteBuf & stm ) { UT_XML reader; OpenWriter_MetaStream_Listener listener ( this ); reader.setListener ( &listener ); return reader.parse ( &stm ); } /*****************************************************************************/ /*****************************************************************************/ /*! * Class to handle the settings stream */ class ABI_EXPORT OpenWriter_SettingsStream_Listener : public OpenWriter_Stream_Listener { public: OpenWriter_SettingsStream_Listener ( IE_Imp_OpenWriter * importer ) : OpenWriter_Stream_Listener ( importer ) { } virtual ~OpenWriter_SettingsStream_Listener () { } virtual void startElement (const XML_Char * name, const XML_Char ** atts) { } virtual void endElement (const XML_Char * name) { } virtual void charData (const XML_Char * buffer, int length) { } private: }; /*! * Handle the setting-stream */ UT_Error IE_Imp_OpenWriter::_handleSettingsStream ( UT_ByteBuf & stm ) { UT_XML reader; OpenWriter_SettingsStream_Listener listener ( this ); reader.setListener ( &listener ); return reader.parse ( &stm ); } /*****************************************************************************/ /*****************************************************************************/ /*! * Class to handle the styles stream */ class ABI_EXPORT OpenWriter_StylesStream_Listener : public OpenWriter_Stream_Listener { public: OpenWriter_StylesStream_Listener ( IE_Imp_OpenWriter * importer ) : OpenWriter_Stream_Listener ( importer ) { } virtual ~OpenWriter_StylesStream_Listener () { } virtual void startElement (const XML_Char * name, const XML_Char ** atts) { } virtual void endElement (const XML_Char * name) { } virtual void charData (const XML_Char * buffer, int length) { } private: }; /*! * Handle the styles-stream */ UT_Error IE_Imp_OpenWriter::_handleStylesStream ( UT_ByteBuf & stm ) { UT_XML reader; OpenWriter_StylesStream_Listener listener ( this ); reader.setListener ( &listener ); return reader.parse ( &stm ); } /*****************************************************************************/ /*****************************************************************************/ /*! * Class to handle the content stream */ class ABI_EXPORT OpenWriter_ContentStream_Listener : public OpenWriter_Stream_Listener { private: UTF8_To_UCS2_Manager m_manager; bool m_bInParagraph; public: OpenWriter_ContentStream_Listener ( IE_Imp_OpenWriter * importer ) : OpenWriter_Stream_Listener ( importer ), m_bInParagraph(false) { } virtual ~OpenWriter_ContentStream_Listener () { } virtual void startElement (const XML_Char * name, const XML_Char ** atts) { if ( !UT_strcmp(name, "text:p" ) ) { getDocument()->appendStrux(PTX_Block, NULL); m_bInParagraph = true; } } virtual void endElement (const XML_Char * name) { if ( !UT_strcmp(name, "text:p" ) ) { m_bInParagraph = false; } } virtual void charData (const XML_Char * buffer, int length) { UT_UCS2String ucs2; if ( m_bInParagraph && UT_OK == m_manager.toucs2 ( buffer, length, ucs2 ) ) { if ( ucs2.size () > 0 ) { UT_DEBUGMSG(("DOM: appending %d chars\n", ucs2.size())); getDocument()->appendSpan ( ucs2.ucs_str(), ucs2.size () ); } } } }; /*! * Handle the content-stream */ UT_Error IE_Imp_OpenWriter::_handleContentStream ( UT_ByteBuf & stm ) { UT_XML reader; OpenWriter_ContentStream_Listener listener ( this ); reader.setListener ( &listener ); // quick hack to append us a section without actually parsing the xml document and recognizing its tags getDocument()->appendStrux(PTX_Section, NULL); return reader.parse ( &stm ); } /****************************************************************************/ /****************************************************************************/ #if 0 #include "ie_exp.h" // start of an OpenWriter Export Plugin class ABI_EXPORT IE_Exp_OpenWriter : public IE_Exp { private: UTF8_To_UCS2_Manager m_manager; ZipOutputStream * m_zos; public: IE_Exp_OpenWriter (PD_Document * pDocument) : IE_Exp (pDocument), m_zos(0) { } virtual ~IE_Exp_OpenWriter () { _closeFile (); } protected: /*! * Writes the following UCS2 string to the open stream */ UT_sint32 _writeUCS2 (const UT_UCSChar * ucs2) { if(!ucs2) return 0; return _writeUCS2 (ucs2, UT_UCS_strlen(ucs2)); } /*! * Writes the following UCS2 string to the open stream */ UT_sint32 _writeUCS2 (const UT_UCSChar * ucs2, UT_uint32 len) { if(!ucs2 || !len) return 0; UT_String utf8; if (UT_OK == m_manager.toutf8 (ucs2, len, utf8)) { m_zos->write (utf8.c_str(), utf8.size()); } return len; } /*! * Writes the ASCII string to the open stream */ UT_sint32 _write (const char * str) { if(!str) return 0; return _write (str, strlen(str)); } /*! * Writes the ASCII string to the open string */ UT_uint32 _write (const char * str, UT_uint32 len) { if(!str || !len) return 0; m_zos->write(str, len); return len; } /*! * Create a substream inside of the current zip file */ UT_Error _createSubStream ( const char * szFilename ) { try { // the next thing that we write to will be this file m_zos->putNextEntry( ZipCDirEntry( szFilename ) ) ; return UT_OK; } catch (...) { return UT_ERROR; } } /*! * Open the zip file */ virtual bool _openFile(const char * szFilename) { try { m_zos = new ZipOutputStream ( szFilename ); return true; } catch (...) { return false; } } /*! * Close the zip file */ virtual bool _closeFile () { DELETEP(m_zos); return true; } /*! * Actually create a listener capable of handling the document's * callbacks and write out the document to disk */ virtual UT_Error _writeDocument(void) { OpenWriter_OutputDevice * pListener = new OpenWriter_OutputDevice(getDoc(),this); if (!pListener) return UT_IE_NOMEMORY; if (!getDoc()->tellListener(static_cast(pListener))) return UT_ERROR; DELETEP(pListener); return UT_OK; } }; /*! * Export sniffer */ class ABI_EXPORT IE_Exp_OpenWriter_Sniffer : public IE_ExpSniffer { /*! * Recognize this suffix */ bool recognizeSuffix(const char * szSuffix) { return (!UT_stricmp(szSuffix,".sxw")); } /*! * Construct an importer for us */ UT_Error constructExporter(PD_Document * pDocument, IE_Exp ** ppie) { IE_Exp_OpenWriter * p = new IE_Exp_OpenWriter(pDocument); *ppie = p; return UT_OK; } /*! * Get the dialog labels */ bool getDlgLabels(const char ** pszDesc, const char ** pszSuffixList, IEFileType * ft) { *pszDesc = "OpenWriter (.sxw)"; *pszSuffixList = "*.sxw"; *ft = getFileType(); return true; } }; #endif /****************************************************************************/ /****************************************************************************/ // completely generic C-interface code to allow this to be a plugin ABI_PLUGIN_DECLARE("OpenWriter") // we use a reference-counted sniffer static IE_Imp_OpenWriter_Sniffer * m_sniffer = 0; ABI_FAR_CALL int abi_plugin_register (XAP_ModuleInfo * mi) { if (!m_sniffer) { m_sniffer = new IE_Imp_OpenWriter_Sniffer (); } else { m_sniffer->ref(); } UT_ASSERT (m_sniffer); mi->name = "OpenWriter Importer"; mi->desc = "Import Sun's OpenWriter documents"; mi->version = ABI_VERSION_STRING; mi->author = "Dom Lachowicz "; mi->usage = "No Usage"; IE_Imp::registerImporter (m_sniffer); return 1; } ABI_FAR_CALL int abi_plugin_unregister (XAP_ModuleInfo * mi) { mi->name = 0; mi->desc = 0; mi->version = 0; mi->author = 0; mi->usage = 0; UT_ASSERT (m_sniffer); IE_Imp::unregisterImporter (m_sniffer); if (!m_sniffer->unref()) { m_sniffer = 0; } return 1; } ABI_FAR_CALL int abi_plugin_supports_version (UT_uint32 major, UT_uint32 minor, UT_uint32 release) { return 1; } /****************************************************************************/ /****************************************************************************/