/* AbiWord: ie_imp_HTML - plugin for non-XHTML HTML * Copyright (C) 2002 Francis James Franklin <fjf@alinameridon.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 02111-1307, USA. */ #include <stdio.h> #include <string.h> #include <ctype.h> // AbiWord includes #include "ut_misc.h" #include "ut_bytebuf.h" #include "ut_growbuf.h" #include "ut_assert.h" #include "ut_debugmsg.h" #include "pd_Document.h" #include "pd_Style.h" // Local includes #include "ie_imp_HTML.h" #include "HTML_Parser.h" #include "HTML_colors.h" #include "HTML_tags.h" #include "HTML_utils.h" #include "HTML_TokenStack.h" #include "HTML_TextState.h" /*****************************************************************/ /*****************************************************************/ #include "xap_Module.h" ABI_PLUGIN_DECLARE("HTML") // we use a reference-counted sniffer static IE_Imp_HTML_Sniffer * m_sniffer = 0; ABI_FAR_CALL int abi_plugin_register (XAP_ModuleInfo * mi) { if (!m_sniffer) { m_sniffer = new IE_Imp_HTML_Sniffer (); } else { m_sniffer->ref (); } mi->name = "HTML Importer"; mi->desc = "Import HTML Documents (Experimental)"; mi->version = ABI_PLUGIN_VERSION; mi->author = "Abi, the Ant, & fjf, her friend."; mi->usage = "See back of Prayer Wheel for instructions."; IE_Imp::registerImporter (m_sniffer); return 1; } ABI_FAR_CALL int abi_plugin_unregister (XAP_ModuleInfo * mi) { mi->name = 0; mi->desc = 0; mi->version = 0; mi->author = 0; mi->usage = 0; UT_ASSERT (m_sniffer); IE_Imp::unregisterImporter (m_sniffer); if (!m_sniffer->unref ()) { m_sniffer = 0; // Shouldn't this be deleted? } return 1; } ABI_FAR_CALL int abi_plugin_supports_version (UT_uint32 major, UT_uint32 minor, UT_uint32 release) { #if 0 /* Is there a correct way to do this? */ if ((major == 1) && (minor == 0) /* && (release >= 0) */) return 1; /* Strict check for development versions */ return ((( major == (UT_uint32) ABI_PLUGIN_MAJOR) && ( minor == (UT_uint32) ABI_PLUGIN_MINOR) && (release == (UT_uint32) ABI_PLUGIN_MICRO)) ? 1 : 0); #else return 1 ; #endif } /*****************************************************************/ /*****************************************************************/ UT_Confidence_t IE_Imp_HTML_Sniffer::recognizeContents (const char * szBuf, UT_uint32 iNumbytes) { #ifdef USE_HTML_PARSER_SNIFFER HTML_Parser parser; return ( parser.sniff (szBuf, iNumbytes) ? UT_CONFIDENCE_PERFECT : UT_CONFIDENCE_ZILCH ); #else UT_uint32 i = 0; const char * ptr = szBuf; while (isspace ((int) (*ptr))) { ptr++; i++; if (i == iNumbytes) break; } if (i == iNumbytes) return UT_CONFIDENCE_ZILCH; if (*ptr != '<') return UT_CONFIDENCE_ZILCH; UT_uint8 found_html = UT_CONFIDENCE_ZILCH; ptr++; i++; while (i < iNumbytes - 4) { if (UT_strnicmp (ptr, "html", 4) == 0) { found_html = UT_CONFIDENCE_PERFECT; break; } ptr++; i++; } return found_html; #endif } UT_Confidence_t IE_Imp_HTML_Sniffer::recognizeSuffix (const char * szSuffix) { if (!(UT_stricmp (szSuffix, ".html")) || !(UT_stricmp (szSuffix, ".htm"))) return UT_CONFIDENCE_PERFECT; return UT_CONFIDENCE_ZILCH; } UT_Error IE_Imp_HTML_Sniffer::constructImporter (PD_Document * pDocument, IE_Imp ** ppie) { IE_Imp_HTML * p = new IE_Imp_HTML (pDocument); *ppie = p; return UT_OK; } bool IE_Imp_HTML_Sniffer::getDlgLabels (const char ** pszDesc, const char ** pszSuffixList, IEFileType * ft) { *pszDesc = "HTML (.html, .htm)"; *pszSuffixList = "*.html; *.htm"; *ft = getFileType (); return true; } /*****************************************************************/ /*****************************************************************/ IE_Imp_HTML::IE_Imp_HTML (PD_Document * pDocument) : IE_Imp(pDocument), m_hasContent(false), m_stripWhitespace(true), m_stripFirstspace(true), m_inSection(false), m_inBlock(false), m_bookmarkOpen(false), m_hyperlinkOpen(false), m_bookmark(0), m_hyperlink(0), m_DocumentPlace(dp_PreDocument), m_CharDataMode(cdm_Ignore), m_pHTML(0), m_pBB(0), m_error(UT_OK) { // } IE_Imp_HTML::~IE_Imp_HTML () { FREEP(m_bookmark); FREEP(m_hyperlink); DELETEP (m_pHTML); DELETEP (m_pBB); } /*****************************************************************/ /*****************************************************************/ UT_Error IE_Imp_HTML::importFile (const char * szFilename) { if (!m_TokenStack.reset ()) return UT_OUTOFMEM; if (!m_TextState.reset ()) return UT_OUTOFMEM; m_hasContent = false; m_inSection = false; m_inBlock = false; m_pHTML = new HTML_Parser; m_pHTML->setListener (this); m_DocumentPlace = dp_PreDocument; m_CharDataMode = cdm_Ignore; m_error = UT_OK; if (!m_pHTML->parse (szFilename)) m_error = ((m_error == UT_OK) ? UT_IE_BOGUSDOCUMENT : m_error); DELETEP (m_pHTML); if ((m_error == UT_OK) && !m_hasContent) // If no errors & document has no content, add an empty block. if (!_Block ()) m_error = UT_ERROR; return m_error; } /*****************************************************************/ /*****************************************************************/ bool IE_Imp_HTML::_Section (const char * type /* = 0 */) { const char * attribute[4]; const char ** atts = 0; if (type) { attribute[0] = "type"; attribute[1] = type; attribute[2] = 0; attribute[3] = 0; atts = (const char **) attribute; } fprintf (stderr, "Adding Section...\n"); if (getDoc()->appendStrux (PTX_Section, atts)) { m_inSection = true; m_inBlock = false; return true; } return false; } /* Just a simple block. Lists require many more args */ bool IE_Imp_HTML::_Block (const char * style /* = 0 */, const char * props /* = 0 */) { if (!m_inSection) if (!_Section ()) return false; if (m_inBlock) // close any open links before starting new block { if (m_bookmarkOpen) bookmark (false); if (m_hyperlinkOpen) hyperlink (false); } if (style == 0) style = "Normal"; if (props == 0) props = "text-align:left"; const char * attribute[6]; attribute[0] = "style"; attribute[1] = style; attribute[2] = "props"; attribute[3] = props; attribute[4] = 0; attribute[5] = 0; const char ** atts = (const char **) attribute; fprintf (stderr, "Adding Block...\n"); if (getDoc()->appendStrux (PTX_Block, atts)) { // If no content is added, a blank document will be returned from the importer m_hasContent = true; m_inBlock = true; if (m_bookmark) bookmark (true); // open link if pending if (m_hyperlink) hyperlink (true); // open link if pending return true; } return false; } /*****************************************************************/ /*****************************************************************/ void IE_Imp_HTML::bookmark (bool start) { if (m_bookmark == 0) return; if (!m_inBlock) // There's an odd case where bookmark/hyperlink exists outside of any block structure... { if (!start) { free (m_bookmark); m_bookmark = 0; m_bookmarkOpen = start; } return; } if (m_bookmarkOpen == start) return; const char * attribute[6]; if (start) { attribute[0] = "type"; attribute[1] = "start"; } else { attribute[0] = "type"; attribute[1] = "end"; } attribute[2] = "name"; attribute[3] = m_bookmark; attribute[4] = 0; attribute[5] = 0; const char ** atts = (const char **) attribute; getDoc()->appendObject (PTO_Bookmark, atts); if (!start) { free (m_bookmark); m_bookmark = 0; } m_bookmarkOpen = start; } void IE_Imp_HTML::hyperlink (bool start) { if (m_hyperlink == 0) return; if (!m_inBlock) // There's an odd case where bookmark/hyperlink exists outside of any block structure... { if (!start) { free (m_hyperlink); m_hyperlink = 0; m_hyperlinkOpen = start; } return; } if (m_hyperlinkOpen == start) return; const char * attribute[4]; attribute[0] = "xlink:href"; attribute[1] = m_hyperlink; attribute[2] = 0; attribute[3] = 0; const char ** atts = (const char **) attribute; if (start) getDoc()->appendObject (PTO_Hyperlink, atts); else getDoc()->appendObject (PTO_Hyperlink, 0); if (!start) { free (m_hyperlink); m_hyperlink = 0; } m_hyperlinkOpen = start; } /*****************************************************************/ /*****************************************************************/ /* Call-back used by defineStyles */ void IE_Imp_HTML::addStyle (const char ** atts) { PD_Style * pStyle = 0; if (getDoc()->getStyle (atts[7], &pStyle)) { if (pStyle->addAttributes (atts)) { pStyle->getBasedOn (); pStyle->getFollowedBy (); } } else getDoc()->appendStyle (atts); } /*****************************************************************/ /*****************************************************************/ void IE_Imp_HTML::charData (const char * buffer, int length) { for (int i = 0; i < length; i++) fputc ((int) buffer[i], stderr); // DEBUG if (m_CharDataMode == cdm_Ignore) return; if (m_pBB == 0) m_pBB = new UT_ByteBuf; m_pBB->append ((UT_Byte *) buffer, (UT_uint32) length); // How safe is this ?? Assumes sizeof(xmlChar)==sizeof(UT_Byte) } void IE_Imp_HTML::flushData () { if (m_pBB == 0) return; if (m_pBB->getLength ()) stripData (); if (m_pBB->getLength ()) { /* Set text properties */ char * style = m_TextState.getStyle (); char * props = m_TextState.getProps (); if (!m_inBlock) if (!_Block (style)) { m_error = UT_ERROR; return; } if (style && props) { const char * attribute[6]; attribute[0] = "style"; attribute[1] = style; if (*props) { attribute[2] = "props"; attribute[3] = props; } else { attribute[2] = 0; attribute[3] = 0; } attribute[4] = 0; attribute[5] = 0; const char ** atts = (const char **) attribute; fprintf (stderr, "Adding Format... style=\"%s\" props=\"%s\"\n", style, props); getDoc()->appendFmt (atts); } fprintf (stderr, "Added Format\n"); /* Add accumulated char. data to document */ UT_GrowBuf * pGB = new UT_GrowBuf; UT_decodeUTF8string ((const char *) m_pBB->getPointer (0), m_pBB->getLength (), pGB); if (pGB->getLength ()) { fprintf (stderr, "Adding Span...\n"); getDoc()->appendSpan ((UT_UCSChar *) pGB->getPointer (0), pGB->getLength ()); } /* Clear buffer */ m_pBB->truncate (0); } } void IE_Imp_HTML::stripData () { UT_uint32 len = m_pBB->getLength (); UT_uint32 pos = 0; const UT_Byte * buf = m_pBB->getPointer (pos); bool deleteSpace = ((m_stripWhitespace && m_stripFirstspace) ? true : false); /* m_stripFirstspace is set by TT_H? and TT_P; "Firstspace" is the 1st space after the <h?> or <p> */ m_stripFirstspace = false; while (pos < len) { buf = m_pBB->getPointer (pos); if (buf[0] == UCS_CR) // always convert CRLF & CR to LF. { if ((pos + 1) < len) if (buf[1] == UCS_LF) { m_pBB->del (pos, 1); len = m_pBB->getLength (); continue; } UT_Byte lf = UCS_LF; m_pBB->overwrite (pos, &lf, 1); continue; } if (m_stripWhitespace && UT_UCS_isspace (buf[0])) // if stripping, convert all whitespace to space. if (buf[0] != UCS_SPACE) { UT_Byte sp = UCS_SPACE; m_pBB->overwrite (pos, &sp, 1); continue; } if (deleteSpace && (buf[0] == UCS_SPACE)) { m_pBB->del (pos, 1); len = m_pBB->getLength (); continue; } deleteSpace = false; if (m_stripWhitespace && (buf[0] == UCS_SPACE)) { deleteSpace = true; // if stripping, remove all subsequent whitespace pos++; continue; } if (buf[0] < 0x80) // plain us-ascii part of latin-1 { pos++; continue; } if ((buf[0] & 0xf0) == 0xf0) // lead byte in 4-byte surrogate pair { // surrogate pairs are defined in section 3.7 of the unicode standard version 2.0 as an extension // mechanism for rare characters in future extensions of the unicode standard. m_pBB->del (pos, ((len - pos) >= 4) ? 4 : (len - pos)); len = m_pBB->getLength (); UT_ASSERT (UT_NOT_IMPLEMENTED); continue; } if ((buf[0] & 0xe0) == 0xe0) // lead byte in 3-byte sequence { if ((len - pos) >= 3) { pos += 3; } else { m_pBB->del (pos, len - pos); len = m_pBB->getLength (); } continue; } if ((buf[0] & 0xc0) == 0xc0) // lead byte in 2-byte sequence { if ((len - pos) >= 2) { pos += 2; } else { m_pBB->del (pos, len - pos); len = m_pBB->getLength (); } continue; } /* Hopefully, shouldn't get here... delete & pray! */ m_pBB->del (pos, 1); len = m_pBB->getLength (); } } void IE_Imp_HTML::startElement (const char * name, const char ** atts) { fprintf (stderr, "<%s>", name); // DEBUG int token = s_mapNameToToken (name); switch (m_DocumentPlace) // libxml2 does all the hard work; the checks here are redundant, probably { case dp_PreDocument: if (token == TT_HTML) m_DocumentPlace = dp_PreHead; else m_error = UT_ERROR; break; case dp_PreHead: if (token == TT_HEAD) m_DocumentPlace = dp_Head; else if (token == TT_BODY) m_DocumentPlace = dp_Body; else m_error = UT_ERROR; break; case dp_Head: break; case dp_PreBody: if (token == TT_BODY) m_DocumentPlace = dp_Body; else if (token == TT_HEAD) m_DocumentPlace = dp_Head; // Allow multiple <head>[]</head> - unfortunately possible! else m_error = UT_ERROR; break; case dp_Body: break; case dp_PostBody: m_error = UT_ERROR; break; case dp_PostDocument: m_error = UT_ERROR; break; } if (m_error != UT_OK) { m_pHTML->stop (); return; } if ((m_CharDataMode == cdm_Body)) if (!m_TextState.push ()) { m_error = UT_OUTOFMEM; m_pHTML->stop (); return; } _Element (token, name, atts, true); if (m_error == UT_OK) if (!m_TokenStack.push (token)) { m_error = UT_OUTOFMEM; m_pHTML->stop (); } } void IE_Imp_HTML::endElement (const char * name) { fprintf (stderr, "</%s>", name); // DEBUG int token = s_mapNameToToken (name); if (m_TokenStack.pop () != token) m_error = UT_ERROR; // the pop is the important part if (m_error != UT_OK) { m_pHTML->stop (); return; } switch (m_DocumentPlace) // libxml2 does all the hard work; the checks here are redundant, probably { case dp_PreDocument: m_error = UT_ERROR; break; case dp_PreHead: m_error = UT_ERROR; break; case dp_Head: if (token == TT_HEAD) m_DocumentPlace = dp_PreBody; break; case dp_PreBody: m_error = UT_ERROR; break; case dp_Body: if (token == TT_BODY) m_DocumentPlace = dp_PostBody; break; case dp_PostBody: if (token == TT_HTML) m_DocumentPlace = dp_PostDocument; else m_error = UT_ERROR; break; case dp_PostDocument: m_error = UT_ERROR; break; } if (m_error != UT_OK) { m_pHTML->stop (); return; } _Element (token, name, 0, false); if ((m_CharDataMode == cdm_Body) && (m_error == UT_OK)) if (!m_TextState.pop ()) { m_error = UT_ERROR; m_pHTML->stop (); } } void IE_Imp_HTML::_Element (int token, const char * name, const char ** atts, bool start) { if (m_CharDataMode == cdm_Body) flushData (); switch (token) { case TT_OTHER: // 0 // ? Tag not recognized (not an error, though) break; case TT_HTML: // 1 // <html> Document main/first tag break; case TT_HEAD: // 2 // <head> (child of <html>; 1 instance; precedes <body>) break; case TT_BODY: // 3 // <body> (child of <html>; 1 instance; follows <head>) if (start) m_TextState.defineStyles (this); break; case TT_TITLE: // 4 // <title> (child of <head>) if (!m_TokenStack.child_of_head ()) break; break; case TT_STYLE: // 5 // <style> (child of <head>) if (!m_TokenStack.child_of_head ()) break; break; case TT_DIV: // 6 // <div> [B] Used with style sheets to produce layers, boxes, etc. // AbiWord uses these in its XHTML output to denote sections // but this seems an uncommon use. break; case TT_SPAN: // 7 // <span> Like <div>, this is used in conjuction with style sheets break; case TT_LAYER: // 8 // <layer> Layer break; case TT_LINK: // 9 // <link> (child of <head>; no children; no end tag) if (!m_TokenStack.child_of_head ()) break; break; case TT_BASE: // 10 // <base> (child of <head>; no children; no end tag) if (!m_TokenStack.child_of_head ()) break; break; case TT_META: // 11 // <meta> (child of <head>; no children; no end tag) if (!m_TokenStack.child_of_head ()) break; break; case TT_A: // 21 // <a> Anchor or Hyperlink (can't nest links) if (m_TokenStack.descendant_of (TT_A)) break; if (start) { const char * name = s_attrLookup (atts, "id"); // "id" is preferred by XHTML 1.1 standard if (name == 0) name = s_attrLookup (atts, "name"); if (name) // bookmark { m_bookmarkOpen = false; m_bookmark = UT_strdup (name); bookmark (start); break; } const char * link = s_attrLookup (atts, "xlink:href"); if (link == 0) link = s_attrLookup (atts, "href"); if (link) { if (link[0] == '#') // in-doc reference to bookmark if (link[1]) { m_hyperlinkOpen = false; m_hyperlink = UT_strdup (link); hyperlink (start); break; } // else: external reference } } else { if (m_bookmarkOpen) { bookmark (start); // start == false m_bookmarkOpen = false; } if (m_hyperlinkOpen) { hyperlink (start); // start == false m_hyperlinkOpen = false; } } break; case TT_H1: // 22 // <h1> Headings if (!m_TokenStack.child_of_body ()) break; if (start) { if (m_TokenStack.descendant_of (TT_PRE)) m_TextState.reset (TextState::bs_FixedHeading1, false); // don't truncate TextState else m_TextState.reset (TextState::bs_Heading1); m_stripFirstspace = true; m_inBlock = false; m_CharDataMode = cdm_Body; } else { if (m_bookmarkOpen) bookmark (start); // start == false if (m_hyperlinkOpen) hyperlink (start); // start == false m_inBlock = false; if (m_TokenStack.descendant_of (TT_PRE)) m_TextState.reset (TextState::bs_FixedNormal, false); // this resets more than necessary... else m_CharDataMode = cdm_Ignore; } break; case TT_H2: // 23 // <h2> if (!m_TokenStack.child_of_body ()) break; if (start) { if (m_TokenStack.descendant_of (TT_PRE)) m_TextState.reset (TextState::bs_FixedHeading2, false); // don't truncate TextState else m_TextState.reset (TextState::bs_Heading2); m_stripFirstspace = true; m_inBlock = false; m_CharDataMode = cdm_Body; } else { if (m_bookmarkOpen) bookmark (start); // start == false if (m_hyperlinkOpen) hyperlink (start); // start == false m_inBlock = false; if (m_TokenStack.descendant_of (TT_PRE)) m_TextState.reset (TextState::bs_FixedNormal, false); // this resets more than necessary... else m_CharDataMode = cdm_Ignore; } break; case TT_H3: // 24 // <h3> if (!m_TokenStack.child_of_body ()) break; if (start) { if (m_TokenStack.descendant_of (TT_PRE)) m_TextState.reset (TextState::bs_FixedHeading3, false); // don't truncate TextState else m_TextState.reset (TextState::bs_Heading3); m_stripFirstspace = true; m_inBlock = false; m_CharDataMode = cdm_Body; } else { if (m_bookmarkOpen) bookmark (start); // start == false if (m_hyperlinkOpen) hyperlink (start); // start == false m_inBlock = false; if (m_TokenStack.descendant_of (TT_PRE)) m_TextState.reset (TextState::bs_FixedNormal, false); // this resets more than necessary... else m_CharDataMode = cdm_Ignore; } break; case TT_H4: // 25 // <h4> Normal size but bold, others relative (usually) (?) if (!m_TokenStack.child_of_body ()) break; if (start) { if (m_TokenStack.descendant_of (TT_PRE)) m_TextState.reset (TextState::bs_FixedHeading4, false); // don't truncate TextState else m_TextState.reset (TextState::bs_Heading4); m_stripFirstspace = true; m_inBlock = false; m_CharDataMode = cdm_Body; } else { if (m_bookmarkOpen) bookmark (start); // start == false if (m_hyperlinkOpen) hyperlink (start); // start == false m_inBlock = false; if (m_TokenStack.descendant_of (TT_PRE)) m_TextState.reset (TextState::bs_FixedNormal, false); // this resets more than necessary... else m_CharDataMode = cdm_Ignore; } break; case TT_H5: // 26 // <h5> if (!m_TokenStack.child_of_body ()) break; if (start) { if (m_TokenStack.descendant_of (TT_PRE)) m_TextState.reset (TextState::bs_FixedHeading5, false); // don't truncate TextState else m_TextState.reset (TextState::bs_Heading5); m_stripFirstspace = true; m_inBlock = false; m_CharDataMode = cdm_Body; } else { if (m_bookmarkOpen) bookmark (start); // start == false if (m_hyperlinkOpen) hyperlink (start); // start == false m_inBlock = false; if (m_TokenStack.descendant_of (TT_PRE)) m_TextState.reset (TextState::bs_FixedNormal, false); // this resets more than necessary... else m_CharDataMode = cdm_Ignore; } break; case TT_H6: // 27 // <h6> if (!m_TokenStack.child_of_body ()) break; if (start) { if (m_TokenStack.descendant_of (TT_PRE)) m_TextState.reset (TextState::bs_FixedHeading6, false); // don't truncate TextState else m_TextState.reset (TextState::bs_Heading6); m_stripFirstspace = true; m_inBlock = false; m_CharDataMode = cdm_Body; } else { if (m_bookmarkOpen) bookmark (start); // start == false if (m_hyperlinkOpen) hyperlink (start); // start == false m_inBlock = false; if (m_TokenStack.descendant_of (TT_PRE)) m_TextState.reset (TextState::bs_FixedNormal, false); // this resets more than necessary... else m_CharDataMode = cdm_Ignore; } break; case TT_ADDRESS: // 29 // <address> Author's address { set italics } if (start) m_TextState.setItalic (); break; /* Structured Text: Phrase elements */ case TT_EM: // 40 // <em> Emphasis { toggle italics } if (start) m_TextState.toggleItalic (); break; case TT_STRONG: // 41 // <strong> Stronger emphasis { set bold } if (start) m_TextState.setBold (); break; case TT_DFN: // 42 // <dfn> Defining instance { ? } if (start) m_TextState.setUnderline (); break; case TT_CODE: // 43 // <code> Computer code { use fixed width font } if (start) m_TextState.setFontFamily ("Courier"); break; case TT_SAMP: // 44 // <samp> Sample output { use fixed width font } if (start) m_TextState.setFontFamily ("Courier"); break; case TT_KBD: // 45 // <kbd> Text to be entered by user { use fixed width font } if (start) m_TextState.setFontFamily ("Courier"); break; case TT_VAR: // 46 // <var> Variable/Argument { use fixed width font } if (start) m_TextState.setFontFamily ("Courier"); break; case TT_CITE: // 47 // <cite> Citation/Reference { no action } break; case TT_ABBR: // 48 // <abbr> Abbreviation { no action } break; case TT_ACRONYM: // 49 // <acronym> Acronym { no action } break; /* Structured Text: Quotations */ case TT_Q: // 50 // <q> Quote { quotation mark e.g. `, ', ", <<, - a job for smart quotes? } break; case TT_BLOCKQUOTE: // 51 // <blockquote> [B] Block quote { separate paragraph; indented } break; /* Structured Text: Subscripts and superscripts */ case TT_SUB: // 52 // <sub> Subscript if (m_TokenStack.descendant_of (TT_PRE)) break; // Is this possible? if (start) m_TextState.setScriptStyle (TextState::ss_Sub); break; case TT_SUP: // 53 // <sup> Superscript if (m_TokenStack.descendant_of (TT_PRE)) break; // Is this possible? if (start) m_TextState.setScriptStyle (TextState::ss_Super); break; /* Lines And Paragraphs: Paragraphs */ case TT_P: // 54 // <p> [B] New paragraph // Can't contain other block elements, inc. self // (Ignore empty paragraphs.) if (!m_TokenStack.child_of_body ()) break; if (start) { if (m_TokenStack.descendant_of (TT_PRE)) m_TextState.reset (TextState::bs_FixedNormal, false); // don't truncate TextState else m_TextState.reset (TextState::bs_Normal); m_stripFirstspace = true; m_inBlock = false; m_CharDataMode = cdm_Body; } else { if (m_bookmarkOpen) bookmark (start); // start == false if (m_hyperlinkOpen) hyperlink (start); // start == false m_inBlock = false; if (m_TokenStack.descendant_of (TT_PRE)) m_TextState.reset (TextState::bs_FixedNormal, false); // this resets more than necessary... else m_CharDataMode = cdm_Ignore; } break; /* Lines And Paragraphs: Controlling line breaks */ case TT_BR: // 55 // <br> Forced line break if (start) m_inBlock = false; break; /* Lines And Paragraphs: Preformatted text */ case TT_PRE: // 56 // <pre> Preformatted text { use fixed width font; white space critical } // Can't contain: <img>, <object>, <big>, <small>, <sub>, <sup> if (!m_TokenStack.child_of_body ()) break; if (m_TokenStack.descendant_of (TT_PRE)) break; // Is this possible? if (start) { m_TextState.reset (TextState::bs_FixedNormal); m_inBlock = false; m_CharDataMode = cdm_Body; m_stripWhitespace = false; } else { if (m_bookmarkOpen) bookmark (start); // start == false if (m_hyperlinkOpen) hyperlink (start); // start == false m_inBlock = false; m_CharDataMode = cdm_Ignore; m_stripWhitespace = true; } break; /* Lines And Paragraphs: Marking document changes */ case TT_INS: // 57 // <ins> Insert { set color red } if (start) m_TextState.setForeground (CC_red); break; case TT_DEL: // 58 // <del> Delete { set color red; set strike-through } if (start) { m_TextState.setForeground (CC_red); m_TextState.setLineThrough (); } break; /* Lists: Unordered lists, ordered lists, and list items */ case TT_OL: // 59 // <ol> Ordered list (children must be <li>) break; case TT_UL: // 60 // <ul> Unordered list (children must be <li>) break; case TT_LI: // 61 // <li> List item (</li> optional) break; /* Lists: Definition lists */ case TT_DL: // 62 // <dl> Definition list break; case TT_DT: // 63 // <dt> Definition term { set bold } break; case TT_DD: // 64 // <dd> Definition description { indent } break; /* Lists: <dir> & <menu> */ // case TT_DIR: // TT_UL // <dir> Directory list { cf. <ul> } // break; // case TT_MENU: // TT_UL // <menu> Menu list { cf. <ul> } // break; /* Tables: */ case TT_TABLE: // 65 // <table> Table break; case TT_CAPTION: // 66 // <caption> Caption (child of <table>) break; case TT_THEAD: // 67 // <thead> Table Head (child of <table> acting as new <table>) break; case TT_TFOOT: // 68 // <tfoot> Table Foot (child of <table> acting as new <table>) break; case TT_TBODY: // 69 // <tbody> Table Body (child of <table> acting as new <table>) break; case TT_COLGROUP: // 70 // <colgroup> Un-partition existing columns into one break; case TT_COL: // 71 // <col> Subdivide grouped column (child of <colgroup>) break; case TT_TR: // 72 // <tr> Row (child of <table>) break; case TT_TH: // 73 // <th> Header { set bold } (child of <tr>) break; case TT_TD: // 74 // <td> Data (child of <tr>) break; /* Objects, Images & Applets: Including an image */ case TT_IMG: // 75 // <img> Image tag (empty; no children; no end tag) if (m_TokenStack.descendant_of (TT_PRE)) break; // Is this possible? break; /* Objects, Images & Applets: Generic inclusion */ case TT_OBJECT: // 76 // <object> Object if (m_TokenStack.descendant_of (TT_PRE)) break; // Is this possible? break; /* Objects, Images & Applets: Object initialization */ case TT_PARAM: // 77 // <param> Option for object (empty; no children; no end tag) break; /* Objects, Images & Applets: Including an applet */ case TT_APPLET: // 78 // <applet> Applet break; /* Objects, Images & Applets: Client-side image maps */ case TT_MAP: // 79 // <map> Map break; case TT_AREA: // 80 // <area> Area with map (child of <map>; empty; no children; no end tag) break; /* Alignment, Font Styles etc.: Alignment */ case TT_CENTER: // 81 // <center> = <div align="center"> break; /* Alignment, Font Styles etc.: Font style elements */ case TT_TT: // 82 // <tt> Fixed width { set fixed width } if (start) m_TextState.setFontFamily ("Courier"); break; case TT_I: // 83 // <i> Italic { set italics } if (start) m_TextState.setItalic (); break; case TT_B: // 84 // <b> Bold { set bold } if (start) m_TextState.setBold (); break; case TT_BIG: // 85 // <big> Large { inc. font size } if (m_TokenStack.descendant_of (TT_PRE)) break; // Is this possible? if (start) m_TextState.scaleFontSize (1.25f); break; case TT_SMALL: // 86 // <small> Small { dec. font size } if (m_TokenStack.descendant_of (TT_PRE)) break; // Is this possible? if (start) m_TextState.scaleFontSize (0.8f); break; case TT_S: // 87 // <s> Strike-through { set strike-through } if (start) m_TextState.setLineThrough (); break; // case TT_STRIKE: // TT_S // <strike> Strike-through { set strike-through } // break; case TT_U: // 88 // <u> Underline { set underline } if (start) m_TextState.setUnderline (); break; /* Alignment, Font Styles etc.: Font modifier elements */ case TT_FONT: // 89 // <font> Set new font (mother of all...) if (start) { const char * color = s_attrLookup (atts, "color"); if (color) { UT_HashColor hash; const char * hexColor = hash.setColor (color); if (hexColor) m_TextState.setForeground (hexColor + 1); } } break; case TT_BASEFONT: // 90 // <basefont> Default font (font sizes calculated relative to base-font) break; /* Alignment, Font Styles etc.: Rules */ case TT_HR: // 91 // <hr> Horizontal rule (empty; no children; no end tag) if (start) m_inBlock = false; break; /* Frames: */ case TT_FRAMESET: // 92 // <frameset> Set of frames { no action } break; case TT_FRAME: // 93 // <frame> A frame (child of frameset) { no action } break; case TT_NOFRAMES: // 94 // <noframes> Frameless version { no action } break; case TT_IFRAME: // 95 // <iframe> Inline frame { no action } break; /* Forms: */ case TT_FORM: // 96 // <form> Form { no action } break; case TT_INPUT: // 97 // <input> Input { no action } break; case TT_BUTTON: // 98 // <button> Button { no action } break; case TT_SELECT: // 99 // <select> Select { no action } break; case TT_OPTGROUP: // 100 // <optgroup> Opt. Group { no action } break; case TT_OPTION: // 101 // <option> Option { no action } break; case TT_TEXTAREA: // 102 // <textarea> Text area { no action } break; case TT_ISINDEX: // 103 // <isindex> Index { no action } break; case TT_LABEL: // 104 // <label> Label { no action } break; case TT_FIELDSET: // 105 // <fieldset> Field set { no action } break; case TT_LEGEND: // 106 // <legend> Legend { no action } break; /* Scripts: */ case TT_SCRIPT: // 107 // <script> Script { no action } break; case TT_NOSCRIPT: // 108 // <noscript> Alt. to script { no action } break; /* Misc: */ case TT_BDO: // 109 // <bdo> BiDi override break; default: break; } if (m_error != UT_OK) { m_pHTML->stop (); return; } }