/* AbiWord: HTML_Parser class, uses libxml2 * Copyright (C) 2002 Francis James Franklin * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 02111-1307, USA. */ #include #include #include #include #include "HTML_Parser.h" static void _startElement (void * userData, const xmlChar * name, const xmlChar ** atts); static void _endElement (void * userData, const xmlChar * name); static void _charData (void * userData, const xmlChar * buffer, int length); static xmlEntityPtr _getEntity (void * userData, const xmlChar * name); HTML_Parser::HTML_Parser () : m_pListener(0) { // } HTML_Parser::~HTML_Parser () { // } /* WARNING: this sniff mechanism is overly tolerant. libxml2's strength in * parsing is a weakness in detection. */ bool HTML_Parser::sniff (const char * buffer, int length) { if ((length == 0) || (buffer == 0)) return false; htmlSAXHandler hdl; htmlParserCtxtPtr ctxt; hdl.internalSubset = NULL; hdl.isStandalone = NULL; hdl.hasInternalSubset = NULL; hdl.hasExternalSubset = NULL; hdl.resolveEntity = NULL; hdl.getEntity = _getEntity; hdl.entityDecl = NULL; hdl.notationDecl = NULL; hdl.attributeDecl = NULL; hdl.elementDecl = NULL; hdl.unparsedEntityDecl = NULL; hdl.setDocumentLocator = NULL; hdl.startDocument = NULL; hdl.endDocument = NULL; hdl.startElement = _startElement; hdl.endElement = _endElement; hdl.reference = NULL; hdl.characters = _charData; hdl.ignorableWhitespace = NULL; hdl.processingInstruction = NULL; hdl.comment = NULL; hdl.warning = NULL; hdl.error = NULL; hdl.fatalError = NULL; hdl.getParameterEntity = NULL; hdl.cdataBlock = NULL; hdl.externalSubset = NULL; m_bStopped = false; m_bValid = false; // Watch out for libxml2 bug - keep initial buffer << 4000 for safety int initial_length = (length <= 128) ? length : 128; ctxt = htmlCreatePushParserCtxt (&hdl, (void *) this, buffer, (int) initial_length, 0, XML_CHAR_ENCODING_NONE); if (ctxt == 0) return false; int length_remaining = length - initial_length; const char * ptr = buffer + initial_length; while (length_remaining && !m_bValid && !m_bStopped) { int parse_length = (length_remaining <= 128) ? length_remaining : 127; if (htmlParseChunk (ctxt, ptr, parse_length, 0)) { m_bStopped = true; break; } length_remaining -= parse_length; ptr += parse_length; } if (!m_bValid && !m_bStopped) htmlParseChunk (ctxt, ptr, 0, 1); ctxt->sax = 0; htmlFreeParserCtxt (ctxt); if (m_bStopped) return false; // Suggests (very) invalid HTML return m_bValid; } bool HTML_Parser::parse (const char * filename) { if (filename == 0) return false; FILE * in = fopen (filename, "rb"); if (in == 0) return false; htmlSAXHandler hdl; htmlParserCtxtPtr ctxt = 0; hdl.internalSubset = NULL; hdl.isStandalone = NULL; hdl.hasInternalSubset = NULL; hdl.hasExternalSubset = NULL; hdl.resolveEntity = NULL; hdl.getEntity = _getEntity; hdl.entityDecl = NULL; hdl.notationDecl = NULL; hdl.attributeDecl = NULL; hdl.elementDecl = NULL; hdl.unparsedEntityDecl = NULL; hdl.setDocumentLocator = NULL; hdl.startDocument = NULL; hdl.endDocument = NULL; hdl.startElement = _startElement; hdl.endElement = _endElement; hdl.reference = NULL; hdl.characters = _charData; hdl.ignorableWhitespace = NULL; hdl.processingInstruction = NULL; hdl.comment = NULL; hdl.warning = NULL; hdl.error = NULL; hdl.fatalError = NULL; hdl.getParameterEntity = NULL; hdl.cdataBlock = NULL; hdl.externalSubset = NULL; m_bStopped = false; m_bValid = false; // Watch out for libxml2 bug - keep initial buffer << 4000 for safety char buffer[2048]; int length = fread (buffer, 1, 2048, in); int done = (length < 2048) ? 1 : 0; if (length) { ctxt = htmlCreatePushParserCtxt (&hdl, (void *) this, buffer, length, 0, XML_CHAR_ENCODING_NONE); if (ctxt == 0) m_bStopped = true; } if (ctxt) { while (!done && !m_bStopped) { length = fread (buffer, 1, 2048, in); done = (length < 2048) ? 1 : 0; if (htmlParseChunk (ctxt, buffer, length, 0)) { m_bStopped = true; break; } } if (!m_bStopped) htmlParseChunk (ctxt, buffer, 0, 1); ctxt->sax = 0; htmlFreeParserCtxt (ctxt); } fclose (in); return !m_bStopped; } static void _startElement (void * userData, const xmlChar * name, const xmlChar ** atts) { HTML_Parser * pHTML = (HTML_Parser *) userData; /* libxml2 can supply atts == 0, which is a little at variance to what is expected... */ const char * ptr = 0; const char ** new_atts = (const char **) atts; if (atts == 0) new_atts = &ptr; pHTML->startElement ((const char *) name, new_atts); } static void _endElement (void * userData, const xmlChar * name) { HTML_Parser * pHTML = (HTML_Parser *) userData; pHTML->endElement ((const char *) name); } static void _charData (void * userData, const xmlChar * buffer, int length) { HTML_Parser * pHTML = (HTML_Parser *) userData; pHTML->charData ((const char *) buffer, length); } static xmlEntityPtr _getEntity (void * userData, const xmlChar * name) { return xmlGetPredefinedEntity (name); } void HTML_Parser::startElement (const char * name, const char ** atts) { if (m_bStopped) return; if (!m_bValid) { if ((strcmp (name, "html") == 0) || (strcmp (name, "html:html") == 0)) { m_bValid = true; } else { m_bStopped = true; return; } } if (m_pListener) m_pListener->startElement (name, atts); } void HTML_Parser::endElement (const char * name) { if (m_bStopped) return; if (m_pListener) m_pListener->endElement (name); } void HTML_Parser::charData (const char * buffer, int length) { if (m_bStopped) return; if (m_pListener) m_pListener->charData (buffer, length); }