/* Crossmark processing library * Copyright (C) 2006, Robert Staudinger * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. */ #include #include #include #include "cm-scanner.hh" #include "cm-stdio-stream.hh" // debug #include using namespace crossmark; Scanner::Scanner (const std::string &file) : _istream (* new streams::StdInput (file)), _ownStream (FALSE), _next (NULL), _c1 (0) { _next = new tokens::Start (); } Scanner::Scanner (streams::Input &istream) : _istream (istream), _ownStream (TRUE), _next (NULL), _c1 (0) { _next = new tokens::Start (); } Scanner::~Scanner () { if (_ownStream) { delete &_istream; } } /*! * Fetch the next token from the input file. 
*/ tokens::Token * Scanner::fetchToken () { tokens::Token *token; if (_next) { token = _next; _next = NULL; return token; } tokens::Text *text = NULL; gboolean restart; gunichar c2; gunichar tail; while (TRUE) { restart = FALSE; // look ahead if (_c1 == 0) { _c1 = _istream.getChar (); } if ((_next = scanEnd ()) || (_next = scanNewline (restart)) || (_next = scanIndent ())) { if (text) { return text; } else { token = _next; _next = NULL; return token; } } if (restart) { continue; } // look further c2 = _istream.getChar (); //std::cout << "-- '" << _c1 << "' '" << c2 << "'" << std::endl; if ((_next = scanEnd (c2))) { if (text) { text->append (_c1); } else { text = new tokens::Text (g_strdup_printf ("%c", c2)); } _c1 = c2; return text; } else if (_next = scanStyle (c2, tail)) { if (text && tail) { text->append (tail); token = text; } else if (text) { token = text; } else if (tail) { text = new tokens::Text (g_strdup_printf ("%c", tail)); token = text; } else { token = _next; _next = NULL; } return token; } else { if (text) { text->append (_c1); } else { text = new tokens::Text (g_strdup_printf ("%c", _c1)); } _c1 = c2; } } } tokens::Token * Scanner::scanEnd () { if (_c1 == EOF) { return new tokens::End (); } return NULL; } tokens::Token * Scanner::scanEnd (gunichar c) { if (c == EOF) { return new tokens::End (); } return NULL; } /*! * \todo Not eat the newline if an indentation follows, * need to recognise that for blockquote, lists. * \todo Should a newline be replaced by a whitespace under certain circumstances? * Otherwise "foo\nbar" ends up as "foobar". */ tokens::Token * Scanner::scanNewline (gboolean &restart) { gboolean isParagraph; restart = FALSE; isParagraph = FALSE; if (_c1 == '\n') { do { _c1 = _istream.getChar (); if (_c1 == '\n') { isParagraph = TRUE; } } while (_c1 == '\n'); if (isParagraph) { return new tokens::Paragraph (); } else { return new tokens::Newline (); restart = TRUE; } } return NULL; } /*! * \todo For now only tab indentation is supported. 
*/
// Recognise an indentation at the current look-ahead character.
// Returns a new tokens::Indent and clears _c1 when _c1 is a tab,
// NULL (leaving _c1 untouched) otherwise.
tokens::Token *
Scanner::scanIndent ()
{
	if (_c1 != '\t') {
		return NULL;
	}

	// The tab is consumed by resetting the look-ahead character.
	_c1 = 0;
	return new tokens::Indent ();
}

/*!
 * Recognise a style marker ('*', '/', '`' or '_') next to a space.
 *
 * A space followed by a marker opens a style (LEFT): the space is handed
 * back through \p tail and _c1 is reset to 0. A marker followed by a space
 * closes a style (RIGHT): \p c2 becomes the new _c1 and \p tail stays 0.
 *
 * \param c2   Character following _c1 in the input.
 * \param tail Set to 0 on entry; receives the leading space for LEFT
 *             markers.
 * \return A new tokens::Style for a recognised pair, NULL otherwise (in
 *         which case _c1 is not modified).
 *
 * \todo Support all specified word boundaries (whitespace, punctuation, newline).
 */
tokens::Token *
Scanner::scanStyle (gunichar c2, gunichar &tail)
{
	tokens::Token *style = NULL;

	tail = 0;

	if (_c1 == ' ') {
		// Space first: possibly an opening marker in c2.
		switch (c2) {
		case '*':
			style = new tokens::Style (tokens::Style::ASTERISK, tokens::Style::LEFT);
			break;
		case '/':
			style = new tokens::Style (tokens::Style::SLASH, tokens::Style::LEFT);
			break;
		case '`':
			style = new tokens::Style (tokens::Style::BACKTICK, tokens::Style::LEFT);
			break;
		case '_':
			style = new tokens::Style (tokens::Style::UNDERSCORE, tokens::Style::LEFT);
			break;
		default:
			break;
		}
		if (style) {
			tail = _c1;
			_c1 = 0;
		}
	} else if (c2 == ' ') {
		// Space second: possibly a closing marker in _c1.
		switch (_c1) {
		case '*':
			style = new tokens::Style (tokens::Style::ASTERISK, tokens::Style::RIGHT);
			break;
		case '/':
			style = new tokens::Style (tokens::Style::SLASH, tokens::Style::RIGHT);
			break;
		case '`':
			style = new tokens::Style (tokens::Style::BACKTICK, tokens::Style::RIGHT);
			break;
		case '_':
			style = new tokens::Style (tokens::Style::UNDERSCORE, tokens::Style::RIGHT);
			break;
		default:
			break;
		}
		if (style) {
			_c1 = c2;
		}
	}

	return style;
}