/* Copyright (C) 2007 One Laptop Per Child * Author: Marc Maurer * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 02111-1307, USA. */ #include #include #include "pd_Document.h" #include "fl_BlockLayout.h" #include "fp_Run.h" #include "fp_TextRun.h" #include "px_ChangeRecord.h" #include "px_CR_Strux.h" #include "px_CR_Span.h" #include "pf_Frag_Strux.h" #include "pf_Frag_Strux_Block.h" #include "LanguagePattern.h" #include "Highlighter.h" using std::map; Highlighter::Highlighter(FL_DocLayout* pDocLayout, const LanguageDefinition* pLangDef) : m_pDocLayout(pDocLayout), m_pLangDef(pLangDef), m_iListenerId(0) { // FIXME: we have to be _sure_ that we are notified AFTER the FL_DocLayout class we belong to, // so the layout classes have been updated already when we get notified if (m_pDocLayout && m_pDocLayout->getDocument()) { m_pDocLayout->getDocument()->addListener(this, &m_iListenerId); } } Highlighter::~Highlighter() { if (m_pDocLayout && m_pDocLayout->getDocument()) { m_pDocLayout->getDocument()->removeListener(m_iListenerId); } } bool Highlighter::populate(PL_StruxFmtHandle sfh, const PX_ChangeRecord *pcr) { UT_return_val_if_fail(sfh, false); UT_return_val_if_fail(pcr, false); UT_DEBUGMSG(("Highlighter::populate() - sfh: 0x%x, pcr type: %d\n", sfh, pcr->getType())); if (pcr->getType() == PX_ChangeRecord::PXT_InsertSpan || pcr->getType() == PX_ChangeRecord::PXT_DeleteSpan) { return _highlight(sfh, pcr, true); } return true; } bool Highlighter::populateStrux(PL_StruxDocHandle sdh, const PX_ChangeRecord *pcr, PL_StruxFmtHandle *psfh) { UT_DEBUGMSG(("Highlighter::populateStrux(sdh: 0x%x)\n", sdh)); UT_return_val_if_fail(sdh, false); // UT_return_val_if_fail(pcr, false); UT_return_val_if_fail(psfh, false); switch (pcr->getType()) { case PX_ChangeRecord::PXT_InsertStrux: { UT_DEBUGMSG(("Highlighter::populateStrux() - insert\n")); // FIXME: only do this for blocks const pf_Frag_Strux* pFS = reinterpret_cast(sdh); RegionMap* pNewRegionMap = new RegionMap(); if (_setupBlockMatches(pcr, *pNewRegionMap)) { *psfh = reinterpret_cast(pNewRegionMap); } else { DELETEP(pNewRegionMap); *psfh = (void*)NULL; } break; } case PX_ChangeRecord::PXT_DeleteStrux: UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN); // a PXT_DeleteStrux while populating is, erm, well, weird break; default: // ignore the rest, we don't need it for highlighting break; } return true; } bool Highlighter::change(PL_StruxFmtHandle sfh, const PX_ChangeRecord *pcr) { UT_DEBUGMSG(("Highlighter::change()\n")); UT_return_val_if_fail(sfh, false); UT_return_val_if_fail(pcr, false); // we are only interested in changes in actual spans switch (pcr->getType()) { case PX_ChangeRecord::PXT_InsertSpan: case PX_ChangeRecord::PXT_DeleteSpan: return _highlight(sfh, pcr); case PX_ChangeRecord::PXT_DeleteStrux: { UT_DEBUGMSG(("Highlighter::change() - PXT_DeleteStrux, sfh: 0x%x\n", sfh)); UT_return_val_if_fail(sfh, false); RegionMap* pMatches = const_cast(reinterpret_cast(sfh)); DELETEP(pMatches); return true; } default: // ignore the rest, we don't need it for highlighting break; } return true; } bool Highlighter::insertStrux(PL_StruxFmtHandle sfh, const PX_ChangeRecord *pcr, PL_StruxDocHandle sdhNew, PL_ListenerId lid, void(*pfnBindHandles)(PL_StruxDocHandle sdhNew, PL_ListenerId lid, PL_StruxFmtHandle sfhNew)) { UT_DEBUGMSG(("Highlighter::insertStrux()\n")); // UT_return_val_if_fail(sfh, false); UT_return_val_if_fail(pcr, false); UT_return_val_if_fail(sdhNew, false); RegionMap* pNewRegionMap = new RegionMap(); if (_setupBlockMatches(pcr, *pNewRegionMap)) { pfnBindHandles(sdhNew, m_iListenerId, pNewRegionMap); } else { DELETEP(pNewRegionMap); pfnBindHandles(sdhNew, m_iListenerId, (void*)NULL); } return true; } bool Highlighter::signal(UT_uint32 iSignal) { UT_DEBUGMSG(("Highlighter::signal()\n")); return true; } PLListenerType Highlighter::getType() const { return PTL_DocLayout; } bool Highlighter::_setupBlockMatches(const PX_ChangeRecord *pcr, RegionMap& matches) { UT_return_val_if_fail(pcr, false); // FIXME: can't this be faster? ie, use the sfh from the normal layout listener UT_return_val_if_fail(pcr->getType() == PX_ChangeRecord::PXT_InsertStrux, false); const PX_ChangeRecord_Strux* pcrs = reinterpret_cast(pcr); if (pcrs->getStruxType() != PTX_Block) { UT_DEBUGMSG(("Ignoring insertion of non-block type strux\n")); return false; } // TODO: we assume that this will return the block before the new block; is this // always correct?? UT_DEBUGMSG(("Looking for previous block before cr pos: %d\n", pcr->getPosition())); fl_BlockLayout* pPrevBL = m_pDocLayout->findBlockAtPosition(pcr->getPosition()-1); UT_return_val_if_fail(pPrevBL, false); if (pPrevBL->getPosition() >= pcr->getPosition()) { UT_DEBUGMSG(("Apparently we are the first block in the document, no need to set up a continuation marker\n")); return true; // apparently there is no block before our current block } UT_DEBUGMSG(("Prev block in document (pos: %d): 0x%x\n", pPrevBL->getPosition()-1, pPrevBL)); UT_DEBUGMSG(("Looking at previous block to see if we should be a continuation\n")); RegionMap* pPrevMap = _getRegionMap(pPrevBL); if (pPrevMap) { if (_isOpen(pPrevBL->getLength()-1, *pPrevMap)) // -1 for the block itself, which is included in the block length { UT_DEBUGMSG(("The new block should be inserted as continuation section\n")); // insert a continuation section 'marker' // TODO: should we make this closed or not? RegExMatch cont_match; cont_match.byte_start = 0; cont_match.byte_end = 0; cont_match.char_start = 0; cont_match.char_end = 0; cont_match.continuation = true; cont_match.closed = false; cont_match.pattern = (*(--pPrevMap->end())).second.pattern; matches.insert(map::value_type(cont_match.byte_start, cont_match)); } else UT_DEBUGMSG(("The new block is no continuation block\n")); } return true; } UT_RGBColor Highlighter::_getColor(const LanguagePattern& pattern) { if (strcmp(pattern.style.c_str(), "String") == 0) return UT_RGBColor(235,64,255); else if (strcmp(pattern.style.c_str(), "Decimal") == 0) return UT_RGBColor(148,64,255); else if (strcmp(pattern.style.c_str(), "Keyword") == 0) return UT_RGBColor(150,0,0); else if (strcmp(pattern.style.c_str(), "Types") == 0) return UT_RGBColor(0,150,81); else if (strcmp(pattern.style.c_str(), "Comment") == 0) return UT_RGBColor(0,85,213); else if (strcmp(pattern.style.c_str(), "Preprocessor") == 0) return UT_RGBColor(0,0,213); else if (strcmp(pattern.style.c_str(), "Others") == 0) return UT_RGBColor(0,200,0); else if (strcmp(pattern.style.c_str(), "Data Type") == 0) return UT_RGBColor(64,200,0); else if (strcmp(pattern.style.c_str(), "Base-N Integer") == 0) return UT_RGBColor(64,0,0); else if (strcmp(pattern.style.c_str(), "Character") == 0) return UT_RGBColor(64,0,64); else { UT_DEBUGMSG(("unknown pattern style: %s\n", pattern.style.c_str())); UT_ASSERT(UT_NOT_IMPLEMENTED); return UT_RGBColor(255,0,0); } } bool Highlighter::_highlight(PL_StruxFmtHandle sfh, const PX_ChangeRecord *pcr, bool isPopulating) { UT_DEBUGMSG(("Highlighter::_highlight() - pcr: 0x%x, pcr type: %d\n", pcr, pcr ? pcr->getType() : -1)); UT_return_val_if_fail(pcr->getType() == PX_ChangeRecord::PXT_InsertSpan || pcr->getType() == PX_ChangeRecord::PXT_DeleteSpan, false); UT_DEBUGMSG(("Highlighter::_highlight() - cr is of type %s\n", pcr->getType() == PX_ChangeRecord::PXT_InsertSpan ? "insert" : "delete")); const PX_ChangeRecord_Span* pcrs = static_cast(pcr); // calculate the corrention needed to determine positions before the changerecord was applied UT_sint32 crCorrection = (pcrs->getType() == PX_ChangeRecord::PXT_InsertSpan ? -pcrs->getLength() : ( pcrs->getType() == PX_ChangeRecord::PXT_DeleteSpan ? pcrs->getLength() : 0 )); UT_DEBUGMSG(("Highlighter::_highlight() insert/delete change: blockoffset: %d, bufindex: %u, length %d\n", pcrs->getBlockOffset(), pcrs->getBufIndex(), pcrs->getLength())); // get our current block // FIXME: can't this be faster? ie, use the sfh from the normal layout listener fl_BlockLayout* pBL = m_pDocLayout->findBlockAtPosition(pcr->getPosition()); UT_return_val_if_fail(pBL, true); // FIXME: we should stop highlighting when a match is equal to the match // we already (might have) had // // Re-match all (potentially) damaged blocks // RegionMap* pMatches = 0; PT_BlockOffset damageOffset = -1; bool finished = false; fl_BlockLayout* pPrevBL = 0; while (pBL != pPrevBL) { pPrevBL = pBL; if (pBL == 0) break; bool openEndAtStart = false; bool openEndAtEnd = false; UT_GrowBuf textBuffer; // get a text buffer for this block // TODO: optimize this PL_StruxDocHandle sdh = pBL->getStruxDocHandle(); UT_return_val_if_fail(sdh, true); const pf_Frag_Strux_Block* fsb = reinterpret_cast(sdh); m_pDocLayout->getDocument()->getBlockBuf(sdh, &textBuffer); pMatches = _getRegionMap(pBL); if (!pMatches) { UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN); break; } RegionMap& matches = *pMatches; // check if we need to recheck the next block openEndAtStart = _isOpen(textBuffer.getLength() + crCorrection, matches); crCorrection = 0; // we only need to correct for the first block // find the offset in the block the damage has occurred if (damageOffset == -1) damageOffset = _findDamageOffset(fsb, pcr, matches); // syntac highlight this block UT_DEBUGMSG(("Syntaxhighlighting from offset: %d\n", damageOffset)); // first, check if we damaged a continuation match (from a match in the previous block) bool matchingFinished = false; if (damageOffset == 0 && matches.begin() != matches.end() && (*matches.begin()).second.continuation) { RegExMatch old_cont_match = (*matches.begin()).second; UT_DEBUGMSG(("Damage occurred to a continuation section, for pattern: %s!\n", old_cont_match.pattern->name.c_str())); PT_BlockOffset end_regex_offset = 0; // FIXME: remove ugly const cast bool foundEndRegEx = _matchEndRegex(textBuffer, *const_cast(old_cont_match.pattern), &end_regex_offset); if (foundEndRegEx) { UT_DEBUGMSG(("Found end regex for continuation match at pos: %d\n", end_regex_offset)); _pruneRegionsUpTo(end_regex_offset, matches); RegExMatch cont_closed_match; cont_closed_match.byte_start = 0; cont_closed_match.byte_end = end_regex_offset; // FIXME: THIS IS PLAIN WRONG cont_closed_match.char_start = 0; cont_closed_match.char_end = end_regex_offset; cont_closed_match.continuation = true; cont_closed_match.closed = true; cont_closed_match.pattern = old_cont_match.pattern; matches.insert(map::value_type(cont_closed_match.byte_start, cont_closed_match)); UT_DEBUGMSG(("Moving damageOffset forward to %d\n", end_regex_offset)); damageOffset = end_regex_offset; } else { UT_DEBUGMSG(("Found no end regex for continuation match, marking whole block as a continuation and open\n")); // we apparently damaged the end regex match, and no other one exists in this // block. This means we'll make 1 match that spans this whole block, and is as // well a continuation match as an open match matches.clear(); RegExMatch cont_open_match; cont_open_match.byte_start = 0; cont_open_match.byte_end = textBuffer.getLength(); // FIXME: THIS IS PLAIN WRONG cont_open_match.char_start = 0; cont_open_match.char_end = textBuffer.getLength(); cont_open_match.continuation = true; cont_open_match.closed = false; cont_open_match.pattern = old_cont_match.pattern; matches.insert(map::value_type(cont_open_match.byte_start, cont_open_match)); matchingFinished = true; } } if (!matchingFinished) { // delete all regions after the damage spot, we can't be sure they won't // be touched in any shape or form _pruneRegions(damageOffset, matches); _matchRegions(textBuffer, damageOffset, matches); _pruneAndRematchRegions(textBuffer, damageOffset, matches); } // Update the run coloring information for this block _decorateRuns(pBL, damageOffset, matches); if (!isPopulating) { // the last match in this block could have become open during this rematching // session, which means we have to continue matching on the next line // NOTE: we need to check what the situation was BEFORE the changerecord was // applied, hence the changerecord correction factor openEndAtEnd = _isOpen(textBuffer.getLength(), matches); // now check if we need to rematch the next block if (!(openEndAtStart || openEndAtEnd)) { UT_DEBUGMSG(("openEndAtStart = false, openEndAtEnd = false\n")); // do nothing } else if (!openEndAtStart && openEndAtEnd) { UT_DEBUGMSG(("openEndAtStart = false, openEndAtEnd = true\n")); // search for the ending regex of our open match in the next block(s) RegionMap::iterator ilast = --matches.end(); RegExMatch& open_match = (*ilast).second; if (open_match.pattern) { pBL = pBL->getNextBlockInDocument(); while (pBL) { UT_GrowBuf nextTextBuffer; PT_BlockOffset end_regex_offset = 0; PL_StruxDocHandle next_sdh = pBL->getStruxDocHandle(); UT_return_val_if_fail(next_sdh, false); m_pDocLayout->getDocument()->getBlockBuf(next_sdh, &nextTextBuffer); bool foundEndRegEx = _matchEndRegex(nextTextBuffer, *const_cast(open_match.pattern), &end_regex_offset); if (foundEndRegEx) { damageOffset = end_regex_offset; RegionMap* pNextBlockMatches = _getRegionMap(pBL); if (pNextBlockMatches) { // first, delete all matches in the next block up to the end of the ending regex _pruneRegionsUpTo(end_regex_offset, *pNextBlockMatches); // insert a new match, which closes the opened match RegExMatch closing_match; closing_match.byte_start = 0; // TODO: this isn't true when there are for example images in this block closing_match.byte_end = damageOffset; // FIXME, not utf8 safe closing_match.char_start = 0; closing_match.char_end = damageOffset; closing_match.continuation = true; closing_match.closed = true; closing_match.pattern = open_match.pattern; pNextBlockMatches->insert(map::value_type(closing_match.byte_start, closing_match)); // recolor the runs _decorateRuns(pBL, 0, *pNextBlockMatches); } else UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN); break; } else { // nope, this block does not have the ending regex we are looking for // this means we will erase all it's current matches, and replace it // with one open match UT_DEBUGMSG(("No closing regex found for block at pos: %d, erasing all and inserting a signle continued, open match\n", pBL->getPosition())); RegionMap* pNextBlockMatches = _getRegionMap(pBL); if (pNextBlockMatches) { // clear and insert one continued, open match pNextBlockMatches->clear(); RegExMatch cont_match; cont_match.byte_start = 0; // TODO: this isn't true when there are for example images in this block cont_match.byte_end = pBL->getLength(); // FIXME, not utf8 safe cont_match.char_start = 0; cont_match.char_end = pBL->getLength(); cont_match.continuation = true; cont_match.closed = false; cont_match.pattern = open_match.pattern; pNextBlockMatches->insert(map::value_type(cont_match.byte_start, cont_match)); // recolor the runs _decorateRuns(pBL, 0, *pNextBlockMatches); } else UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN); // now search again for a matching end regex in the next block; maybe // we are lucky this time pBL = pBL->getNextBlockInDocument(); } } } else UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN); continue; } else if (openEndAtStart && openEndAtEnd) { UT_DEBUGMSG(("openEndAtStart = true, openEndAtEnd = true\n")); // FIXME: we're ONLY done here when the current open match and the // end as this block didn't change its type during this rematching // iteration. If its open of a different type now, then we should // recheck the next block for a closing match } // TODO: ALSO CHECK IF WHEN WE _ARE_ OPEN, THAT THE NEXT CONTINUATIO MATCH HAS THE SAME PATTERN // one final last case: the next block is a continuation, but we don't have // any open match on the end of the current block, meaning we have found // some stale highlighting, force a recheck. if (!openEndAtEnd && _nextBlockIsContinuation(pBL)) // FIXME: suboptimal, as it requires 2 times fetching the next block in this document { UT_DEBUGMSG(("Detected stale continuation highlighting in the next block, continuing with next block\n")); pBL = pBL->getNextBlockInDocument(); // NOTE: make sure to remove all existing matches, as a full recheck will never remove continuations RegionMap* pNextMatches = _getRegionMap(pBL); if (pNextMatches) pNextMatches->clear(); else UT_ASSERT(UT_SHOULD_NOT_HAPPEN); damageOffset = 0; } } /* if (!populating) */ } /* while (pBL != pPrevBL) */ return true; } PT_BlockOffset Highlighter::_findDamageOffset(const pf_Frag_Strux_Block* fsb, const PX_ChangeRecord *pcr, RegionMap& matches) { // get the position of the damage relative to the start of the block // NOTE: we want to find the position for the situation that existed // BEFORE the changerecord was applied, as all our matches are indexed // using that metric PT_BlockOffset damageOffset = pcr->getPosition() - fsb->getPos() - 1; // TODO: rewrite this!!! return 0; } bool Highlighter::_isContinuation(RegionMap& matches) { UT_DEBUGMSG(("Highlighter::_isContinuation()\n")); return matches.begin() != matches.end() && (*matches.begin()).second.continuation; } bool Highlighter::_isOpen(PT_BlockOffset endOffset, RegionMap& matches) { UT_DEBUGMSG(("Highlighter::_isOpen() - endOffset: %d\n", endOffset)); if (matches.size() == 0) return false; // TODO: only do this if the current match was actually modified/removed/whatever (can't check that here) RegionMap::iterator ilast = --matches.end(); RegExMatch& match = (*ilast).second; UT_DEBUGMSG(("Found the last match, char_end: %d, type: %s\n", match.char_end, match.pattern->name.c_str())); if (match.char_end == endOffset && !match.closed) return true; return false; } bool Highlighter::_nextBlockIsContinuation(fl_BlockLayout* pBL) { UT_return_val_if_fail(pBL, false); fl_BlockLayout* pNextBL = pBL->getNextBlockInDocument(); if (pNextBL) { RegionMap* pMatches = _getRegionMap(pNextBL); if (pMatches) { if (pMatches->begin() != pMatches->end() && (*pMatches->begin()).second.continuation) return true; } else UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN); } return false; } RegionMap* Highlighter::_getRegionMap(const fl_BlockLayout* pBL) { UT_return_val_if_fail(pBL, NULL); // get pt element (frag strux) for this block PL_StruxDocHandle sdh = pBL->getStruxDocHandle(); UT_return_val_if_fail(sdh, 0); const pf_Frag_Strux_Block* fsb = reinterpret_cast(sdh); return const_cast(reinterpret_cast(fsb->getFmtHandle(m_iListenerId))); // we really don't want/require the RegionMap to be const } void Highlighter::_pruneRegionsUpTo(PT_BlockOffset startOffset, RegionMap& matches) { for (RegionMap::iterator nbmit = matches.begin(); nbmit != matches.end();) { RegionMap::iterator cur_nbmit = nbmit++; if ((*cur_nbmit).first < startOffset) matches.erase(cur_nbmit); else break; } } void Highlighter::_pruneRegions(PT_BlockOffset startOffset, RegionMap& matches) { for (RegionMap::iterator mit = matches.begin(); mit != matches.end(); mit++) { RegExMatch& match = (*mit).second; if (match.char_start >= startOffset) { matches.erase(mit, matches.end()); break; } } } void Highlighter::_matchRegions(const UT_GrowBuf& textBuffer, PT_BlockOffset startOffset, RegionMap& matches) { UT_return_if_fail(m_pLangDef); gchar* utf8str = g_ucs4_to_utf8((const gunichar*)textBuffer.getPointer(0), textBuffer.getLength(), 0, 0, 0); int startByteOffset = (startOffset > 0 ? g_utf8_offset_to_pointer(utf8str, startOffset) - utf8str : 0); UT_DEBUGMSG(("Highlighter::_nextRegion() - utf8str: %s, startByteOffset: %d\n", utf8str, startByteOffset)); RegExMatch match; for (vector::const_iterator pos = m_pLangDef->getPatterns().begin(); pos != m_pLangDef->getPatterns().end(); pos++) { LanguagePattern* pPat = *pos; if (pPat) { match.pattern = pPat; switch (pPat->type) { case LanguagePattern::ESCAPE_CHAR: // do nothing break; default: { // try to match this pattern; every pattern may match multiple times (where applicable) int start = startByteOffset; int strlength = g_utf8_strlen(utf8str, -1); while (start >= 0 && start < strlength) { if (pPat->start_regex.size() > 0) { if (_search(pPat->comp_start_regex, pPat->start_regex, utf8str, strlength, start, &match) >= 0) { UT_DEBUGMSG(("start-regex match: %s at pos %d\n", pPat->start_regex.c_str(), match.byte_start)); start = match.byte_end; if (pPat->end_regex.size() > 0) { // now try to find up the ending regex RegExMatch end_match = match; if (start < strlength && _search(pPat->comp_end_regex, pPat->end_regex, utf8str, strlength, start, &end_match) >= 0) { UT_DEBUGMSG(("end-regex match: %s at pos: %d\n", pPat->end_regex.c_str(), end_match.byte_start)); start = end_match.byte_end; // now resize the opening match to include the end match match.byte_end = end_match.byte_end; match.char_end = end_match.char_end; matches.insert(map::value_type(match.byte_start, match)); } else { // we found no matching end regex for the opening regex if (pPat->endAtLineEnd) { UT_DEBUGMSG(("no end-regex match found, inflating to end of line and inserting at pos %d, type: %s\n", match.byte_start, pPat->name.c_str())); // just scale up this match to the end of the line, and be done with it match.byte_end = (int)g_utf8_strlen(utf8str, -1); match.char_end = g_utf8_pointer_to_offset(utf8str, utf8str + match.byte_end); // TODO: is it save to make the offset point after the string? matches.insert(map::value_type(match.byte_start, match)); UT_DEBUGMSG(("inflated, byte_end: %d, char_end: %d\n", match.byte_end, match.char_end)); break; } else { UT_DEBUGMSG(("no end-regex match found, inflating to end of line, marking unclosed and inserting\n")); // scale up this match to the end of the line; // furtermore mark this match as // non-closed, so we can continue searching for the matching end regex on the next line; match.byte_end = (int)g_utf8_strlen(utf8str, -1); match.char_end = g_utf8_pointer_to_offset(utf8str, utf8str + match.byte_end); // TODO: is it save to make the offset point after the string? match.closed = false; matches.insert(map::value_type(match.byte_start, match)); break; } } } else { // there is no end regex belonging to this start regex if (pPat->endAtLineEnd) { // scale up this match to the end of the line match.byte_end = strlength; match.char_end = g_utf8_pointer_to_offset(utf8str, utf8str + match.byte_end); // TODO: is it save to make the offset point after the string? matches.insert(map::value_type(match.byte_start, match)); break; } else { // huh?! should we just scale this match up to the end of this document? UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN); // for now, to handle this we'll just scale this match up to the end of the line match.byte_end = strlength; match.char_end = g_utf8_pointer_to_offset(utf8str, utf8str + match.byte_end); // TODO: is it save to make the offset point after the string? matches.insert(map::value_type(match.byte_start, match)); break; } } } else break; } else if (pPat->regex.size() > 0) { if (_search(pPat->comp_regex, pPat->regex, utf8str, strlength, start, &match) >= 0) { // TODO: is end-at-end-of-line important here? UT_DEBUGMSG(("regex match: %s\n", pPat->regex.c_str())); start = match.byte_end; matches.insert(map::value_type(match.byte_start, match));; } else break; } else { UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN); break; } } } break; } /* switch */ } /* if */ } /* for */ } void Highlighter::_pruneAndRematchRegions(const UT_GrowBuf& textBuffer, PT_BlockOffset startOffset, RegionMap& matches) { UT_DEBUGMSG(("pruneAndRematchRegions() - startOffset: %d\n", startOffset)); // now prune all overlapping matches for (map::iterator cur = matches.begin(); cur != matches.end(); cur++) { RegExMatch cur_match = (*cur).second; UT_continue_if_fail(cur_match.pattern); // first, skip every match that is before our starting position if (cur_match.char_end <= startOffset) { UT_DEBUGMSG(("Skipping check on pos %d for pattern: %s\n", cur_match.byte_start, cur_match.pattern->name.c_str())); continue; } UT_DEBUGMSG(("\tKeep match at pos: %d-%d for pattern: %s\n", cur_match.byte_start, cur_match.byte_end, cur_match.pattern->name.c_str())); map::iterator next = cur; next++; if (next == matches.end()) { UT_DEBUGMSG(("\tWe are the last match, auto-approving\n")); break; } RegExMatch next_match = (*next).second; bool checknext = true; while (checknext) { if ((next_match.byte_start <= cur_match.byte_end-1) && (next_match.byte_end-1 <= cur_match.byte_end-1)) { // the current match completely overlaps the next match, so we // can remove the next match from our list of valid matches UT_DEBUGMSG(("\tDelete match at pos: %d for pattern: %s\n", next_match.byte_start, next_match.pattern->name.c_str())); map::iterator del = next; next++; next_match = (*next).second; matches.erase(del); if (next == matches.end()) checknext = false; } else if ((next_match.byte_start <= cur_match.byte_end-1) && next_match.byte_end-1 > cur_match.byte_end-1) { // the next match has its starting point within the current match, // and its end point after the end of the current match. // for example (c code): // // if (true) { pr/*intf("*/a"); printf("b"); } // // the closing " of string in this case should be interpreted as // the start of a string. // to reach that behavior, we invalidate every match after the // current match, and start over from the end point of the current match UT_DEBUGMSG(("\tPartial overlap found at pos %d, INVALIDATING FROM %d, and auto-approving current\n", next_match.byte_start, cur_match.byte_end)); matches.erase(next, matches.end()); _matchRegions(textBuffer, cur_match.char_end, matches); UT_DEBUGMSG(("\tDone redoing partial overlap from pos %d\n", cur_match.byte_end)); // note that we can continue validating from this point onwards, as none of the // already validated matches will be touched by the previous _matchRegions() calls // we can auto-approve the current match now, as no other match can overlap this one // anymore, given the matches.erase() we just did checknext = false; } else { UT_DEBUGMSG(("\tNo overlap against current match anymore: %d-%d for pattern: %s\n", cur_match.byte_start, cur_match.byte_end, cur_match.pattern->name.c_str())); checknext = false; } } /* while */ } } bool Highlighter::_matchEndRegex(const UT_GrowBuf& textBuffer, LanguagePattern& pattern, PT_BlockOffset* iOffset) { UT_DEBUGMSG(("Highlighter::_matchEndRegex()\n")); if (pattern.end_regex.size() == 0) return false; gchar* utf8str = g_ucs4_to_utf8((const gunichar*)textBuffer.getPointer(0), textBuffer.getLength(), 0, 0, 0); UT_DEBUGMSG(("Highlighter::_matchEndRegex() - utf8str: %s\n", utf8str)); RegExMatch match; if (_search(pattern.comp_end_regex, pattern.end_regex, utf8str, (int)g_utf8_strlen(utf8str, -1), 0, &match) >= 0) { UT_DEBUGMSG(("Found end regex ending at char offset: %d\n", match.char_end)); *iOffset = match.char_end; return true; } return false; } void Highlighter::_decorateRuns(fl_BlockLayout* pBL, UT_uint32 runOffset, RegionMap& matches) { UT_return_if_fail(pBL); // TODO: use getRun() ourselves, as we iterate way to much now using findRunAtOffset RegionMap::const_iterator cmit = matches.begin(); fp_Run* pRun = pBL->findRunAtOffset(runOffset); while (pRun) { runOffset = pRun->getBlockOffset(); // find the actual run offset if (pRun->getType() == FPRUN_TEXT) { fp_TextRun* pTextRun = static_cast(pRun); vector& vDecoration = pTextRun->getTextDecorations(); vDecoration.clear(); pTextRun->markAsDirty(); UT_DEBUGMSG(("Got textrun: 0x%x, run block offset: %d\n", pRun, pTextRun->getBlockOffset())); while (cmit != matches.end()) { RegExMatch match = (*cmit).second; if (match.char_end <= pRun->getBlockOffset()) { // the match is before this run, we're done with the current match cmit++; } else if (match.char_start >= pRun->getBlockOffset() + pRun->getLength()) { // the match is after this run, we're done witht the current run break; } else { // this match touches this run UT_uint32 hit_start = match.char_start > pRun->getBlockOffset() ? match.char_start : pRun->getBlockOffset(); UT_uint32 hit_end = match.char_end > pRun->getBlockOffset() + pRun->getLength() ? pRun->getBlockOffset() + pRun->getLength() : match.char_end; // color this part of the fp_TextRunDecoration decor; decor.setStartOffset(hit_start - pRun->getBlockOffset()); // decorator offsets are relative to the run block offset decor.setEndOffset(hit_end - pRun->getBlockOffset()); decor.setFgColor(_getColor(*match.pattern)); vDecoration.push_back(decor); UT_DEBUGMSG(("Colored textrun: 0x%x, run block offset: %d, length: %d, decor.start: %d, decor.end: %d, match.char_end: %d\n", pRun, pTextRun->getBlockOffset(), pRun->getLength(), decor.getStartOffset(), decor.getEndOffset(), match.char_end)); if (match.char_end <= pRun->getBlockOffset() + pRun->getLength()) cmit++; // we're done with the current match if (hit_end == pRun->getBlockOffset() + pRun->getLength()) break; // we're done with the current run } } } runOffset = pRun->getBlockOffset() + pRun->getLength(); UT_DEBUGMSG(("Finding run at offset: %d\n", runOffset)); pRun = pBL->findRunAtOffset(runOffset); } UT_DEBUGMSG(("Done coloring runs\n")); } int Highlighter::_search(regex_t*& comp_regex, const string& regex, gchar* str, int length, int start, RegExMatch* pMatch) { //UT_DEBUGMSG(("Highlighter::_search() - regex: >%s<, str: >%s<, length: %d, start: %d\n", regex.c_str(), str, length, start)); //UT_return_val_if_fail(comp_regex, -2); UT_return_val_if_fail(regex.size() > 0, -2); UT_return_val_if_fail(str != NULL && *str != '\0', -2); UT_return_val_if_fail(length > 0, -2); UT_return_val_if_fail(start >= 0 && start < length, -2); UT_return_val_if_fail(pMatch, -2); int startpos = -1; /* no match */ if (!comp_regex) { comp_regex = new regex_t(); //UT_DEBUGMSG(("Compiling regex in 0x%x\n", comp_regex)); re_syntax_options = RE_SYNTAX_POSIX_MINIMAL_EXTENDED; comp_regex->translate = NULL; comp_regex->fastmap = reinterpret_cast(g_malloc(256)); comp_regex->allocated = 0; comp_regex->buffer = NULL; // precompile our regular expression, and cache the results const char *res = re_compile_pattern(regex.c_str(), regex.size(), comp_regex); if (res != NULL) { UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN); FREEP(comp_regex->fastmap); DELETEP(comp_regex); return -2; /* internal error */ } if (re_compile_fastmap(comp_regex) != 0) { UT_ASSERT_HARMLESS(UT_SHOULD_NOT_HAPPEN); regfree(comp_regex); DELETEP(comp_regex); return -2; /* internal error */ } } //UT_DEBUGMSG(("Using compiled regex: 0x%x\n", comp_regex)); re_registers regs; int p = re_search(comp_regex, str, length, start, length, ®s); if (p >= 0) { // UT_DEBUGMSG(("match at pos %d for regex: %s\n", p, regex.c_str())); startpos = p; // byte index pMatch->byte_start = p; pMatch->byte_end = regs.end[0]; // character index pMatch->char_start = g_utf8_pointer_to_offset(str, str + p); pMatch->char_end = g_utf8_pointer_to_offset(str, str + regs.end[0]); //UT_DEBUGMSG((" start: %d, end: %d growbuf: start: %d, end: %d\n", pMatch->byte_start, pMatch->byte_end, pMatch->char_start, pMatch->char_end)); // HACK HACK HACK: when we have a match, and the same pattern matches next time, it will crash regfree(comp_regex); DELETEP(comp_regex); } return startpos; }