/* -*- mode: C++; tab-width: 4; c-basic-offset: 4; -*- */ // UT_Stringbuf.cpp // Copyright (C) 2001 Mike Nordell // // This class is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. // // This class is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA // 02111-1307, USA. // #include #include #include #include "ut_string.h" #include "ut_stringbuf.h" #include "ut_string_class.h" #include "ut_assert.h" #include "ut_debugmsg.h" // these classes keep zero terminated strings. // if size() != 0, capacity() is always at least size() + 1. ////////////////////////////////////////////////////////////////// static inline void my_ut_swap(size_t & a, size_t & b) { size_t t = a; a = b; b = t; } static inline void my_ut_swap(char *& a, char *& b) { char * t = a; a = b; b = t; } static inline void my_ut_swap(UT_UCS2Char *& a, UT_UCS2Char *& b) { UT_UCS2Char * t = a; a = b; b = t; } static inline void my_ut_swap(UT_UCS4Char *& a, UT_UCS4Char *& b) { UT_UCS4Char * t = a; a = b; b = t; } ////////////////////////////////////////////////////////////////// static const float g_rGrowBy = 1.5; static inline size_t priv_max(size_t a, size_t b) { return a < b ? b : a; } //////////////////////////////////////////////////////////////////////// // // 8-bit string // // String is built of 8-bit units (bytes) // Encoding could be any single-byte or multi-byte encoding // //////////////////////////////////////////////////////////////////////// UT_Stringbuf::UT_Stringbuf() : m_psz(0), m_pEnd(0), m_size(0) { } UT_Stringbuf::UT_Stringbuf(const UT_Stringbuf& rhs) : m_psz(new char_type[rhs.capacity()]), m_pEnd(m_psz + rhs.size()), m_size(rhs.capacity()) { copy(m_psz, rhs.m_psz, rhs.capacity()); } UT_Stringbuf::UT_Stringbuf(const char_type* sz, size_t n) : m_psz(new char_type[n+1]), m_pEnd(m_psz + n), m_size(n+1) { copy(m_psz, sz, n); m_psz[n] = 0; } UT_Stringbuf::~UT_Stringbuf() { clear(); } void UT_Stringbuf::operator=(const UT_Stringbuf& rhs) { if (this != &rhs) { clear(); assign(rhs.m_psz, rhs.size()); } } void UT_Stringbuf::assign(const char_type* sz, size_t n) { if (n) { if (n >= capacity()) { grow_nocopy(n); } copy(m_psz, sz, n); m_psz[n] = 0; m_pEnd = m_psz + n; } else { clear(); } } void UT_Stringbuf::append(const char_type* sz, size_t n) { if (!n) { return; } if (!capacity()) { assign(sz, n); return; } const size_t nLen = size(); grow_copy(nLen + n); copy(m_psz + nLen, sz, n); m_psz[nLen + n] = 0; m_pEnd += n; } void UT_Stringbuf::append(const UT_Stringbuf& rhs) { append(rhs.m_psz, rhs.size()); } void UT_Stringbuf::swap(UT_Stringbuf& rhs) { my_ut_swap(m_psz , rhs.m_psz ); my_ut_swap(m_pEnd, rhs.m_pEnd); my_ut_swap(m_size, rhs.m_size); } void UT_Stringbuf::clear() { if (m_psz) { delete[] m_psz; m_psz = 0; m_pEnd = 0; m_size = 0; } } void UT_Stringbuf::reserve(size_t n) { grow_nocopy(n); } void UT_Stringbuf::grow_nocopy(size_t n) { grow_common(n, false); } void UT_Stringbuf::grow_copy(size_t n) { grow_common(n, true); } void UT_Stringbuf::grow_common(size_t n, bool bCopy) { ++n; // allow for zero termination if (n > capacity()) { const size_t nCurSize = size(); n = priv_max(n, static_cast(nCurSize * g_rGrowBy)); char_type* pNew = new char_type[n]; if (bCopy && m_psz) { copy(pNew, m_psz, size() + 1); } delete[] m_psz; m_psz = pNew; m_pEnd = m_psz + nCurSize; m_size = n; } } void UT_Stringbuf::copy(char_type* pDest, const char_type* pSrc, size_t n) { if (pDest && pSrc && n) memcpy(pDest, pSrc, n * sizeof(char_type)); } //////////////////////////////////////////////////////////////////////// // // UTF-8 string: encoding is *always* UTF-8 // //////////////////////////////////////////////////////////////////////// UT_UTF8Stringbuf::UT_UTF8Stringbuf () : m_psz(0), m_pEnd(0), m_strlen(0), m_buflen(0) { // } UT_UTF8Stringbuf::UT_UTF8Stringbuf (const UT_UTF8Stringbuf & rhs) : m_psz(0), m_pEnd(0), m_strlen(0), m_buflen(0) { append (rhs); } UT_UTF8Stringbuf::UT_UTF8Stringbuf (const char * sz, size_t n /* == 0 => null-termination */) : m_psz(0), m_pEnd(0), m_strlen(0), m_buflen(0) { append (sz, n); } UT_UTF8Stringbuf::~UT_UTF8Stringbuf () { clear (); } void UT_UTF8Stringbuf::operator=(const UT_UTF8Stringbuf & rhs) { m_pEnd = m_psz; m_strlen = 0; append (rhs); } void UT_UTF8Stringbuf::assign (const char * sz, size_t n /* == 0 => null-termination */) { m_pEnd = m_psz; m_strlen = 0; append (sz, n); } // returns 0 if invalid, or if end of string, i.e. 0 // technically it could differentiate, since UCS-4 is only 31-bit, but... UT_UTF8Stringbuf::UCS4Char UT_UTF8Stringbuf::charCode (const char * str) { if ( str == 0) return 0; if (*str == 0) return 0; const char * p = str; if ((*p & 0x80) == 0x00) // plain us-ascii part of latin-1 { return (UCS4Char) (*p); } UCS4Char ret_code = 0; int bytesInSequence = 0; int bytesExpectedInSequence = 0; while (*p) { // 'continuing' octets: if ((*p & 0xc0) == 0x80) // trailing byte in multi-byte sequence { if (bytesInSequence == 0) break; bytesInSequence++; ret_code = (ret_code << 6) | (UCS4Char) (*p & 0x3f); if (bytesInSequence == bytesExpectedInSequence) break; p++; continue; } if (bytesInSequence) break; bytesInSequence++; /* 4,5,6-byte sequences may require > 2 bytes in UCS-4 */ if ((*p & 0xfe) == 0xfc) // lead byte in 6-byte sequence { bytesExpectedInSequence = 6; ret_code = (UCS4Char) (*p & 0x01); p++; continue; } if ((*p & 0xfc) == 0xf8) // lead byte in 5-byte sequence { bytesExpectedInSequence = 5; ret_code = (UCS4Char) (*p & 0x03); p++; continue; } if ((*p & 0xf8) == 0xf0) // lead byte in 4-byte sequence { bytesExpectedInSequence = 4; ret_code = (UCS4Char) (*p & 0x07); p++; continue; } /* 1,2,3-byte sequences do not require > 2 bytes in UCS-4 */ if ((*p & 0xf0) == 0xe0) // lead byte in 3-byte sequence { bytesExpectedInSequence = 3; ret_code = (UCS4Char) (*p & 0x0f); p++; continue; } if ((*p & 0xe0) == 0xc0) // lead byte in 2-byte sequence { bytesExpectedInSequence = 2; ret_code = (UCS4Char) (*p & 0x1f); p++; continue; } ret_code = 0; break; // invalid byte - not UTF-8 } if (bytesInSequence != bytesExpectedInSequence) ret_code = 0; return ret_code; } void UT_UTF8Stringbuf::append (const char * sz, size_t n /* == 0 => null-termination */) { if (sz == 0) return; if (!grow ((n?n:strlen(sz)) + 1)) return; const char * p = sz; char buf[6]; int bytesInSequence = 0; int bytesExpectedInSequence = 0; size_t np = 0; while ((!n && *p) || (np < n)) { if ((*p & 0x80) == 0x00) // plain us-ascii part of latin-1 { if (bytesInSequence) break; *m_pEnd++ = *p; *m_pEnd = 0; m_strlen++; p++; np++; continue; } // 'continuing' octets: if ((*p & 0xc0) == 0x80) // trailing byte in multi-byte sequence { if (bytesInSequence == 0) break; buf[bytesInSequence++] = *p; if (bytesInSequence == bytesExpectedInSequence) { for (int b = 0; b < bytesInSequence; b++) *m_pEnd++ = buf[b]; *m_pEnd = 0; m_strlen++; bytesInSequence = 0; bytesExpectedInSequence = 0; } p++; np++; continue; } if (bytesInSequence) break; buf[bytesInSequence++] = *p; /* 4,5,6-byte sequences may require > 2 bytes in UCS-4 */ if ((*p & 0xfe) == 0xfc) // lead byte in 6-byte sequence { bytesExpectedInSequence = 6; p++; np++; continue; } if ((*p & 0xfc) == 0xf8) // lead byte in 5-byte sequence { bytesExpectedInSequence = 5; p++; np++; continue; } if ((*p & 0xf8) == 0xf0) // lead byte in 4-byte sequence { bytesExpectedInSequence = 4; p++; np++; continue; } /* 1,2,3-byte sequences do not require > 2 bytes in UCS-4 */ if ((*p & 0xf0) == 0xe0) // lead byte in 3-byte sequence { bytesExpectedInSequence = 3; p++; np++; continue; } if ((*p & 0xe0) == 0xc0) // lead byte in 2-byte sequence { bytesExpectedInSequence = 2; p++; np++; continue; } break; // invalid byte - not UTF-8 } } void UT_UTF8Stringbuf::append (const UT_UTF8Stringbuf & rhs) { if (grow (rhs.byteLength () + 1)) { memcpy (m_pEnd, rhs.data (), rhs.byteLength ()); m_strlen += rhs.utf8Length (); m_pEnd = m_pEnd + rhs.byteLength (); *m_pEnd = 0; } } void UT_UTF8Stringbuf::appendUCS4 (const UT_UCS4Char * sz, size_t n /* == 0 => null-termination */) { size_t bytelength = 0; size_t i; for (i = 0; (i < n) || (n == 0); i++) { int seql = UT_UCS4Stringbuf::UTF8_ByteLength (sz[i]); if (seql < 0) continue; // not UCS-4 !! if (seql == 0) break; // end-of-string? bytelength += static_cast(seql); } if (!grow (bytelength + 1)) return; for (i = 0; (i < n) || (n == 0); i++) { int seql = UT_UCS4Stringbuf::UTF8_ByteLength (sz[i]); if (seql < 0) continue; // not UCS-4 !! if (seql == 0) break; // end-of-string? UT_UCS4Stringbuf::UCS4_to_UTF8 (m_pEnd, bytelength, sz[i]); m_strlen++; } *m_pEnd = 0; } void UT_UTF8Stringbuf::appendUCS2 (const UT_UCS2Char * sz, size_t n /* == 0 => null-termination */) { size_t bytelength = 0; size_t i; for (i = 0; (i < n) || (n == 0); i++) { int seql = UT_UCS4Stringbuf::UTF8_ByteLength ((UT_UCS4Char)sz[i]); if (seql < 0) continue; // not UCS-4 !! if (seql == 0) break; // end-of-string? bytelength += static_cast(seql); } if (!grow (bytelength + 1)) return; for (i = 0; (i < n) || (n == 0); i++) { int seql = UT_UCS4Stringbuf::UTF8_ByteLength ((UT_UCS4Char)sz[i]); if (seql < 0) continue; // not UCS-4 !! if (seql == 0) break; // end-of-string? UT_UCS4Stringbuf::UCS4_to_UTF8 (m_pEnd, bytelength, (UT_UCS4Char)sz[i]); m_strlen++; } *m_pEnd = 0; } /* replaces with in the current string */ void UT_UTF8Stringbuf::escape (const UT_UTF8String & utf8_str1, const UT_UTF8String & utf8_str2) { size_t diff = 0; size_t len1 = utf8_str1.byteLength (); size_t len2 = utf8_str2.byteLength (); const char * str1 = utf8_str1.utf8_str (); const char * str2 = utf8_str2.utf8_str (); if (len2 > len1) { diff = len2 - len1; size_t incr = 0; char * ptr = m_psz; while (ptr + len1 <= m_pEnd) { if (memcmp (ptr, str1, len1) == 0) { incr += diff; ptr += len1; } else { ++ptr; } } if (!grow (incr)) return; } else { diff = len1 - len2; } char * ptr = m_psz; while (ptr + len1 <= m_pEnd) { if (memcmp (ptr, str1, len1) == 0) { if (diff) { if (len2 > len1) { memmove (ptr + diff, ptr, m_pEnd - ptr + 1); m_pEnd += diff; } else { memmove (ptr, ptr + diff, m_pEnd - (ptr + diff) + 1); m_pEnd -= diff; } } memcpy (ptr, str2, len2); ptr += len2; m_strlen += utf8_str2.length () - utf8_str1.length (); } else { ++ptr; } } } /* escapes '<', '>' & '&' in the current string */ void UT_UTF8Stringbuf::escapeXML () { size_t incr = 0; char * ptr = m_psz; while (ptr < m_pEnd) { if ((*ptr == '<') || (*ptr == '>')) incr += 3; else if (*ptr == '&') incr += 4; else if (*ptr == '"') incr += 5; ptr++; } bool bInsert = grow (incr); ptr = m_psz; while (ptr < m_pEnd) { if (*ptr == '<') { if (bInsert) { *ptr++ = '&'; insert (ptr, "lt;", 3); } else *ptr++ = '?'; } else if (*ptr == '>') { if (bInsert) { *ptr++ = '&'; insert (ptr, "gt;", 3); } else *ptr++ = '?'; } else if (*ptr == '&') { if (bInsert) { *ptr++ = '&'; insert (ptr, "amp;", 4); } else *ptr++ = '?'; } else if (*ptr == '"') { if (bInsert) { *ptr++ = '&'; insert (ptr, "quot;", 5); } else *ptr++ = '?'; } else ptr++; } } /* this function escapes the string to provide for conformity with http://www.w3.org/TR/xlink/#link-locators, section 5.4 */ void UT_UTF8Stringbuf::escapeURL () { if(!m_psz || !*m_psz) return; // now work out how many exra characters we will need // need to do this first of all, since growing the string will invalidate all pointers UTF8Iterator I(this); UT_UCS4Char c; UT_uint32 iIncrease = 0; for(c = charCode(I.current()); c != 0; c = charCode(I.advance())) { UT_sint32 iByteLen = UT_UCS4Stringbuf::UTF8_ByteLength(c); if(iByteLen > 1) iIncrease += iByteLen; else if(c <= 0x20 || c > 0x7e || (!isalnum(c) && !strchr("$-_.+!*'(),", c))) iIncrease += 2; } grow(iIncrease); UT_uint32 iScheme = 0; if(!UT_strnicmp(m_psz, "ftp://", 6)) iScheme = 1; else if(!UT_strnicmp(m_psz, "http://", 7)) iScheme = 2; else if(!UT_strnicmp(m_psz, "gopher://", 9)) iScheme = 3; else if(!UT_strnicmp(m_psz, "mailto:", 7)) iScheme = 4; else if(!UT_strnicmp(m_psz, "news:", 5)) iScheme = 5; else if(!UT_strnicmp(m_psz, "nntp://", 7)) iScheme = 6; else if(!UT_strnicmp(m_psz, "telnet://", 9)) iScheme = 7; else if(!UT_strnicmp(m_psz, "wais://", 7)) iScheme = 8; else if(!UT_strnicmp(m_psz, "file://", 7)) iScheme = 9; else if(!UT_strnicmp(m_psz, "prospero://", 11)) iScheme = 10; // now we parse the string into its constituent parts char * p = strstr(m_psz, "://"); char * schm = NULL; char * user = NULL; char * pswd = NULL; char * host = NULL; char * port = NULL; char * last_quest = NULL; char * last_hash = NULL; char * last_slash = NULL; if(p) { user = p + 3; schm = user; p = strchr(p+3, '/'); } else if(iScheme == 4) { p = m_psz + 7; } else if(iScheme == 5) { p = m_psz + 5; } char * urlpath = p ? p : m_psz; if(urlpath != m_psz && iScheme != 4 && iScheme != 5) { *urlpath = 0; char * at = strrchr(user, '@'); if(!at) { user = NULL; } else { host = at + 1; port = strchr(host, ':'); if(port) port++; *at = 0; pswd = strchr(user, ':'); if(pswd) pswd++; *at = '@'; } *urlpath = '/'; } // find out the last /, ? and # -- we need these to work out if ?#& should be escaped // in http or not last_slash = strrchr(urlpath, '/'); last_quest = strrchr(urlpath, '?'); last_hash = strrchr(urlpath, '#'); if(last_quest < last_slash) last_quest = NULL; // this is not a query questionmark if(last_hash < last_slash) last_hash = NULL; char buff[30]; UTF8Iterator J(this); for(c = charCode(J.current()); c != 0; c = charCode(J.advance())) { char * p = (char*) J.current(); UT_sint32 iByteLen = UT_UCS4Stringbuf::UTF8_ByteLength(c); if (iByteLen > 1) // mutlibyte in utf-8; each byte is to be encoded { char bytes[20]; bytes[0] = 0; UT_sint32 j; for(j = 0; j < iByteLen; ++j) { UT_uint32 v = (unsigned char)p[j]; snprintf(buff, 30, "%%%02x", v); strcat(bytes,buff); } char * b = bytes; for(j = 0; j < iByteLen; ++j) { *p++ = *b++; } insert(p, b, strlen(b)); for(j = 0; j < iByteLen; ++j) { J.advance(); J.advance(); J.advance(); } J.retreat(); } else if(// all single byte chars that always have to be encoded (c <= 0x20 || c > 0x7e || (!isalnum(c) && !strchr("$-_.+!*'(),;/?:@=&#", c))) // between the path element and the scheme marker all reserved chars other than @ and : also need to // be encode || (p < urlpath && p >= schm && strchr(";/?=&#",c)) // in user name and pswd, colons and @ have to be encoded || ((user && host && p >= user && p < host - 1) && ((c == ':' && (!pswd || p != pswd - 1)) || c == '@')) // in the host part we also encode @ || (c == '@' && p >= host && p < urlpath) // in url paths, the requirements are scheme-specific // http scheme: "/?;" are reserved; encode all # other than the fragment marker, // all = before the parameter ? as well as all :, @, & || (p > urlpath && ((iScheme == 0 || iScheme == 2) && ((c=='?' && p!=last_quest) || (c=='#' && p!=last_hash) || (c=='=' && p urlpath && (iScheme == 4 && strchr(";?:@=&#/",c))) // news, only @ is reserved || (p > urlpath && (iScheme == 5 && strchr(";?:=&#/",c))) // in all other schemes we escape the reserved characters except / || (p > urlpath && (iScheme != 0 && iScheme != 2 && iScheme != 4 && iScheme != 5) && strchr(";?:@=&#", c))) { UT_return_if_fail( p ); // we have to adjust any pointers we keep in line with the insertion if(last_quest >= p) last_quest += 2; if(last_hash >= p) last_hash += 2; if(last_slash >= p) last_slash += 2; if(host >= p) host += 2; if(pswd >= p) pswd += 2; if(user >= p) user += 2; if(port >= p) port += 2; UT_uint32 v = *p; snprintf(buff, 30, "%02x", v); *p++ = '%'; insert(p, buff, strlen(buff)); // move past the two new chars J.advance(); J.advance(); } } } /* decode %xx encoded characters */ static UT_uint32 s_charCode_to_hexval(UT_UCS4Char c) { if(c >= 0x30 && c <= 0x39) return c - 0x30; else if(c >= 0x41 && c <= 0x46) return c - 0x41 + 10; else if(c >= 0x61 && c <= 0x66) return c - 0x61 + 10; UT_return_val_if_fail( UT_SHOULD_NOT_HAPPEN, 0 ); } void UT_UTF8Stringbuf::decodeURL() { if(!m_psz || !*m_psz) return; char * buff = (char*)malloc(byteLength() + 1); UT_return_if_fail( buff ); buff[0] = 0; UTF8Iterator J(this); const char * ptr = J.current(); UT_UCS4Char c = charCode(J.current()); char utf8cache[7]; utf8cache[6] = 0; UT_uint32 iCachePos = 0; UT_uint32 iCacheNeeded = 0; while (c != 0) { if(c == '%') { J.advance(); UT_UCS4Char b1 = charCode(J.current()); J.advance(); UT_UCS4Char b2 = charCode(J.current()); J.advance(); if(isalnum(b1) && isalnum(b2)) { b1 = s_charCode_to_hexval(b1); b2 = s_charCode_to_hexval(b2); UT_UCS4Char code = ((b1 << 4)& 0xf0) | (b2 & 0x0f); if(iCacheNeeded == 0) { // we start new utf8 sequence in the cache if ((code & 0x80) == 0) iCacheNeeded = 1; else if ((code & 0xe0) == 0xc0) iCacheNeeded = 2; else if ((code & 0xf0) == 0xe0) iCacheNeeded = 3; else if ((code & 0xf8) == 0xf0) iCacheNeeded = 4; else if ((code & 0xfc) == 0xf8) iCacheNeeded = 5; else if ((code & 0xfe) == 0xfc) iCacheNeeded = 6; utf8cache[0] = (char) code; utf8cache[iCacheNeeded] = 0; // make sure the sequence will be terminated iCachePos++; } else { // append to our cache utf8cache[iCachePos++] = (char) code; } if(iCacheNeeded == 0 && (code >= 0x7f && code <= 0xff)) { // the present character is not a valid start of utf8 sequence -- // this is almost certainly a character from the extended ASCII set // which was encoded directly according to the RFC 1738 scheme, we // just append it size_t iLenBuff = strlen(buff); size_t iLenLeft = byteLength() - iLenBuff; char * p = buff + iLenBuff; UT_UCS4Stringbuf::UCS4_to_UTF8(p, iLenLeft, code); // we need to null-terminate *p = 0; } if(iCacheNeeded && iCacheNeeded <= iCachePos) { UT_ASSERT_HARMLESS( iCacheNeeded == iCachePos ); // append the cache to our buffer UT_uint32 iLenBuff = strlen(buff); char * p = buff + iLenBuff; strcat(p, utf8cache); iCacheNeeded = iCachePos = 0; } } else { // this should not happen in encoded url and so we will ignore this token; // if we are in the middle of utf8 sequence; we will reset it iCacheNeeded = iCachePos = 0; } } else { J.advance(); // advance here, for the sake of the else clause below if(iCacheNeeded > iCachePos) { // we are processing a utf sequence, so just append this byte to our cache utf8cache[iCachePos++] = (char) c; } else { const char * p = J.current(); UT_uint32 iLen = p ? p - ptr : strlen(ptr); strncat(buff, ptr, iLen); } } ptr = J.current(); c = charCode(J.current()); } assign(buff); free(buff); } /* translates the current string to MIME "quoted-printable" format */ void UT_UTF8Stringbuf::escapeMIME () { static const char hex[16] = { '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F' }; static const char * s_eol = "=\r\n"; if (m_strlen == 0) return; size_t bytes = 0; char * ptr = m_psz; while (*ptr) { char c = *ptr++; unsigned char u = static_cast(c); if ((c == '\r') || (c == '\n') || (c == '=') || (u & 0x80)) bytes += 2; } if (bytes) { if (!grow (bytes)) return; char * pOld = m_pEnd; char * pNew = m_pEnd + bytes; while (pOld >= m_psz) { char c = *pOld--; unsigned char u = static_cast(c); if ((u & 0x80) || (c == '\r') || (c == '\n') || (c == '=')) { *pNew-- = hex[ u & 0x0f]; *pNew-- = hex[(u >> 4) & 0x0f]; *pNew-- = '='; } else *pNew-- = c; } m_pEnd += bytes; m_strlen = m_pEnd - m_psz; } size_t length = 0; ptr = m_psz; while (true) { if (*ptr == 0) { if (length) { size_t offset = ptr - m_psz; if (grow (3)) { ptr = m_psz + offset; insert (ptr, s_eol, 3); } } break; } if (length >= 70) { size_t offset = ptr - m_psz; if (grow (3)) { ptr = m_psz + offset; insert (ptr, s_eol, 3); } length = 0; } if (*ptr == '=') { ptr += 3; length += 3; } else { ptr++; length++; } } } UT_UTF8Stringbuf * UT_UTF8Stringbuf::lowerCase () { if(!byteLength()) return NULL; UT_UTF8Stringbuf * n = new UT_UTF8Stringbuf(); UT_return_val_if_fail(n, NULL); UTF8Iterator s(this); UT_UCS4Char c = charCode(s.current()); while(c) { UT_UCS4Char l = UT_UCS4_tolower(c); n->appendUCS4(&l,1); c = charCode(s.advance()); } return n; } void UT_UTF8Stringbuf::clear () { if (m_psz) free (m_psz); m_psz = 0; m_pEnd = 0; m_strlen = 0; m_buflen = 0; } void UT_UTF8Stringbuf::insert (char *& ptr, const char * str, size_t utf8length) { if ( str == 0) return; if (*str == 0) return; if ((ptr < m_psz) || (ptr > m_pEnd)) return; char * orig_buf = m_psz; char * orig_ptr = ptr; size_t length = static_cast(strlen(str)); if (!grow (length)) return; ptr = m_psz + (orig_ptr - orig_buf); memmove (ptr + length, ptr, (m_pEnd - ptr) + 1); memcpy (ptr, str, length); ptr += length; m_pEnd += length; m_strlen += utf8length; } void UT_UTF8Stringbuf::reserve(size_t n) { grow(n); } bool UT_UTF8Stringbuf::grow (size_t length) { if (length + 1 <= (m_buflen - (m_pEnd - m_psz))) return true; if (m_psz == 0) { if (length == 0) return true; m_psz = static_cast(malloc(length)); if (m_psz == 0) return false; m_strlen = 0; m_buflen = length; m_pEnd = m_psz; *m_pEnd = 0; return true; } size_t new_length = length + (m_pEnd - m_psz) + 1; size_t end_offset = m_pEnd - m_psz; char * more = static_cast(realloc(static_cast(m_psz), new_length)); if (more == 0) return false; m_psz = more; m_pEnd = m_psz + end_offset; m_buflen = new_length; return true; } UT_UTF8Stringbuf::UTF8Iterator::UTF8Iterator (const UT_UTF8Stringbuf * strbuf) : m_strbuf(strbuf), m_utfbuf(0), m_utfptr(0) { sync (); } UT_UTF8Stringbuf::UTF8Iterator::~UTF8Iterator () { // } void UT_UTF8Stringbuf::UTF8Iterator::operator=(const char * position) { if (!sync ()) return; if (static_cast(position- m_utfbuf) > m_strbuf->byteLength ()) { m_utfptr = m_utfbuf + m_strbuf->byteLength (); } else { m_utfptr = position; } } const char * UT_UTF8Stringbuf::UTF8Iterator::current () { if (!sync ()) return 0; if ((*m_utfptr & 0xc0) == 0x80) return 0; // oops - a 'continuing' byte return m_utfptr; } const char * UT_UTF8Stringbuf::UTF8Iterator::start () { if (!sync ()) return 0; return m_utfbuf; } const char * UT_UTF8Stringbuf::UTF8Iterator::end () { if (!sync ()) return 0; return m_utfbuf + m_strbuf->byteLength (); } const char * UT_UTF8Stringbuf::UTF8Iterator::advance () { if (!sync ()) return 0; if (*m_utfptr == 0) return 0; do m_utfptr++; while ((*m_utfptr & 0xc0) == 0x80); // a 'continuing' byte return m_utfptr; } const char * UT_UTF8Stringbuf::UTF8Iterator::retreat () { if (!sync ()) return 0; if (m_utfptr == m_utfbuf) return 0; do m_utfptr--; while ((*m_utfptr & 0xc0) == 0x80); // a 'continuing' byte return m_utfptr; } // returns false only if there is no string data bool UT_UTF8Stringbuf::UTF8Iterator::sync () { if (m_strbuf == 0) return false; const char * utf8_buffer = m_strbuf->data (); if (utf8_buffer == 0) { m_utfbuf = 0; m_utfptr = 0; return false; } size_t utf8_length = m_strbuf->byteLength (); /* note that this doesn't guarantee that m_utfptr points to the * start of UTF-8 char sequence */ if (static_cast(m_utfptr- m_utfbuf) > utf8_length) { m_utfptr = utf8_buffer + utf8_length; } else { m_utfptr = utf8_buffer + (m_utfptr - m_utfbuf); } m_utfbuf = utf8_buffer; return true; } //////////////////////////////////////////////////////////////////////// // // UCS-4 string // // String is built of 32-bit units (longs) // // NOTE: Ambiguity between UCS-2 and UTF-16 above makes no difference // NOTE: in the case of UCS-4 and UTF-32 since they really are // NOTE: identical // //////////////////////////////////////////////////////////////////////// /* scans a buffer for the next valid UTF-8 sequence and returns the corresponding * UCS-4 value for that sequence; the pointer and length-remaining are incremented * and decremented respectively; returns 0 if no valid UTF-8 sequence found by the * end of the string */ UT_UCS4Char UT_UCS4Stringbuf::UTF8_to_UCS4 (const char *& buffer, size_t & length) { UT_UCS4Char ucs4; while (true) { ucs4 = 0; if (length == 0) break; unsigned char c = static_cast(*buffer); buffer++; length--; if ((c & 0x80) == 0) { // ascii, single-byte sequence ucs4 = static_cast(c); break; } if ((c & 0xc0) == 0x80) { // hmm, continuing byte - let's just ignore it continue; } /* we have a multi-byte sequence... */ size_t seql; if ((c & 0xe0) == 0xc0) { seql = 2; ucs4 = static_cast(c & 0x1f); } else if ((c & 0xf0) == 0xe0) { seql = 3; ucs4 = static_cast(c & 0x0f); } else if ((c & 0xf8) == 0xf0) { seql = 4; ucs4 = static_cast(c & 0x07); } else if ((c & 0xfc) == 0xf8) { seql = 5; ucs4 = static_cast(c & 0x03); } else if ((c & 0xfe) == 0xfc) { seql = 6; ucs4 = static_cast(c & 0x01); } else { // or perhaps we don't :-( - whatever it is, let's just ignore it continue; } if (length < seql - 1) { // huh? broken sequence perhaps? anyway, let's just ignore it continue; } bool okay = true; for (size_t i = 1; i < seql; i++) { c = static_cast(*buffer); buffer++; length--; if ((c & 0xc0) != 0x80) { // not a continuing byte? grr! okay = false; break; } ucs4 = ucs4 << 6 | static_cast(c & 0x3f); } if (okay) break; } return ucs4; } /* Returns -1 if ucs4 is not valid UCS-4, 0 if ucs4 is 0, 1-6 otherwise */ int UT_UCS4Stringbuf::UTF8_ByteLength (UT_UCS4Char u) { if ((u & 0x7fffffff) != u) return -1; // UCS-4 is only 31-bit! if (u == 0) return 0; // end-of-string if ((u & 0x7fffff80) == 0) return 1; if ((u & 0x7ffff800) == 0) return 2; if ((u & 0x7fff0000) == 0) return 3; if ((u & 0x7fe00000) == 0) return 4; if ((u & 0x7c000000) == 0) return 5; return 6; } /* appends to the buffer the UTF-8 sequence corresponding to the UCS-4 value; * the pointer and length-remaining are incremented and decremented respectively; * returns false if not valid UCS-4 or if (length < UTF8_ByteLength (ucs4)) */ bool UT_UCS4Stringbuf::UCS4_to_UTF8 (char *& buffer, size_t & length, UT_UCS4Char ucs4) { int seql = UT_UCS4Stringbuf::UTF8_ByteLength (ucs4); if (seql < 0) return false; if (seql == 0) { if (length == 0) return false; *buffer++ = 0; length--; return true; } if (length < static_cast(seql)) return false; length -= seql; switch (seql) { case 1: *buffer++ = static_cast(static_cast(ucs4 & 0x7f)); break; case 2: *buffer++ = static_cast(0xc0 | static_cast((ucs4 >> 6) & 0x1f)); *buffer++ = static_cast(0x80 | static_cast(ucs4 & 0x3f)); break; case 3: *buffer++ = static_cast(0xe0 | static_cast((ucs4 >> 12) & 0x0f)); *buffer++ = static_cast(0x80 | static_cast((ucs4 >> 6) & 0x3f)); *buffer++ = static_cast(0x80 | static_cast(ucs4 & 0x3f)); break; case 4: *buffer++ = static_cast(0xf0 | static_cast((ucs4 >> 18) & 0x07)); *buffer++ = static_cast(0x80 | static_cast((ucs4 >> 12) & 0x3f)); *buffer++ = static_cast(0x80 | static_cast((ucs4 >> 6) & 0x3f)); *buffer++ = static_cast(0x80 | static_cast(ucs4 & 0x3f)); break; case 5: *buffer++ = static_cast(0xf8 | static_cast((ucs4 >> 24) & 0x03)); *buffer++ = static_cast(0x80 | static_cast((ucs4 >> 18) & 0x3f)); *buffer++ = static_cast(0x80 | static_cast((ucs4 >> 12) & 0x3f)); *buffer++ = static_cast(0x80 | static_cast((ucs4 >> 6) & 0x3f)); *buffer++ = static_cast(0x80 | static_cast(ucs4 & 0x3f)); break; case 6: *buffer++ = static_cast(0xfc | static_cast((ucs4 >> 30) & 0x01)); *buffer++ = static_cast(0x80 | static_cast((ucs4 >> 24) & 0x3f)); *buffer++ = static_cast(0x80 | static_cast((ucs4 >> 18) & 0x3f)); *buffer++ = static_cast(0x80 | static_cast((ucs4 >> 12) & 0x3f)); *buffer++ = static_cast(0x80 | static_cast((ucs4 >> 6) & 0x3f)); *buffer++ = static_cast(0x80 | static_cast(ucs4 & 0x3f)); break; default: // huh? UT_ASSERT(UT_SHOULD_NOT_HAPPEN); break; } return true; } UT_UCS4Stringbuf::UT_UCS4Stringbuf() : m_psz(0), m_pEnd(0), m_size(0), m_utf8string(0) { } UT_UCS4Stringbuf::UT_UCS4Stringbuf(const UT_UCS4Stringbuf& rhs) : m_psz(new char_type[rhs.capacity()]), m_pEnd(m_psz + rhs.size()), m_size(rhs.capacity()), m_utf8string(0) { copy(m_psz, rhs.m_psz, rhs.capacity()); } UT_UCS4Stringbuf::UT_UCS4Stringbuf(const char_type* sz, size_t n) : m_psz(new char_type[n+1]), m_pEnd(m_psz + n), m_size(n+1), m_utf8string(0) { copy(m_psz, sz, n); m_psz[n] = 0; } UT_UCS4Stringbuf::~UT_UCS4Stringbuf() { clear(); } void UT_UCS4Stringbuf::operator=(const UT_UCS4Stringbuf& rhs) { if (this != &rhs) { clear(); assign(rhs.m_psz, rhs.size()); } } void UT_UCS4Stringbuf::assign(const char_type* sz, size_t n) { if (m_utf8string) // buffered internal UTF-8 string is invalid { delete[] m_utf8string; m_utf8string = 0; } if (n) { if (n >= capacity()) { grow_nocopy(n); } copy(m_psz, sz, n); m_psz[n] = 0; m_pEnd = m_psz + n; } else { clear(); } } void UT_UCS4Stringbuf::append(const char_type* sz, size_t n) { if (!n) { return; } if (!capacity()) { assign(sz, n); return; } if (m_utf8string) // buffered internal UTF-8 string is invalid { delete[] m_utf8string; m_utf8string = 0; } const size_t nLen = size(); grow_copy(nLen + n); copy(m_psz + nLen, sz, n); m_psz[nLen + n] = 0; m_pEnd += n; } void UT_UCS4Stringbuf::append(const UT_UCS4Stringbuf& rhs) { append(rhs.m_psz, rhs.size()); } void UT_UCS4Stringbuf::swap(UT_UCS4Stringbuf& rhs) { my_ut_swap(m_psz , rhs.m_psz ); my_ut_swap(m_pEnd, rhs.m_pEnd); my_ut_swap(m_size, rhs.m_size); my_ut_swap(m_utf8string, rhs.m_utf8string); } void UT_UCS4Stringbuf::clear() { if (m_psz) { delete[] m_psz; m_psz = 0; m_pEnd = 0; m_size = 0; } if (m_utf8string) { delete[] m_utf8string; m_utf8string = 0; } } const char* UT_UCS4Stringbuf::utf8_data() { if (m_utf8string) return m_utf8string; size_t utf8length = size (); size_t bytelength = 0; size_t i; for (i = 0; i < utf8length; i++) { int seql = UT_UCS4Stringbuf::UTF8_ByteLength (m_psz[i]); if (seql < 0) continue; // not UCS-4 !! if (seql == 0) break; // huh? premature end-of-string? bytelength += static_cast(seql); } m_utf8string = new char[bytelength+1]; char * utf8string = m_utf8string; for (i = 0; i < utf8length; i++) { int seql = UT_UCS4Stringbuf::UTF8_ByteLength (m_psz[i]); if (seql < 0) continue; // not UCS-4 !! if (seql == 0) break; // huh? premature end-of-string? UT_UCS4Stringbuf::UCS4_to_UTF8 (utf8string, bytelength, m_psz[i]); } *utf8string = 0; return m_utf8string; } void UT_UCS4Stringbuf::reserve(size_t n) { grow_nocopy(n); } void UT_UCS4Stringbuf::grow_nocopy(size_t n) { grow_common(n, false); } void UT_UCS4Stringbuf::grow_copy(size_t n) { grow_common(n, true); } void UT_UCS4Stringbuf::grow_common(size_t n, bool bCopy) { ++n; // allow for zero termination if (n > capacity()) { const size_t nCurSize = size(); n = priv_max(n, static_cast(nCurSize * g_rGrowBy)); char_type* pNew = new char_type[n]; if (bCopy && m_psz) { copy(pNew, m_psz, size() + 1); } delete[] m_psz; m_psz = pNew; m_pEnd = m_psz + nCurSize; m_size = n; } } void UT_UCS4Stringbuf::copy(char_type* pDest, const char_type* pSrc, size_t n) { if(pSrc && pDest) memcpy(pDest, pSrc, n * sizeof(char_type)); }