From ac490ae623167145ae95ef6b009dc56995a002cb Mon Sep 17 00:00:00 2001 From: websnarf Date: Sun, 26 Jul 2015 22:33:37 -0700 Subject: [PATCH] Add basic UTF8 support. Fix warnings. Update copyrights. --- bstraux.c | 20 ++-- bstraux.h | 36 +++---- bstrlib.c | 21 ++-- bstrlib.txt | 118 +++++++++++++++++++++- bstrwrap.cpp | 4 +- bstrwrap.h | 4 +- buniutil.c | 270 +++++++++++++++++++++++++++++++++++++++++++++++++++ buniutil.h | 37 +++++++ test.cpp | 71 +++++++++++++- 9 files changed, 532 insertions(+), 49 deletions(-) create mode 100644 buniutil.c create mode 100644 buniutil.h diff --git a/bstraux.c b/bstraux.c index 208d72f..e9d9e5a 100644 --- a/bstraux.c +++ b/bstraux.c @@ -1,7 +1,9 @@ +#define _CRT_SECURE_NO_WARNINGS + /* * This source file is part of the bstring string library. This code was - * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source - * license and the GPL. Refer to the accompanying documentation for details + * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source + * license and the GPL. Refer to the accompanying documentation for details * on usage and license. */ @@ -9,7 +11,7 @@ * bstraux.c * * This file is not necessarily part of the core bstring library itself, but - * is just an auxilliary module which includes miscellaneous or trivial + * is just an auxilliary module which includes miscellaneous or trivial * functions. 
*/ @@ -956,11 +958,12 @@ bstring b, t; /* Double size, but deal with unusual case of numeric overflows */ - if ((m = b->mlen << 1) <= b->mlen && - (m = b->mlen + 1024) <= b->mlen && - (m = b->mlen + 16) <= b->mlen && - (m = b->mlen + 1) <= b->mlen) t = NULL; - else t = bfromcstralloc (m, ""); + if (b->mlen <= INT_MAX / 2) m = b->mlen << 1; + else if (b->mlen <= INT_MAX - 1024) m = b->mlen + 1024; + else if (b->mlen <= INT_MAX - 16) m = b->mlen + 16; + else if (b->mlen <= INT_MAX - 1) m = b->mlen + 1; + else return NULL; + t = bfromcstralloc (m, ""); if (t) memcpy (t->data, b->data, i); bSecureDestroy (b); /* Cleanse previous buffer */ @@ -1130,4 +1133,3 @@ void * parm; free (ws); return parm; } - diff --git a/bstraux.h b/bstraux.h index c00c7b7..aba8e71 100644 --- a/bstraux.h +++ b/bstraux.h @@ -1,7 +1,7 @@ /* * This source file is part of the bstring string library. This code was - * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source - * license and the GPL. Refer to the accompanying documentation for details + * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source + * license and the GPL. Refer to the accompanying documentation for details * on usage and license. */ @@ -9,7 +9,7 @@ * bstraux.h * * This file is not a necessary part of the core bstring library itself, but - * is just an auxilliary module which includes miscellaneous or trivial + * is just an auxilliary module which includes miscellaneous or trivial * functions. 
*/ @@ -24,7 +24,7 @@ extern "C" { #endif /* Safety mechanisms */ -#define bstrDeclare(b) bstring (b) = NULL; +#define bstrDeclare(b) bstring (b) = NULL; #define bstrFree(b) {if ((b) != NULL && (b)->slen >= 0 && (b)->mlen >= (b)->slen) { bdestroy (b); (b) = NULL; }} /* Backward compatibilty with previous versions of Bstrlib */ @@ -87,22 +87,22 @@ int bwsBuffLength (struct bwriteStream * stream, int sz); void * bwsClose (struct bwriteStream * stream); /* Security functions */ -#define bSecureDestroy(b) { \ -bstring bstr__tmp = (b); \ - if (bstr__tmp && bstr__tmp->mlen > 0 && bstr__tmp->data) { \ - (void) memset (bstr__tmp->data, 0, (size_t) bstr__tmp->mlen); \ - bdestroy (bstr__tmp); \ - } \ +#define bSecureDestroy(b) { \ +bstring bstr__tmp = (b); \ + if (bstr__tmp && bstr__tmp->mlen > 0 && bstr__tmp->data) { \ + (void) memset (bstr__tmp->data, 0, (size_t) bstr__tmp->mlen); \ + bdestroy (bstr__tmp); \ + } \ } -#define bSecureWriteProtect(t) { \ - if ((t).mlen >= 0) { \ - if ((t).mlen > (t).slen)) { \ - (void) memset ((t).data + (t).slen, 0, (size_t) (t).mlen - (t).slen); \ - } \ - (t).mlen = -1; \ - } \ +#define bSecureWriteProtect(t) { \ + if ((t).mlen >= 0) { \ + if ((t).mlen > (t).slen)) { \ + (void) memset ((t).data + (t).slen, 0, (size_t) (t).mlen - (t).slen); \ + } \ + (t).mlen = -1; \ + } \ } -extern bstring bSecureInput (int maxlen, int termchar, +extern bstring bSecureInput (int maxlen, int termchar, bNgetc vgetchar, void * vgcCtx); #ifdef __cplusplus diff --git a/bstrlib.c b/bstrlib.c index eb92ceb..6176a69 100644 --- a/bstrlib.c +++ b/bstrlib.c @@ -1559,9 +1559,9 @@ bstring auxf = (bstring) find; bstring auxr = (bstring) repl; if (b == NULL || b->data == NULL || find == NULL || - find->data == NULL || repl == NULL || repl->data == NULL || - pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen || - b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR; + find->data == NULL || repl == NULL || repl->data == NULL || + pos < 0 || find->slen 
<= 0 || b->mlen < 0 || b->slen > b->mlen || + b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR; if (pos > b->slen - find->slen) return BSTR_OK; /* Alias with find string */ @@ -1639,12 +1639,12 @@ bstring auxr = (bstring) repl; while ((pos = instr (b, pos, auxf)) >= 0) { if (slen >= mlen - 1) { - int sl, *t; - + int *t; + int vl; mlen += mlen; - sl = sizeof (int *) * mlen; + vl = sizeof (int *) * mlen; if (static_d == d) d = NULL; /* static_d cannot be realloced */ - if (mlen <= 0 || sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) { + if (mlen <= 0 || vl < mlen || NULL == (t = (int *) bstr__realloc (d, vl))) { ret = BSTR_ERR; goto done; } @@ -1683,8 +1683,7 @@ bstring auxr = (bstring) repl; } done:; - if (static_d == d) d = NULL; - bstr__free (d); + if (static_d != d) bstr__free (d); if (auxf != find) bdestroy (auxf); if (auxr != repl) bdestroy (auxr); return ret; @@ -1700,8 +1699,8 @@ int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos) { return findreplaceengine (b, find, repl, pos, binstr); } -/* int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, - * int pos) +/* int bfindreplacecaseless (bstring b, const_bstring find, + * const_bstring repl, int pos) * * Replace all occurrences of a find string, ignoring case, with a replace * string after a given point in a bstring. diff --git a/bstrlib.txt b/bstrlib.txt index d0f02f7..7cb2ede 100644 --- a/bstrlib.txt +++ b/bstrlib.txt @@ -588,9 +588,14 @@ test.cpp - C++ unit/regression test for bstrwrap.cpp bsafe.c - C runtime stubs to abort usage of unsafe C functions. bsafe.h - C header file for bsafe.c functions. -C projects need only include bstrlib.h and compile/link bstrlib.c to use the -bstring library. C++ projects need to additionally include bstrwrap.h and -compile/link bstrwrap.cpp. For both, there may be a need to make choices +utf8util.c - C implemention of generic utf8 parsing functions. 
+utf8util.h - C header file for generic utf8 parsing functions. +buniutil.c - C implementation of utf8 bstring packing and unpacking functions. +buniutil.h - C header file for utf8 bstring functions. + +C modules need only include bstrlib.h and compile/link bstrlib.c to use the +basic bstring library. C++ projects need to additionally include bstrwrap.h +and compile/link bstrwrap.cpp. For both, there may be a need to make choices about feature configuration as described in the "Configurable compilation options" in the section above. @@ -1977,6 +1982,112 @@ The macros =============================================================================== +Unicode functions +----------------- + + The two modules utf8util.c and buniutil.c implement basic functions for + parsing and collecting Unicode data in the UTF8 format. Unicode is + described by a sequence of "code points" which are values between 0 and + 1114111 inclusive mapped to symbol content corresponding to nearly all + the standardized scripts of the world. + + The semantics of Unicode code points is varied and complicated. The + base support of the better string library does not attempt to perform + any interpretation of these code points. The better string library + solely provides support for iterating through unicode code points, + appending and extracting code points to and from bstrings, and parsing + UTF8 and UTF16 from raw data. + + To use these functions compile and link utf8util.c and buniutil.c + + .......................................................................... + + extern void utf8IteratorInit (struct utf8Iterator* iter, + unsigned char* data, int slen); + + Initialize a unicode utf8 iterator to traverse an array of utf8 encoded + code points pointed to by data, with length slen from the start. The + iterator iter is only valid for as long as the array it points to + is valid and not modified. + + .......................................................................... 
+ + extern void utf8IteratorUninit (struct utf8Iterator* iter); + + Invalidate utf8 iterator. + + .......................................................................... + + extern cpUcs4 utf8IteratorGetNextCodePoint (struct utf8Iterator* iter, + cpUcs4 errCh); + + Parse code point the iterator is pointing at and advance the iterator to + the next code point. If the iterator was pointing at a valid code point + the code point is returned, otherwise, errCh will be returned. + + .......................................................................... + + extern cpUcs4 utf8IteratorGetCurrCodePoint (struct utf8Iterator* iter, + cpUcs4 errCh); + + Parse code point the iterator is pointing at. If the iterator was + pointing at a valid code point the code point is returned, otherwise, + errCh will be returned. + + .......................................................................... + + extern int utf8ScanBackwardsForCodePoint (unsigned char* msg, int len, + int pos, cpUcs4* out); + + From the position "pos" in the array msg of length len, search for the + last position before or at pos from which a valid Unicode code + point can be parsed. If such an offset is found it is returned otherwise + a negative value is returned. The code point parsed is put into *out if + it is not NULL. + + .......................................................................... + + extern int buIsUTF8Content (const_bstring bu); + + Scan a bstring and determine if it is made entirely of valid unicode + code points. If it is, 1 is returned, otherwise 0 is returned. + + .......................................................................... + + extern int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, + cpUcs4 errCh); + + Append the code points passed in the UCS4 format (raw numbers) in the + array bu of length len. Any unparsable characters are replaced by errCh. + If errCh is not a valid Unicode code point, then parsing errors will cause + BSTR_ERR to be returned. 
+ + .......................................................................... + + extern int buGetBlkUTF16 (cpUcs2* ucs2, int len, cpUcs4 errCh, + const_bstring bu, int pos); + + Convert a string of UTF8 codepoints (bu), skipping the first pos, into a + sequence of UTF16 encoded code points. Returns the number of UCS2 16-bit + words written to the output. No more than len words are written to the + target array ucs2. If any code point in bu is unparsable, it will be + translated to errCh. + + .......................................................................... + + extern int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, + cpUcs2* bom, cpUcs4 errCh); + + Append an array of UCS2 code points (utf16) to UTF8 codepoints (bu). Any + invalid code point is replaced by errCh. If errCh is itself not a + valid code point, then this translation will halt upon the first error + and return BSTR_ERR. Otherwise BSTR_OK is returned. If a byte order mark + has been previously read, it may be passed in as bom, otherwise if *bom is + set to 0, it will be filled in with the BOM as read from the first + character if it is a BOM. + +=============================================================================== + The bstest module ----------------- @@ -3197,5 +3308,6 @@ Michael Hsieh Richard A. Smith Simon Ekstrom Wayne Scott +Zed A. Shaw =============================================================================== diff --git a/bstrwrap.cpp b/bstrwrap.cpp index 31c9a5b..ce77f17 100644 --- a/bstrwrap.cpp +++ b/bstrwrap.cpp @@ -1,7 +1,7 @@ /* * This source file is part of the bstring string library. This code was - * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source - * license and the GPL. Refer to the accompanying documentation for details + * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source + * license and the GPL. Refer to the accompanying documentation for details * on usage and license. 
*/ diff --git a/bstrwrap.h b/bstrwrap.h index 6a09b05..0ba63a0 100644 --- a/bstrwrap.h +++ b/bstrwrap.h @@ -1,7 +1,7 @@ /* * This source file is part of the bstring string library. This code was - * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source - * license and the GPL. Refer to the accompanying documentation for details + * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source + * license and the GPL. Refer to the accompanying documentation for details * on usage and license. */ diff --git a/buniutil.c b/buniutil.c new file mode 100644 index 0000000..7d3f102 --- /dev/null +++ b/buniutil.c @@ -0,0 +1,270 @@ +/* + * This source file is part of the bstring string library. This code was + * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source + * license and the GPL. Refer to the accompanying documentation for details + * on usage and license. + */ + +/* + * buniutil.c + * + * This file is not necessarily part of the core bstring library itself, but + * is just an implementation of basic utf8 processing for bstrlib. Note that + * this module is dependent upon bstrlib.c and utf8util.c + */ + +#include "bstrlib.h" +#include "buniutil.h" + +#define UNICODE__CODE_POINT__REPLACEMENT_CHARACTER (0xFFFDL) + +/* int buIsUTF8Content (const_bstring bu) + * + * Scan string and return 1 if its entire contents is entirely UTF8 code + * points. Otherwise return 0. + */ +int buIsUTF8Content (const_bstring bu) { +struct utf8Iterator iter; + + if (NULL == bdata (bu)) return 0; + for (utf8IteratorInit (&iter, bu->data, bu->slen); + !utf8IteratorNoMore (&iter);) { + if (0 >= utf8IteratorGetNextCodePoint (&iter, -1)) return 0; + } + return 1; +} + +/* int buGetBlkUTF16 (cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu, + * int pos) + * + * Convert a string of UTF8 codepoints (bu) into a sequence of UTF16 encoded + * code points. Returns the number of UCS2 16-bit words written to the + * output. 
No more than len words are written to the target array ucs2. If + * any code point in bu is unparsable, it will be translated to errCh. + */ +int buGetBlkUTF16 (/* @out */ cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu, int pos) { +struct tagbstring t; +struct utf8Iterator iter; +cpUcs4 ucs4; +int i, j; + + if (!isLegalUnicodeCodePoint (errCh)) errCh = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER; + if (NULL == ucs2 || 0 >= len || NULL == bdata (bu) || 0 > pos) return BSTR_ERR; + + for (j=0, i=0; j < bu->slen; j++) { + if (0x80 != (0xC0 & bu->data[j])) { + if (i >= pos) break; + i++; + } + } + + t.mlen = -1; + t.data = bu->data + j; + t.slen = bu->slen - j; + + utf8IteratorInit (&iter, t.data, t.slen); + + ucs4 = BSTR_ERR; + for (i=0; 0 < len && !utf8IteratorNoMore (&iter) && 0 <= (ucs4 = utf8IteratorGetNextCodePoint (&iter, errCh)); i++) { + if (ucs4 < 0x10000) { + *ucs2++ = (cpUcs2) ucs4; + len--; + } else { + if (len < 2) { + *ucs2++ = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER; + len--; + } else { + long y = ucs4 - 0x10000; + ucs2[0] = (cpUcs2) (0xD800 | (y >> 10)); + ucs2[1] = (cpUcs2) (0xDC00 | (y & 0x03FF)); + len -= 2; + ucs2 += 2; + i++; + } + } + } + while (0 < len) { + *ucs2++ = 0; + len--; + } + + utf8IteratorUninit (&iter); + if (0 > ucs4) return BSTR_ERR; + return i; +} + +/* + +Unicode UTF-8 +------- ----- +U-00000000 - U-0000007F: 0xxxxxxx +U-00000080 - U-000007FF: 110xxxxx 10xxxxxx +U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx +U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + +U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + +UTF-32: U-000000 - U-10FFFF + +*/ + +/* int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh) + * + * Convert an array of UCS4 code points (bu) to UTF8 codepoints b. Any + * invalid code point is replaced by errCh. 
If errCh is itself not a + * valid code point, then this translation will halt upon the first error + * and return BSTR_ERR. Otherwise BSTR_OK is returned. + */ +int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh) { +int i, oldSlen; + + if (NULL == bu || NULL == b || 0 > len || 0 > (oldSlen = blengthe (b, -1))) return BSTR_ERR; + if (!isLegalUnicodeCodePoint (errCh)) errCh = ~0; + + for (i=0; i < len; i++) { + unsigned char c[6]; + cpUcs4 v = bu[i]; + + if (!isLegalUnicodeCodePoint (v)) { + if (~0 == errCh) { + b->slen = oldSlen; + return BSTR_ERR; + } + v = errCh; + } + + if (v < 0x80) { + if (BSTR_OK != bconchar (b, (char) v)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } else if (v < 0x800) { + c[0] = (unsigned char) ( (v >> 6) + 0xc0); + c[1] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk (b, c, 2)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } else if (v < 0x10000) { + c[0] = (unsigned char) ( (v >> 12) + 0xe0); + c[1] = (unsigned char) (((v >> 6) & 0x3f) + 0x80); + c[2] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk (b, c, 3)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } else +#if 0 + if (v < 0x200000) +#endif + { + c[0] = (unsigned char) ( (v >> 18) + 0xf0); + c[1] = (unsigned char) (((v >> 12) & 0x3f) + 0x80); + c[2] = (unsigned char) (((v >> 6) & 0x3f) + 0x80); + c[3] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk (b, c, 4)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } +#if 0 + else if (v < 0x4000000) { + c[0] = (unsigned char) ( (v >> 24) + 0xf8); + c[1] = (unsigned char) (((v >> 18) & 0x3f) + 0x80); + c[2] = (unsigned char) (((v >> 12) & 0x3f) + 0x80); + c[3] = (unsigned char) (((v >> 6) & 0x3f) + 0x80); + c[4] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk (b, c, 5)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } else { + c[0] = (unsigned char) ( (v >> 30) + 0xfc); + c[1] = (unsigned char) (((v >> 24) & 0x3f) + 0x80); + c[2] = (unsigned 
char) (((v >> 18) & 0x3f) + 0x80); + c[3] = (unsigned char) (((v >> 12) & 0x3f) + 0x80); + c[4] = (unsigned char) (((v >> 6) & 0x3f) + 0x80); + c[5] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk (b, c, 6)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } +#endif + } + return BSTR_OK; +} + +#define endSwap(cs,mode) ((mode) ? ((((cs) & 0xFF) << 8) | (((cs) >> 8) & 0xFF)) : (cs)) +#define TEMP_UCS4_BUFFER_SIZE (64) + +/* int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, + * cpUcs2* bom, cpUcs4 errCh) + * + * Append an array of UCS2 code points (utf16) to UTF8 codepoints (bu). Any + * invalid code point is replaced by errCh. If errCh is itself not a + * valid code point, then this translation will halt upon the first error + * and return BSTR_ERR. Otherwise BSTR_OK is returned. If a byte order mark + * has been previously read, it may be passed in as bom, otherwise if *bom is + * set to 0, it will be filled in with the BOM as read from the first + * character if it is a BOM. + */ +int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, cpUcs2* bom, cpUcs4 errCh) { +cpUcs4 buff[TEMP_UCS4_BUFFER_SIZE]; +int cc, i, sm, oldSlen; + + if (NULL == bdata(bu) || NULL == utf16 || len < 0) return BSTR_ERR; + if (!isLegalUnicodeCodePoint (errCh)) errCh = ~0; + if (len == 0) return BSTR_OK; + + oldSlen = bu->slen; + i = 0; + + /* Check for BOM character and select endianness. Also remove the + BOM from the stream, since there is no need for it in a UTF-8 encoding. */ + if (bom && (cpUcs2) 0xFFFE == *bom) { + sm = 8; + } else if (bom && (cpUcs2) 0xFEFF == *bom) { + sm = 0; + } else if (utf16[i] == (cpUcs2) 0xFFFE) { + if (bom) *bom = utf16[i]; + sm = 8; + i++; + } else if (utf16[i] == (cpUcs2) 0xFEFF) { + if (bom) *bom = utf16[i]; + sm = 0; + i++; + } else { + sm = 0; /* Assume local endianness. 
*/ + } + + cc = 0; + for (;i < len; i++) { + cpUcs4 c, v = endSwap (utf16[i], sm); + if ((v | 0x7FF) == 0xDFFF) { /* Deal with surrogate pairs */ + if (v >= 0xDC00 || i >= len) { + ErrMode:; + if (~0 == errCh) { + ErrReturn:; + bu->slen = oldSlen; + return BSTR_ERR; + } + v = errCh; + } else { + i++; + if ((c = endSwap (utf16[i], sm) - 0xDC00) > 0x3FF) goto ErrMode; + v = ((v - 0xD800) << 10) + c + 0x10000; + } + } + buff[cc] = v; + cc++; + if (cc >= TEMP_UCS4_BUFFER_SIZE) { + if (0 > buAppendBlkUcs4 (bu, buff, cc, errCh)) goto ErrReturn; + cc = 0; + } + } + if (cc > 0 && 0 > buAppendBlkUcs4 (bu, buff, cc, errCh)) goto ErrReturn; + + return BSTR_OK; +} diff --git a/buniutil.h b/buniutil.h new file mode 100644 index 0000000..1017212 --- /dev/null +++ b/buniutil.h @@ -0,0 +1,37 @@ +/* + * This source file is part of the bstring string library. This code was + * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source + * license and the GPL. Refer to the accompanying documentation for details + * on usage and license. + */ + +/* + * buniutil.h + * + * This file is the interface for the buniutil basic "Unicode for bstrings" + * functions. Note that there are dependencies on bstrlib.h and utf8util.h . + */ + +#ifndef BSTRLIB_UNICODE_UTILITIES +#define BSTRLIB_UNICODE_UTILITIES + +#include "utf8util.h" +#include "bstrlib.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern int buIsUTF8Content (const_bstring bu); +extern int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh); + +/* For those unfortunate enough to be stuck supporting UTF16. 
*/ +extern int buGetBlkUTF16 (/* @out */ cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu, int pos); +extern int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, cpUcs2* bom, cpUcs4 errCh); + +#ifdef __cplusplus +} +#endif + +#endif /* BSTRLIB_UNICODE_UTILITIES */ + diff --git a/test.cpp b/test.cpp index 2f15ac8..0976667 100644 --- a/test.cpp +++ b/test.cpp @@ -1,11 +1,11 @@ // // This source file is part of the bstring string library. This code was -// written by Paul Hsieh in 2002-2006, and is covered by the BSD open source -// license. Refer to the accompanying documentation for details on usage and +// written by Paul Hsieh in 2002-2006, and is covered by the BSD open source +// license. Refer to the accompanying documentation for details on usage and // license. // -// +// // test.cpp // // This file is the C++ unit test for Bstrlib @@ -16,7 +16,7 @@ #include "bstrlib.h" #include "bstrwrap.h" -// Exceptions must be turned on in the compiler to successfully run +// Exceptions must be turned on in the compiler to successfully run // this test. The compiler must also support STL. #define dumpOutQty (32) @@ -1559,6 +1559,69 @@ int ret = 0; return ret; } +/* int bMultiCountConcat (bstring dst, int n, ...) + * + * Concatenate a sequence of exactly n bstring arguments to dst. + */ +int bMultiCountConcat (bstring dst, int n, ...) { +va_list arglist; +int i, ret = 0; + va_start (arglist, n); + for (i = 0; i < n; i++) { + ret = bconcat (dst, va_arg (arglist, bstring)); + if (0 > ret) break; + } + va_end (arglist); + return ret; +} + +/* int bMultiCountCatCstr (bstring dst, int n, ...) + * + * Concatenate a sequence of exactly n char * arguments to dst. + */ +int bMultiCountCatCstr (bstring dst, int n, ...) 
{ +va_list arglist; +int i, ret = 0; + va_start (arglist, n); + for (i = 0; i < n; i++) { + ret = bcatcstr (dst, va_arg (arglist, char *)); + if (0 > ret) break; + } + va_end (arglist); + return ret; +} + +/* + * These can be dangerous because there is no compiler time type checking + * on the arguments. + */ + +extern int bMultiCountConcat (bstring dst, int n, ...); +extern int bMultiCountCatCstr (bstring dst, int n, ...); + +#if defined(__GNUC__) +# define COUNT_ARGS(...) COUNT_ARGS_(,##__VA_ARGS__,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0) +# define COUNT_ARGS_(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_10,_11,_12,_13,_14,_15,_16,_17,_18,_19,_20,_21,_22,_23,_24,_25,_26,_27,_28,_29,_30,_31,_32,_33,_34,_35,_36,_37,_38,_39,_40,_41,_42,_43,_44,_45,_46,_47,_48,_49,_50,_51,_52,_53,_54,_55,_56,_57,_58,_59,_60,_61,_62,_63,_64,_65,_66,_67,_68,_69,_70,_71,_72,_73,_74,_75,_76,_77,_78,_79,_80,_81,_82,_83,_84,_85,_86,_87,_88,_89,_90,_91,_92,_93,_94,_95,_96,_97,_98,_99,cnt,...) cnt +#else +# if defined(__WATCOMC__) || defined(_MSC_VER) +# define COUNT_ARGS(...) 
ARGCNT_ARGINDEX100((ARGCNT_0_LENGTH_ ## __VA_ARGS__ ## _SPECIAL_CASE,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)) +# define ARGCNT_ARGINDEX100(__args) ARGCNT_ARGINDEX100_RAW __args +# define ARGCNT_0_LENGTH__SPECIAL_CASE ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0 +# define ARGCNT_ARGINDEX100_RAW(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_10,_11,_12,_13,_14,_15,_16,_17,_18,_19,_20,_21,_22,_23,_24,_25,_26,_27,_28,_29,_30,_31,_32,_33,_34,_35,_36,_37,_38,_39,_40,_41,_42,_43,_44,_45,_46,_47,_48,_49,_50,_51,_52,_53,_54,_55,_56,_57,_58,_59,_60,_61,_62,_63,_64,_65,_66,_67,_68,_69,_70,_71,_72,_73,_74,_75,_76,_77,_78,_79,_80,_81,_82,_83,_84,_85,_86,_87,_88,_89,_90,_91,_92,_93,_94,_95,_96,_97,_98,_99,n,...) n +# endif +#endif + +/* + * The following macros are only available on more recent compilers. + * Can process up to 100 arguments. These can also be dangerous because + * there is no compiler time type checking on the arguments. + */ + +#if defined(COUNT_ARGS) +# define bMultiConcat(dst,...) bMultiCountConcat((dst),COUNT_ARGS(__VA_ARGS__),##__VA_ARGS__) +# define bMultiCatCstr(dst,...) bMultiCountCatCstr((dst),COUNT_ARGS(__VA_ARGS__),##__VA_ARGS__) +#endif + int main () { int ret = 0;