Add basic UTF8 support. Fix warnings. Update copyrights.
This commit is contained in:
parent
2a17f97443
commit
ac490ae623
20
bstraux.c
20
bstraux.c
|
@ -1,7 +1,9 @@
|
|||
#define _CRT_SECURE_NO_WARNINGS
|
||||
|
||||
/*
|
||||
* This source file is part of the bstring string library. This code was
|
||||
* written by Paul Hsieh in 2002-2008, and is covered by the BSD open source
|
||||
* license and the GPL. Refer to the accompanying documentation for details
|
||||
* written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
|
||||
* license and the GPL. Refer to the accompanying documentation for details
|
||||
* on usage and license.
|
||||
*/
|
||||
|
||||
|
@ -9,7 +11,7 @@
|
|||
* bstraux.c
|
||||
*
|
||||
* This file is not necessarily part of the core bstring library itself, but
|
||||
* is just an auxilliary module which includes miscellaneous or trivial
|
||||
* is just an auxilliary module which includes miscellaneous or trivial
|
||||
* functions.
|
||||
*/
|
||||
|
||||
|
@ -956,11 +958,12 @@ bstring b, t;
|
|||
/* Double size, but deal with unusual case of numeric
|
||||
overflows */
|
||||
|
||||
if ((m = b->mlen << 1) <= b->mlen &&
|
||||
(m = b->mlen + 1024) <= b->mlen &&
|
||||
(m = b->mlen + 16) <= b->mlen &&
|
||||
(m = b->mlen + 1) <= b->mlen) t = NULL;
|
||||
else t = bfromcstralloc (m, "");
|
||||
if (b->mlen <= INT_MAX / 2) m = b->mlen << 1;
|
||||
else if (b->mlen <= INT_MAX - 1024) m = b->mlen + 1024;
|
||||
else if (b->mlen <= INT_MAX - 16) m = b->mlen + 16;
|
||||
else if (b->mlen <= INT_MAX - 1) m = b->mlen + 1;
|
||||
else return NULL;
|
||||
t = bfromcstralloc (m, "");
|
||||
|
||||
if (t) memcpy (t->data, b->data, i);
|
||||
bSecureDestroy (b); /* Cleanse previous buffer */
|
||||
|
@ -1130,4 +1133,3 @@ void * parm;
|
|||
free (ws);
|
||||
return parm;
|
||||
}
|
||||
|
||||
|
|
36
bstraux.h
36
bstraux.h
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* This source file is part of the bstring string library. This code was
|
||||
* written by Paul Hsieh in 2002-2008, and is covered by the BSD open source
|
||||
* license and the GPL. Refer to the accompanying documentation for details
|
||||
* written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
|
||||
* license and the GPL. Refer to the accompanying documentation for details
|
||||
* on usage and license.
|
||||
*/
|
||||
|
||||
|
@ -9,7 +9,7 @@
|
|||
* bstraux.h
|
||||
*
|
||||
* This file is not a necessary part of the core bstring library itself, but
|
||||
* is just an auxilliary module which includes miscellaneous or trivial
|
||||
* is just an auxilliary module which includes miscellaneous or trivial
|
||||
* functions.
|
||||
*/
|
||||
|
||||
|
@ -24,7 +24,7 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
/* Safety mechanisms */
|
||||
#define bstrDeclare(b) bstring (b) = NULL;
|
||||
#define bstrDeclare(b) bstring (b) = NULL;
|
||||
#define bstrFree(b) {if ((b) != NULL && (b)->slen >= 0 && (b)->mlen >= (b)->slen) { bdestroy (b); (b) = NULL; }}
|
||||
|
||||
/* Backward compatibility with previous versions of Bstrlib */
|
||||
|
@ -87,22 +87,22 @@ int bwsBuffLength (struct bwriteStream * stream, int sz);
|
|||
void * bwsClose (struct bwriteStream * stream);
|
||||
|
||||
/* Security functions */
|
||||
#define bSecureDestroy(b) { \
|
||||
bstring bstr__tmp = (b); \
|
||||
if (bstr__tmp && bstr__tmp->mlen > 0 && bstr__tmp->data) { \
|
||||
(void) memset (bstr__tmp->data, 0, (size_t) bstr__tmp->mlen); \
|
||||
bdestroy (bstr__tmp); \
|
||||
} \
|
||||
/* bSecureDestroy (b)
 *
 * Overwrite the whole allocated buffer of the bstring b (all mlen bytes,
 * not only the slen bytes currently in use) with zeros, then hand b to
 * bdestroy.  If b is NULL, has a NULL data pointer, or has a non-positive
 * mlen, nothing is cleared and bdestroy is not called.
 *
 * The buffer is cleared through a volatile pointer instead of a plain
 * memset: a memset whose target is released immediately afterwards is a
 * dead store that an optimizing compiler is allowed to remove, which would
 * leave the sensitive bytes in memory.
 *
 * The argument b is evaluated exactly once.
 */
#define bSecureDestroy(b) {                                              \
    bstring bstr__tmp = (b);                                             \
    if (bstr__tmp && bstr__tmp->mlen > 0 && bstr__tmp->data) {           \
        volatile unsigned char * bstr__wipe = bstr__tmp->data;           \
        int bstr__idx;                                                   \
        for (bstr__idx = 0; bstr__idx < bstr__tmp->mlen; bstr__idx++) {  \
            bstr__wipe[bstr__idx] = 0;                                   \
        }                                                                \
        bdestroy (bstr__tmp);                                            \
    }                                                                    \
}
|
||||
#define bSecureWriteProtect(t) { \
|
||||
if ((t).mlen >= 0) { \
|
||||
if ((t).mlen > (t).slen)) { \
|
||||
(void) memset ((t).data + (t).slen, 0, (size_t) (t).mlen - (t).slen); \
|
||||
} \
|
||||
(t).mlen = -1; \
|
||||
} \
|
||||
/* bSecureWriteProtect (t)
 *
 * t is a tagbstring lvalue (a struct, not a pointer).  If t.mlen is not
 * already negative, zero the unused tail of its buffer, i.e. the bytes in
 * [slen, mlen), and then set t.mlen to -1.  When t.mlen is already
 * negative the macro does nothing.
 *
 * Note that t is evaluated several times, so it must be free of side
 * effects.
 */
#define bSecureWriteProtect(t) {                                         \
    if ((t).mlen >= 0) {                                                 \
        if ((t).mlen > (t).slen) {                                       \
            (void) memset ((t).data + (t).slen, 0,                       \
                           (size_t) ((t).mlen - (t).slen));              \
        }                                                                \
        (t).mlen = -1;                                                   \
    }                                                                    \
}
|
||||
extern bstring bSecureInput (int maxlen, int termchar,
|
||||
extern bstring bSecureInput (int maxlen, int termchar,
|
||||
bNgetc vgetchar, void * vgcCtx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
21
bstrlib.c
21
bstrlib.c
|
@ -1559,9 +1559,9 @@ bstring auxf = (bstring) find;
|
|||
bstring auxr = (bstring) repl;
|
||||
|
||||
if (b == NULL || b->data == NULL || find == NULL ||
|
||||
find->data == NULL || repl == NULL || repl->data == NULL ||
|
||||
pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen ||
|
||||
b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR;
|
||||
find->data == NULL || repl == NULL || repl->data == NULL ||
|
||||
pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen ||
|
||||
b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR;
|
||||
if (pos > b->slen - find->slen) return BSTR_OK;
|
||||
|
||||
/* Alias with find string */
|
||||
|
@ -1639,12 +1639,12 @@ bstring auxr = (bstring) repl;
|
|||
|
||||
while ((pos = instr (b, pos, auxf)) >= 0) {
|
||||
if (slen >= mlen - 1) {
|
||||
int sl, *t;
|
||||
|
||||
int *t;
|
||||
int vl;
|
||||
mlen += mlen;
|
||||
sl = sizeof (int *) * mlen;
|
||||
vl = sizeof (int *) * mlen;
|
||||
if (static_d == d) d = NULL; /* static_d cannot be realloced */
|
||||
if (mlen <= 0 || sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) {
|
||||
if (mlen <= 0 || vl < mlen || NULL == (t = (int *) bstr__realloc (d, vl))) {
|
||||
ret = BSTR_ERR;
|
||||
goto done;
|
||||
}
|
||||
|
@ -1683,8 +1683,7 @@ bstring auxr = (bstring) repl;
|
|||
}
|
||||
|
||||
done:;
|
||||
if (static_d == d) d = NULL;
|
||||
bstr__free (d);
|
||||
if (static_d != d) bstr__free (d);
|
||||
if (auxf != find) bdestroy (auxf);
|
||||
if (auxr != repl) bdestroy (auxr);
|
||||
return ret;
|
||||
|
@ -1700,8 +1699,8 @@ int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos) {
|
|||
return findreplaceengine (b, find, repl, pos, binstr);
|
||||
}
|
||||
|
||||
/* int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl,
|
||||
* int pos)
|
||||
/* int bfindreplacecaseless (bstring b, const_bstring find,
|
||||
* const_bstring repl, int pos)
|
||||
*
|
||||
* Replace all occurrences of a find string, ignoring case, with a replace
|
||||
* string after a given point in a bstring.
|
||||
|
|
118
bstrlib.txt
118
bstrlib.txt
|
@ -588,9 +588,14 @@ test.cpp - C++ unit/regression test for bstrwrap.cpp
|
|||
bsafe.c - C runtime stubs to abort usage of unsafe C functions.
|
||||
bsafe.h - C header file for bsafe.c functions.
|
||||
|
||||
C projects need only include bstrlib.h and compile/link bstrlib.c to use the
|
||||
bstring library. C++ projects need to additionally include bstrwrap.h and
|
||||
compile/link bstrwrap.cpp. For both, there may be a need to make choices
|
||||
utf8util.c - C implementation of generic utf8 parsing functions.
|
||||
utf8util.h - C header file for generic utf8 parsing functions.
|
||||
buniutil.c - C implementation of utf8 bstring packing and unpacking functions.
|
||||
buniutil.h - C header file for utf8 bstring functions.
|
||||
|
||||
C modules need only include bstrlib.h and compile/link bstrlib.c to use the
|
||||
basic bstring library. C++ projects need to additionally include bstrwrap.h
|
||||
and compile/link bstrwrap.cpp. For both, there may be a need to make choices
|
||||
about feature configuration as described in the "Configurable compilation
|
||||
options" in the section above.
|
||||
|
||||
|
@ -1977,6 +1982,112 @@ The macros
|
|||
|
||||
===============================================================================
|
||||
|
||||
Unicode functions
|
||||
-----------------
|
||||
|
||||
The two modules utf8util.c and buniutil.c implement basic functions for
|
||||
parsing and collecting Unicode data in the UTF8 format. Unicode is
|
||||
described by a sequence of "code points" which are values between 0 and
|
||||
1114111 inclusive mapped to symbol content corresponding to nearly all
|
||||
the standardized scripts of the world.
|
||||
|
||||
The semantics of Unicode code points is varied and complicated. The
|
||||
base support of the better string library does not attempt to perform
|
||||
any interpretation of these code points. The better string library
|
||||
solely provides support for iterating through unicode code points,
|
||||
appending and extracting code points to and from bstrings, and parsing
|
||||
UTF8 and UTF16 from raw data.
|
||||
|
||||
To use these functions compile and link utf8util.c and buniutil.c
|
||||
|
||||
..........................................................................
|
||||
|
||||
extern void utf8IteratorInit (struct utf8Iterator* iter,
|
||||
unsigned char* data, int slen);
|
||||
|
||||
Initialize a unicode utf8 iterator to traverse an array of utf8 encoded
|
||||
code points pointed to by data, with length slen from the start. The
|
||||
iterator iter is only valid for as long as the array it is pointed to
|
||||
is valid and not modified.
|
||||
|
||||
..........................................................................
|
||||
|
||||
extern void utf8IteratorUninit (struct utf8Iterator* iter);
|
||||
|
||||
Invalidate utf8 iterator.
|
||||
|
||||
..........................................................................
|
||||
|
||||
extern cpUcs4 utf8IteratorGetNextCodePoint (struct utf8Iterator* iter,
|
||||
cpUcs4 errCh);
|
||||
|
||||
Parse code point the iterator is pointing at and advance the iterator to
|
||||
the next code point. If the iterator was pointing at a valid code point
|
||||
the code point is returned, otherwise, errCh will be returned.
|
||||
|
||||
..........................................................................
|
||||
|
||||
extern cpUcs4 utf8IteratorGetCurrCodePoint (struct utf8Iterator* iter,
|
||||
cpUcs4 errCh);
|
||||
|
||||
Parse code point the iterator is pointing at. If the iterator was
|
||||
pointing at a valid code point the code point is returned, otherwise,
|
||||
errCh will be returned.
|
||||
|
||||
..........................................................................
|
||||
|
||||
extern int utf8ScanBackwardsForCodePoint (unsigned char* msg, int len,
|
||||
int pos, cpUcs4* out);
|
||||
|
||||
From the position "pos" in the array msg of length len, search for the
|
||||
last position before or at pos from which a valid Unicode code
|
||||
point can be parsed. If such an offset is found it is returned otherwise
|
||||
a negative value is returned. The code point parsed is put into *out if
|
||||
it is not NULL.
|
||||
|
||||
..........................................................................
|
||||
|
||||
extern int buIsUTF8Content (const_bstring bu);
|
||||
|
||||
Scan a bstring and determine if it is made entirely of valid unicode
|
||||
code points. If it is, 1 is returned, otherwise 0 is returned.
|
||||
|
||||
..........................................................................
|
||||
|
||||
extern int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len,
|
||||
cpUcs4 errCh);
|
||||
|
||||
Append the code points passed in the UCS4 format (raw numbers) in the
|
||||
array bu of length len. Any unparsable characters are replaced by errCh.
|
||||
If errCh is not a valid Unicode code point, then parsing errors will cause
|
||||
BSTR_ERR to be returned.
|
||||
|
||||
..........................................................................
|
||||
|
||||
extern int buGetBlkUTF16 (cpUcs2* ucs2, int len, cpUcs4 errCh,
|
||||
const_bstring bu, int pos);
|
||||
|
||||
Convert a string of UTF8 codepoints (bu), skipping the first pos, into a
|
||||
sequence of UTF16 encoded code points. Returns the number of UCS2 16-bit
|
||||
words written to the output. No more than len words are written to the
|
||||
target array ucs2. If any code point in bu is unparsable, it will be
|
||||
translated to errCh.
|
||||
|
||||
..........................................................................
|
||||
|
||||
extern int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len,
|
||||
cpUcs2* bom, cpUcs4 errCh);
|
||||
|
||||
Append an array of UCS2 code points (utf16) to UTF8 codepoints (bu). Any
|
||||
invalid code point is replaced by errCh. If errCh is itself not a
|
||||
valid code point, then this translation will halt upon the first error
|
||||
and return BSTR_ERR. Otherwise BSTR_OK is returned. If a byte order mark
|
||||
has been previously read, it may be passed in as bom, otherwise if *bom is
|
||||
set to 0, it will be filled in with the BOM as read from the first
|
||||
character if it is a BOM.
|
||||
|
||||
===============================================================================
|
||||
|
||||
The bstest module
|
||||
-----------------
|
||||
|
||||
|
@ -3197,5 +3308,6 @@ Michael Hsieh
|
|||
Richard A. Smith
|
||||
Simon Ekstrom
|
||||
Wayne Scott
|
||||
Zed A. Shaw
|
||||
|
||||
===============================================================================
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* This source file is part of the bstring string library. This code was
|
||||
* written by Paul Hsieh in 2002-2008, and is covered by the BSD open source
|
||||
* license and the GPL. Refer to the accompanying documentation for details
|
||||
* written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
|
||||
* license and the GPL. Refer to the accompanying documentation for details
|
||||
* on usage and license.
|
||||
*/
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* This source file is part of the bstring string library. This code was
|
||||
* written by Paul Hsieh in 2002-2008, and is covered by the BSD open source
|
||||
* license and the GPL. Refer to the accompanying documentation for details
|
||||
* written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
|
||||
* license and the GPL. Refer to the accompanying documentation for details
|
||||
* on usage and license.
|
||||
*/
|
||||
|
||||
|
|
|
@ -0,0 +1,270 @@
|
|||
/*
|
||||
* This source file is part of the bstring string library. This code was
|
||||
* written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
|
||||
* license and the GPL. Refer to the accompanying documentation for details
|
||||
* on usage and license.
|
||||
*/
|
||||
|
||||
/*
|
||||
* buniutil.c
|
||||
*
|
||||
* This file is not necessarily part of the core bstring library itself, but
|
||||
* is just an implementation of basic utf8 processing for bstrlib. Note that
|
||||
* this module is dependent upon bstrlib.c and utf8util.c
|
||||
*/
|
||||
|
||||
#include "bstrlib.h"
|
||||
#include "buniutil.h"
|
||||
|
||||
#define UNICODE__CODE_POINT__REPLACEMENT_CHARACTER (0xFFFDL)
|
||||
|
||||
/* int buIsUTF8Content (const_bstring bu)
|
||||
*
|
||||
* Scan string and return 1 if its entire contents is entirely UTF8 code
|
||||
* points. Otherwise return 0.
|
||||
*/
|
||||
int buIsUTF8Content (const_bstring bu) {
|
||||
struct utf8Iterator iter;
|
||||
|
||||
if (NULL == bdata (bu)) return 0;
|
||||
for (utf8IteratorInit (&iter, bu->data, bu->slen);
|
||||
!utf8IteratorNoMore (&iter);) {
|
||||
if (0 >= utf8IteratorGetNextCodePoint (&iter, -1)) return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* int buGetBlkUTF16 (cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu,
|
||||
* int pos)
|
||||
*
|
||||
* Convert a string of UTF8 codepoints (bu) into a sequence of UTF16 encoded
|
||||
* code points. Returns the number of UCS2 16-bit words written to the
|
||||
* output. No more than len words are written to the target array ucs2. If
|
||||
* any code point in bu is unparsable, it will be translated to errCh.
|
||||
*/
|
||||
int buGetBlkUTF16 (/* @out */ cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu, int pos) {
|
||||
struct tagbstring t;
|
||||
struct utf8Iterator iter;
|
||||
cpUcs4 ucs4;
|
||||
int i, j;
|
||||
|
||||
if (!isLegalUnicodeCodePoint (errCh)) errCh = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER;
|
||||
if (NULL == ucs2 || 0 >= len || NULL == bdata (bu) || 0 > pos) return BSTR_ERR;
|
||||
|
||||
for (j=0, i=0; j < bu->slen; j++) {
|
||||
if (0x80 != (0xC0 & bu->data[j])) {
|
||||
if (i >= pos) break;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
t.mlen = -1;
|
||||
t.data = bu->data + j;
|
||||
t.slen = bu->slen - j;
|
||||
|
||||
utf8IteratorInit (&iter, t.data, t.slen);
|
||||
|
||||
ucs4 = BSTR_ERR;
|
||||
for (i=0; 0 < len && !utf8IteratorNoMore (&iter) && 0 <= (ucs4 = utf8IteratorGetNextCodePoint (&iter, errCh)); i++) {
|
||||
if (ucs4 < 0x10000) {
|
||||
*ucs2++ = (cpUcs2) ucs4;
|
||||
len--;
|
||||
} else {
|
||||
if (len < 2) {
|
||||
*ucs2++ = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER;
|
||||
len--;
|
||||
} else {
|
||||
long y = ucs4 - 0x10000;
|
||||
ucs2[0] = (cpUcs2) (0xD800 | (y >> 10));
|
||||
ucs2[1] = (cpUcs2) (0xDC00 | (y & 0x03FF));
|
||||
len -= 2;
|
||||
ucs2 += 2;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
while (0 < len) {
|
||||
*ucs2++ = 0;
|
||||
len--;
|
||||
}
|
||||
|
||||
utf8IteratorUninit (&iter);
|
||||
if (0 > ucs4) return BSTR_ERR;
|
||||
return i;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
Unicode UTF-8
|
||||
------- -----
|
||||
U-00000000 - U-0000007F: 0xxxxxxx
|
||||
U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
|
||||
U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
|
||||
U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
|
||||
U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
|
||||
UTF-32: U-000000 - U-10FFFF
|
||||
|
||||
*/
|
||||
|
||||
/* int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh)
|
||||
*
|
||||
* Convert an array of UCS4 code points (bu) to UTF8 codepoints b. Any
|
||||
* invalid code point is replaced by errCh. If errCh is itself not a
|
||||
* valid code point, then this translation will halt upon the first error
|
||||
* and return BSTR_ERR. Otherwise BSTR_OK is returned.
|
||||
*/
|
||||
int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh) {
|
||||
int i, oldSlen;
|
||||
|
||||
if (NULL == bu || NULL == b || 0 > len || 0 > (oldSlen = blengthe (b, -1))) return BSTR_ERR;
|
||||
if (!isLegalUnicodeCodePoint (errCh)) errCh = ~0;
|
||||
|
||||
for (i=0; i < len; i++) {
|
||||
unsigned char c[6];
|
||||
cpUcs4 v = bu[i];
|
||||
|
||||
if (!isLegalUnicodeCodePoint (v)) {
|
||||
if (~0 == errCh) {
|
||||
b->slen = oldSlen;
|
||||
return BSTR_ERR;
|
||||
}
|
||||
v = errCh;
|
||||
}
|
||||
|
||||
if (v < 0x80) {
|
||||
if (BSTR_OK != bconchar (b, (char) v)) {
|
||||
b->slen = oldSlen;
|
||||
return BSTR_ERR;
|
||||
}
|
||||
} else if (v < 0x800) {
|
||||
c[0] = (unsigned char) ( (v >> 6) + 0xc0);
|
||||
c[1] = (unsigned char) (( v & 0x3f) + 0x80);
|
||||
if (BSTR_OK != bcatblk (b, c, 2)) {
|
||||
b->slen = oldSlen;
|
||||
return BSTR_ERR;
|
||||
}
|
||||
} else if (v < 0x10000) {
|
||||
c[0] = (unsigned char) ( (v >> 12) + 0xe0);
|
||||
c[1] = (unsigned char) (((v >> 6) & 0x3f) + 0x80);
|
||||
c[2] = (unsigned char) (( v & 0x3f) + 0x80);
|
||||
if (BSTR_OK != bcatblk (b, c, 3)) {
|
||||
b->slen = oldSlen;
|
||||
return BSTR_ERR;
|
||||
}
|
||||
} else
|
||||
#if 0
|
||||
if (v < 0x200000)
|
||||
#endif
|
||||
{
|
||||
c[0] = (unsigned char) ( (v >> 18) + 0xf0);
|
||||
c[1] = (unsigned char) (((v >> 12) & 0x3f) + 0x80);
|
||||
c[2] = (unsigned char) (((v >> 6) & 0x3f) + 0x80);
|
||||
c[3] = (unsigned char) (( v & 0x3f) + 0x80);
|
||||
if (BSTR_OK != bcatblk (b, c, 4)) {
|
||||
b->slen = oldSlen;
|
||||
return BSTR_ERR;
|
||||
}
|
||||
}
|
||||
#if 0
|
||||
else if (v < 0x4000000) {
|
||||
c[0] = (unsigned char) ( (v >> 24) + 0xf8);
|
||||
c[1] = (unsigned char) (((v >> 18) & 0x3f) + 0x80);
|
||||
c[2] = (unsigned char) (((v >> 12) & 0x3f) + 0x80);
|
||||
c[3] = (unsigned char) (((v >> 6) & 0x3f) + 0x80);
|
||||
c[4] = (unsigned char) (( v & 0x3f) + 0x80);
|
||||
if (BSTR_OK != bcatblk (b, c, 5)) {
|
||||
b->slen = oldSlen;
|
||||
return BSTR_ERR;
|
||||
}
|
||||
} else {
|
||||
c[0] = (unsigned char) ( (v >> 30) + 0xfc);
|
||||
c[1] = (unsigned char) (((v >> 24) & 0x3f) + 0x80);
|
||||
c[2] = (unsigned char) (((v >> 18) & 0x3f) + 0x80);
|
||||
c[3] = (unsigned char) (((v >> 12) & 0x3f) + 0x80);
|
||||
c[4] = (unsigned char) (((v >> 6) & 0x3f) + 0x80);
|
||||
c[5] = (unsigned char) (( v & 0x3f) + 0x80);
|
||||
if (BSTR_OK != bcatblk (b, c, 6)) {
|
||||
b->slen = oldSlen;
|
||||
return BSTR_ERR;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return BSTR_OK;
|
||||
}
|
||||
|
||||
#define endSwap(cs,mode) ((mode) ? ((((cs) & 0xFF) << 8) | (((cs) >> 8) & 0xFF)) : (cs))
|
||||
#define TEMP_UCS4_BUFFER_SIZE (64)
|
||||
|
||||
/* int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len,
|
||||
* cpUcs2* bom, cpUcs4 errCh)
|
||||
*
|
||||
* Append an array of UCS4 code points (utf16) to UTF8 codepoints (bu). Any
|
||||
* invalid code point is replaced by errCh. If errCh is itself not a
|
||||
* valid code point, then this translation will halt upon the first error
|
||||
* and return BSTR_ERR. Otherwise BSTR_OK is returned. If a byte order mark
|
||||
* has been previously read, it may be passed in as bom, otherwise if *bom is
|
||||
* set to 0, it will be filled in with the BOM as read from the first
|
||||
* character if it is a BOM.
|
||||
*/
|
||||
int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, cpUcs2* bom, cpUcs4 errCh) {
|
||||
cpUcs4 buff[TEMP_UCS4_BUFFER_SIZE];
|
||||
int cc, i, sm, oldSlen;
|
||||
|
||||
if (NULL == bdata(bu) || NULL == utf16 || len < 0) return BSTR_ERR;
|
||||
if (!isLegalUnicodeCodePoint (errCh)) errCh = ~0;
|
||||
if (len == 0) return BSTR_OK;
|
||||
|
||||
oldSlen = bu->slen;
|
||||
i = 0;
|
||||
|
||||
/* Check for BOM character and select endianess. Also remove the
|
||||
BOM from the stream, since there is no need for it in a UTF-8 encoding. */
|
||||
if (bom && (cpUcs2) 0xFFFE == *bom) {
|
||||
sm = 8;
|
||||
} else if (bom && (cpUcs2) 0xFEFF == *bom) {
|
||||
sm = 0;
|
||||
} else if (utf16[i] == (cpUcs2) 0xFFFE) {
|
||||
if (bom) *bom = utf16[i];
|
||||
sm = 8;
|
||||
i++;
|
||||
} else if (utf16[i] == (cpUcs2) 0xFEFF) {
|
||||
if (bom) *bom = utf16[i];
|
||||
sm = 0;
|
||||
i++;
|
||||
} else {
|
||||
sm = 0; /* Assume local endianness. */
|
||||
}
|
||||
|
||||
cc = 0;
|
||||
for (;i < len; i++) {
|
||||
cpUcs4 c, v = endSwap (utf16[i], sm);
|
||||
if ((v | 0x7FF) == 0xDFFF) { /* Deal with surrogate pairs */
|
||||
if (v >= 0xDC00 || i >= len) {
|
||||
ErrMode:;
|
||||
if (~0 == errCh) {
|
||||
ErrReturn:;
|
||||
bu->slen = oldSlen;
|
||||
return BSTR_ERR;
|
||||
}
|
||||
v = errCh;
|
||||
} else {
|
||||
i++;
|
||||
if ((c = endSwap (utf16[i], sm) - 0xDC00) > 0x3FF) goto ErrMode;
|
||||
v = ((v - 0xD800) << 10) + c + 0x10000;
|
||||
}
|
||||
}
|
||||
buff[cc] = v;
|
||||
cc++;
|
||||
if (cc >= TEMP_UCS4_BUFFER_SIZE) {
|
||||
if (0 > buAppendBlkUcs4 (bu, buff, cc, errCh)) goto ErrReturn;
|
||||
cc = 0;
|
||||
}
|
||||
}
|
||||
if (cc > 0 && 0 > buAppendBlkUcs4 (bu, buff, cc, errCh)) goto ErrReturn;
|
||||
|
||||
return BSTR_OK;
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
* This source file is part of the bstring string library. This code was
|
||||
* written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
|
||||
* license and the GPL. Refer to the accompanying documentation for details
|
||||
* on usage and license.
|
||||
*/
|
||||
|
||||
/*
|
||||
* buniutil.h
|
||||
*
|
||||
* This file is the interface for the buniutil basic "Unicode for bstrings"
|
||||
* functions. Note that there are dependencies on bstrlib.h and utf8util.h .
|
||||
*/
|
||||
|
||||
#ifndef BSTRLIB_UNICODE_UTILITIES
|
||||
#define BSTRLIB_UNICODE_UTILITIES
|
||||
|
||||
#include "utf8util.h"
|
||||
#include "bstrlib.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern int buIsUTF8Content (const_bstring bu);
|
||||
extern int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh);
|
||||
|
||||
/* For those unfortunate enough to be stuck supporting UTF16. */
|
||||
extern int buGetBlkUTF16 (/* @out */ cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu, int pos);
|
||||
extern int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, cpUcs2* bom, cpUcs4 errCh);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* BSTRLIB_UNICODE_UTILITIES */
|
||||
|
71
test.cpp
71
test.cpp
|
@ -1,11 +1,11 @@
|
|||
//
|
||||
// This source file is part of the bstring string library. This code was
|
||||
// written by Paul Hsieh in 2002-2006, and is covered by the BSD open source
|
||||
// license. Refer to the accompanying documentation for details on usage and
|
||||
// written by Paul Hsieh in 2002-2006, and is covered by the BSD open source
|
||||
// license. Refer to the accompanying documentation for details on usage and
|
||||
// license.
|
||||
//
|
||||
|
||||
//
|
||||
//
|
||||
// test.cpp
|
||||
//
|
||||
// This file is the C++ unit test for Bstrlib
|
||||
|
@ -16,7 +16,7 @@
|
|||
#include "bstrlib.h"
|
||||
#include "bstrwrap.h"
|
||||
|
||||
// Exceptions must be turned on in the compiler to successfully run
|
||||
// Exceptions must be turned on in the compiler to successfully run
|
||||
// this test. The compiler must also support STL.
|
||||
|
||||
#define dumpOutQty (32)
|
||||
|
@ -1559,6 +1559,69 @@ int ret = 0;
|
|||
return ret;
|
||||
}
|
||||
|
||||
/* int bMultiCountConcat (bstring dst, int n, ...)
|
||||
*
|
||||
* Concatenate a sequence of exactly n bstring arguments to dst.
|
||||
*/
|
||||
int bMultiCountConcat (bstring dst, int n, ...) {
|
||||
va_list arglist;
|
||||
int i, ret = 0;
|
||||
va_start (arglist, n);
|
||||
for (i = 0; i < n; i++) {
|
||||
ret = bconcat (dst, va_arg (arglist, bstring));
|
||||
if (0 > ret) break;
|
||||
}
|
||||
va_end (arglist);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* int bMultiCountCatCstr (bstring dst, int n, ...)
|
||||
*
|
||||
* Concatenate a sequence of exactly n char * arguments to dst.
|
||||
*/
|
||||
int bMultiCountCatCstr (bstring dst, int n, ...) {
|
||||
va_list arglist;
|
||||
int i, ret = 0;
|
||||
va_start (arglist, n);
|
||||
for (i = 0; i < n; i++) {
|
||||
ret = bcatcstr (dst, va_arg (arglist, char *));
|
||||
if (0 > ret) break;
|
||||
}
|
||||
va_end (arglist);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* These can be dangerous because there is no compiler time type checking
|
||||
* on the arguments.
|
||||
*/
|
||||
|
||||
extern int bMultiCountConcat (bstring dst, int n, ...);
|
||||
extern int bMultiCountCatCstr (bstring dst, int n, ...);
|
||||
|
||||
#if defined(__GNUC__)
|
||||
# define COUNT_ARGS(...) COUNT_ARGS_(,##__VA_ARGS__,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)
|
||||
# define COUNT_ARGS_(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_10,_11,_12,_13,_14,_15,_16,_17,_18,_19,_20,_21,_22,_23,_24,_25,_26,_27,_28,_29,_30,_31,_32,_33,_34,_35,_36,_37,_38,_39,_40,_41,_42,_43,_44,_45,_46,_47,_48,_49,_50,_51,_52,_53,_54,_55,_56,_57,_58,_59,_60,_61,_62,_63,_64,_65,_66,_67,_68,_69,_70,_71,_72,_73,_74,_75,_76,_77,_78,_79,_80,_81,_82,_83,_84,_85,_86,_87,_88,_89,_90,_91,_92,_93,_94,_95,_96,_97,_98,_99,cnt,...) cnt
|
||||
#else
|
||||
# if defined(__WATCOMC__) || defined(_MSC_VER)
|
||||
# define COUNT_ARGS(...) ARGCNT_ARGINDEX100((ARGCNT_0_LENGTH_ ## __VA_ARGS__ ## _SPECIAL_CASE,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1))
|
||||
# define ARGCNT_ARGINDEX100(__args) ARGCNT_ARGINDEX100_RAW __args
|
||||
# define ARGCNT_0_LENGTH__SPECIAL_CASE ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
|
||||
# define ARGCNT_ARGINDEX100_RAW(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_10,_11,_12,_13,_14,_15,_16,_17,_18,_19,_20,_21,_22,_23,_24,_25,_26,_27,_28,_29,_30,_31,_32,_33,_34,_35,_36,_37,_38,_39,_40,_41,_42,_43,_44,_45,_46,_47,_48,_49,_50,_51,_52,_53,_54,_55,_56,_57,_58,_59,_60,_61,_62,_63,_64,_65,_66,_67,_68,_69,_70,_71,_72,_73,_74,_75,_76,_77,_78,_79,_80,_81,_82,_83,_84,_85,_86,_87,_88,_89,_90,_91,_92,_93,_94,_95,_96,_97,_98,_99,n,...) n
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The following macros are only available on more recent compilers.
|
||||
* Can process up to 100 arguments. These can also be dangerous because
|
||||
* there is no compiler time type checking on the arguments.
|
||||
*/
|
||||
|
||||
#if defined(COUNT_ARGS)
|
||||
# define bMultiConcat(dst,...) bMultiCountConcat((dst),COUNT_ARGS(__VA_ARGS__),##__VA_ARGS__)
|
||||
# define bMultiCatCstr(dst,...) bMultiCountCatCstr((dst),COUNT_ARGS(__VA_ARGS__),##__VA_ARGS__)
|
||||
#endif
|
||||
|
||||
int main () {
|
||||
int ret = 0;
|
||||
|
||||
|
|
Loading…
Reference in New Issue