From ac490ae623167145ae95ef6b009dc56995a002cb Mon Sep 17 00:00:00 2001
From: websnarf <github@azillionmonkeys.com>
Date: Sun, 26 Jul 2015 22:33:37 -0700
Subject: [PATCH] Add basic UTF8 support.  Fix warnings.  Update copyrights.

---
 bstraux.c    |  20 ++--
 bstraux.h    |  36 +++----
 bstrlib.c    |  21 ++--
 bstrlib.txt  | 118 +++++++++++++++++++++-
 bstrwrap.cpp |   4 +-
 bstrwrap.h   |   4 +-
 buniutil.c   | 270 +++++++++++++++++++++++++++++++++++++++++++++++++++
 buniutil.h   |  37 +++++++
 test.cpp     |  71 +++++++++++++-
 9 files changed, 532 insertions(+), 49 deletions(-)
 create mode 100644 buniutil.c
 create mode 100644 buniutil.h

diff --git a/bstraux.c b/bstraux.c
index 208d72f..e9d9e5a 100644
--- a/bstraux.c
+++ b/bstraux.c
@@ -1,7 +1,9 @@
+#define _CRT_SECURE_NO_WARNINGS
+
 /*
  * This source file is part of the bstring string library.  This code was
- * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source 
- * license and the GPL. Refer to the accompanying documentation for details 
+ * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
+ * license and the GPL. Refer to the accompanying documentation for details
  * on usage and license.
  */
 
@@ -9,7 +11,7 @@
  * bstraux.c
  *
  * This file is not necessarily part of the core bstring library itself, but
- * is just an auxilliary module which includes miscellaneous or trivial 
+ * is just an auxilliary module which includes miscellaneous or trivial
  * functions.
  */
 
@@ -956,11 +958,12 @@ bstring b, t;
 			/* Double size, but deal with unusual case of numeric
 			   overflows */
 
-			if ((m = b->mlen << 1)   <= b->mlen &&
-			    (m = b->mlen + 1024) <= b->mlen &&
-			    (m = b->mlen + 16)   <= b->mlen &&
-			    (m = b->mlen + 1)    <= b->mlen) t = NULL;
-			else t = bfromcstralloc (m, "");
+			if (b->mlen <= INT_MAX / 2) m = b->mlen << 1; /* try the largest growth step that cannot overflow int */
+			else if (b->mlen <= INT_MAX - 1024) m = b->mlen + 1024;
+			else if (b->mlen <= INT_MAX - 16) m = b->mlen + 16;
+			else if (b->mlen <= INT_MAX - 1) m = b->mlen + 1;
+			else { bSecureDestroy (b); return NULL; } /* cannot grow: cleanse and free b before bailing out */
+			t = bfromcstralloc (m, "");
 
 			if (t) memcpy (t->data, b->data, i);
 			bSecureDestroy (b); /* Cleanse previous buffer */
@@ -1130,4 +1133,3 @@ void * parm;
 	free (ws);
 	return parm;
 }
-
diff --git a/bstraux.h b/bstraux.h
index c00c7b7..aba8e71 100644
--- a/bstraux.h
+++ b/bstraux.h
@@ -1,7 +1,7 @@
 /*
  * This source file is part of the bstring string library.  This code was
- * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source 
- * license and the GPL. Refer to the accompanying documentation for details 
+ * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
+ * license and the GPL. Refer to the accompanying documentation for details
  * on usage and license.
  */
 
@@ -9,7 +9,7 @@
  * bstraux.h
  *
  * This file is not a necessary part of the core bstring library itself, but
- * is just an auxilliary module which includes miscellaneous or trivial 
+ * is just an auxilliary module which includes miscellaneous or trivial
  * functions.
  */
 
@@ -24,7 +24,7 @@ extern "C" {
 #endif
 
 /* Safety mechanisms */
-#define bstrDeclare(b)               bstring (b) = NULL; 
+#define bstrDeclare(b)               bstring (b) = NULL;
 #define bstrFree(b)                  {if ((b) != NULL && (b)->slen >= 0 && (b)->mlen >= (b)->slen) { bdestroy (b); (b) = NULL; }}
 
 /* Backward compatibilty with previous versions of Bstrlib */
@@ -87,22 +87,22 @@ int bwsBuffLength (struct bwriteStream * stream, int sz);
 void * bwsClose (struct bwriteStream * stream);
 
 /* Security functions */
-#define bSecureDestroy(b) {	                                            \
-bstring bstr__tmp = (b);	                                            \
-	if (bstr__tmp && bstr__tmp->mlen > 0 && bstr__tmp->data) {          \
-	    (void) memset (bstr__tmp->data, 0, (size_t) bstr__tmp->mlen);   \
-	    bdestroy (bstr__tmp);                                           \
-	}                                                                   \
+#define bSecureDestroy(b) { /* zero all mlen bytes, then bdestroy */   \
+bstring bstr__tmp = (b); /* evaluate b exactly once */                  \
+    if (bstr__tmp && bstr__tmp->mlen > 0 && bstr__tmp->data) {          \
+        (void) memset (bstr__tmp->data, 0, (size_t) bstr__tmp->mlen);   \
+        bdestroy (bstr__tmp);                                           \
+    }                                                                   \
 }
-#define bSecureWriteProtect(t) {	                                              \
-	if ((t).mlen >= 0) {                                                          \
-	    if ((t).mlen > (t).slen)) {                                               \
-	        (void) memset ((t).data + (t).slen, 0, (size_t) (t).mlen - (t).slen); \
-	    }                                                                         \
-	    (t).mlen = -1;                                                            \
-	}                                                                             \
+#define bSecureWriteProtect(t) { /* wipe slack bytes, then set mlen = -1 */       \
+    if ((t).mlen >= 0) {                                                          \
+        if ((t).mlen > (t).slen) {                                                \
+            (void) memset ((t).data + (t).slen, 0, (size_t) (t).mlen - (t).slen); \
+        }                                                                         \
+        (t).mlen = -1;                                                            \
+    }                                                                             \
 }
-extern bstring bSecureInput (int maxlen, int termchar, 
+extern bstring bSecureInput (int maxlen, int termchar,
                              bNgetc vgetchar, void * vgcCtx);
 
 #ifdef __cplusplus
diff --git a/bstrlib.c b/bstrlib.c
index eb92ceb..6176a69 100644
--- a/bstrlib.c
+++ b/bstrlib.c
@@ -1559,9 +1559,9 @@ bstring auxf = (bstring) find;
 bstring auxr = (bstring) repl;
 
 	if (b == NULL || b->data == NULL || find == NULL ||
-	    find->data == NULL || repl == NULL || repl->data == NULL ||
-	    pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen ||
-	    b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR;
+		find->data == NULL || repl == NULL || repl->data == NULL ||
+		pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen ||
+		b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR;
 	if (pos > b->slen - find->slen) return BSTR_OK;
 
 	/* Alias with find string */
@@ -1639,12 +1639,12 @@ bstring auxr = (bstring) repl;
 
 	while ((pos = instr (b, pos, auxf)) >= 0) {
 		if (slen >= mlen - 1) {
-			int sl, *t;
-
+			int *t;
+			int vl;
 			mlen += mlen;
-			sl = sizeof (int *) * mlen;
+			vl = sizeof (int *) * mlen;
 			if (static_d == d) d = NULL; /* static_d cannot be realloced */
-			if (mlen <= 0 || sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) {
+			if (mlen <= 0 || vl < mlen || NULL == (t = (int *) bstr__realloc (d, vl))) {
 				ret = BSTR_ERR;
 				goto done;
 			}
@@ -1683,8 +1683,7 @@ bstring auxr = (bstring) repl;
 	}
 
 	done:;
-	if (static_d == d) d = NULL;
-	bstr__free (d);
+	if (static_d != d) bstr__free (d);
 	if (auxf != find) bdestroy (auxf);
 	if (auxr != repl) bdestroy (auxr);
 	return ret;
@@ -1700,8 +1699,8 @@ int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos) {
 	return findreplaceengine (b, find, repl, pos, binstr);
 }
 
-/*  int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl,
- *                    int pos)
+/*  int bfindreplacecaseless (bstring b, const_bstring find,
+ *                            const_bstring repl, int pos)
  *
  *  Replace all occurrences of a find string, ignoring case, with a replace
  *  string after a given point in a bstring.
diff --git a/bstrlib.txt b/bstrlib.txt
index d0f02f7..7cb2ede 100644
--- a/bstrlib.txt
+++ b/bstrlib.txt
@@ -588,9 +588,14 @@ test.cpp        - C++ unit/regression test for bstrwrap.cpp
 bsafe.c         - C runtime stubs to abort usage of unsafe C functions.
 bsafe.h         - C header file for bsafe.c functions.
 
-C projects need only include bstrlib.h and compile/link bstrlib.c to use the
-bstring library.  C++ projects need to additionally include bstrwrap.h and
-compile/link bstrwrap.cpp.  For both, there may be a need to make choices
+utf8util.c      - C implementation of generic utf8 parsing functions.
+utf8util.h      - C header file for generic utf8 parsing functions.
+buniutil.c      - C implementation of utf8 bstring packing/unpacking functions.
+buniutil.h      - C header file for utf8 bstring functions.
+
+C modules need only include bstrlib.h and compile/link bstrlib.c to use the
+basic bstring library.  C++ projects need to additionally include bstrwrap.h
+and compile/link bstrwrap.cpp.  For both, there may be a need to make choices
 about feature configuration as described in the "Configurable compilation
 options" in the section above.
 
@@ -1977,6 +1982,112 @@ The macros
 
 ===============================================================================
 
+Unicode functions
+-----------------
+
+    The two modules utf8util.c and buniutil.c implement basic functions for
+    parsing and collecting Unicode data in the UTF8 format.  Unicode is
+    described by a sequence of "code points" which are values between 0 and
+    1114111 inclusive mapped to symbol content corresponding to nearly all
+    the standardized scripts of the world.
+
+    The semantics of Unicode code points is varied and complicated.  The
+    base support of the better string library does not attempt to perform
+    any interpretation of these code points.  The better string library
+    solely provides support for iterating through unicode code points,
+    appending and extracting code points to and from bstrings, and parsing
+    UTF8 and UTF16 from raw data.
+
+    To use these functions compile and link utf8util.c and buniutil.c
+
+    ..........................................................................
+
+    extern void utf8IteratorInit (struct utf8Iterator* iter,
+                                  unsigned char* data, int slen);
+
+    Initialize a Unicode utf8 iterator to traverse an array of utf8 encoded
+    code points pointed to by data, with length slen from the start.  The
+    iterator iter is only valid for as long as the array it points to
+    is valid and not modified.
+
+    ..........................................................................
+
+    extern void utf8IteratorUninit (struct utf8Iterator* iter);
+
+    Invalidate utf8 iterator.
+
+    ..........................................................................
+
+    extern cpUcs4 utf8IteratorGetNextCodePoint (struct utf8Iterator* iter,
+                                                cpUcs4 errCh);
+
+    Parse code point the iterator is pointing at and advance the iterator to
+    the next code point.  If the iterator was pointing at a valid code point
+    the code point is returned, otherwise, errCh will be returned.
+
+    ..........................................................................
+
+    extern cpUcs4 utf8IteratorGetCurrCodePoint (struct utf8Iterator* iter,
+                                                cpUcs4 errCh);
+
+    Parse code point the iterator is pointing at.  If the iterator was
+    pointing at a valid code point the code point is returned, otherwise,
+    errCh will be returned.
+
+    ..........................................................................
+
+    extern int utf8ScanBackwardsForCodePoint (unsigned char* msg, int len,
+                                              int pos, cpUcs4* out);
+
+    From the position "pos" in the array msg of length len, search for the
+    last position before or at pos from which a valid Unicode code point
+    can be parsed.  If such an offset is found it is returned; otherwise
+    a negative value is returned.  The code point parsed is put into *out if
+    out is not NULL.
+
+    ..........................................................................
+
+    extern int buIsUTF8Content (const_bstring bu);
+
+    Scan a bstring and determine if it is made entirely of valid Unicode
+    code points.  If it is, 1 is returned, otherwise 0 is returned.
+
+    ..........................................................................
+
+    extern int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len,
+                                cpUcs4 errCh);
+
+    Append the code points passed in the UCS4 format (raw numbers) in the
+    array bu of length len.  Any unparsable characters are replaced by errCh.
+    If errCh is not a valid Unicode code point, then parsing errors will cause
+    BSTR_ERR to be returned.
+
+    ..........................................................................
+
+    extern int buGetBlkUTF16 (cpUcs2* ucs2, int len, cpUcs4 errCh,
+                              const_bstring bu, int pos);
+
+    Convert a string of UTF8 codepoints (bu), skipping the first pos, into a 
+    sequence of UTF16 encoded code points.  Returns the number of UCS2 16-bit
+    words written to the output.  No more than len words are written to the
+    target array ucs2.  If any code point in bu is unparsable, it will be
+    translated to errCh.
+
+    ..........................................................................
+
+    extern int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len,
+                                 cpUcs2* bom, cpUcs4 errCh);
+
+    Append an array of UCS2 code points (utf16) to UTF8 codepoints (bu).  Any
+    invalid code point is replaced by errCh.  If errCh is itself not a
+    valid code point, then this translation will halt upon the first error
+    and return BSTR_ERR.  Otherwise BSTR_OK is returned.  If a byte order mark
+    has been previously read, it may be passed in as bom, otherwise if *bom is
+    set to 0, it will be filled in with the BOM as read from the first
+    character if it is a BOM.
+
+===============================================================================
+
 The bstest module
 -----------------
 
@@ -3197,5 +3308,6 @@ Michael Hsieh
 Richard A. Smith
 Simon Ekstrom
 Wayne Scott
+Zed A. Shaw
 
 ===============================================================================
diff --git a/bstrwrap.cpp b/bstrwrap.cpp
index 31c9a5b..ce77f17 100644
--- a/bstrwrap.cpp
+++ b/bstrwrap.cpp
@@ -1,7 +1,7 @@
 /*
  * This source file is part of the bstring string library.  This code was
- * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source 
- * license and the GPL. Refer to the accompanying documentation for details 
+ * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
+ * license and the GPL. Refer to the accompanying documentation for details
  * on usage and license.
  */
 
diff --git a/bstrwrap.h b/bstrwrap.h
index 6a09b05..0ba63a0 100644
--- a/bstrwrap.h
+++ b/bstrwrap.h
@@ -1,7 +1,7 @@
 /*
  * This source file is part of the bstring string library.  This code was
- * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source 
- * license and the GPL. Refer to the accompanying documentation for details 
+ * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
+ * license and the GPL. Refer to the accompanying documentation for details
  * on usage and license.
  */
 
diff --git a/buniutil.c b/buniutil.c
new file mode 100644
index 0000000..7d3f102
--- /dev/null
+++ b/buniutil.c
@@ -0,0 +1,270 @@
+/*
+ * This source file is part of the bstring string library.  This code was
+ * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
+ * license and the GPL. Refer to the accompanying documentation for details
+ * on usage and license.
+ */
+
+/*
+ * buniutil.c
+ *
+ * This file is not necessarily part of the core bstring library itself, but
+ * is just an implementation of basic utf8 processing for bstrlib.  Note that
+ * this module is dependent upon bstrlib.c and utf8util.c
+ */
+
+#include "bstrlib.h"
+#include "buniutil.h"
+
+#define UNICODE__CODE_POINT__REPLACEMENT_CHARACTER (0xFFFDL)
+
+/*  int buIsUTF8Content (const_bstring bu)
+ *
+ *  Scan bu and return 1 if every code point read from it is accepted by the
+ *  check below.  Return 0 otherwise, including when bdata (bu) is NULL.
+ */
+int buIsUTF8Content (const_bstring bu) {
+struct utf8Iterator iter;
+
+	if (NULL == bdata (bu)) return 0;
+	for (utf8IteratorInit (&iter, bu->data, bu->slen);
+		 !utf8IteratorNoMore (&iter);) {
+		if (0 >= utf8IteratorGetNextCodePoint (&iter, -1)) return 0; /* errCh is -1; NOTE(review): "0 >=" also rejects U+0000 -- confirm intended */
+	}
+	return 1;
+}
+
+/*  int buGetBlkUTF16 (cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu,
+ *                     int pos)
+ *
+ *  Convert the UTF8 code points of bu, after skipping the first pos of them,
+ *  into UTF16 words.  All len words of ucs2 are written: the encoded words
+ *  followed by zero fill.  Returns the number of encoded words, or BSTR_ERR.
+ *  Any code point in bu that is unparsable is translated to errCh.
+ */
+int buGetBlkUTF16 (/* @out */ cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu, int pos) {
+struct tagbstring t;
+struct utf8Iterator iter;
+cpUcs4 ucs4;
+int i, j;
+
+	if (!isLegalUnicodeCodePoint (errCh)) errCh = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER; /* illegal errCh falls back to the replacement character */
+	if (NULL == ucs2 || 0 >= len || NULL == bdata (bu) || 0 > pos) return BSTR_ERR;
+
+	for (j=0, i=0; j < bu->slen; j++) { /* find byte offset j of code point number pos */
+		if (0x80 != (0xC0 & bu->data[j])) { /* not a 10xxxxxx continuation byte */
+			if (i >= pos) break;
+			i++;
+		}
+	}
+
+	t.mlen = -1;
+	t.data = bu->data + j;
+	t.slen = bu->slen - j;
+
+	utf8IteratorInit (&iter, t.data, t.slen);
+
+	ucs4 = BSTR_ERR;
+	for (i=0; 0 < len && !utf8IteratorNoMore (&iter) && 0 <= (ucs4 = utf8IteratorGetNextCodePoint (&iter, errCh)); i++) { /* i counts words written */
+		if (ucs4 < 0x10000) {
+			*ucs2++ = (cpUcs2) ucs4;
+			len--;
+		} else {
+			if (len < 2) { /* no room for a surrogate pair: emit one replacement word */
+				*ucs2++ = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER;
+				len--;
+			} else {
+				long y = ucs4 - 0x10000;
+				ucs2[0] = (cpUcs2) (0xD800 | (y >> 10));
+				ucs2[1] = (cpUcs2) (0xDC00 | (y & 0x03FF));
+				len -= 2;
+				ucs2 += 2;
+				i++; /* the pair is two words; the loop increment adds the other one */
+			}
+		}
+	}
+	while (0 < len) { /* zero the rest of the output array */
+		*ucs2++ = 0;
+		len--;
+	}
+
+	utf8IteratorUninit (&iter);
+	if (0 > ucs4) return BSTR_ERR; /* NOTE(review): ucs4 is still the initial BSTR_ERR when nothing was decoded -- confirm that empty input should fail */
+	return i;
+}
+
+/*
+
+Unicode                   UTF-8
+-------                   -----
+U-00000000 - U-0000007F:  0xxxxxxx  
+U-00000080 - U-000007FF:  110xxxxx 10xxxxxx  
+U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx  
+U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx  
+U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 
+
+UTF-32: U-000000 - U-10FFFF
+
+*/
+
+/*  int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh)
+ *
+ *  Append the len UCS4 code points in bu to b, encoded as UTF8.  Any
+ *  invalid code point is replaced by errCh.  If errCh is itself not a
+ *  valid code point, the first invalid input halts the translation.  On
+ *  that or any append failure b->slen is reset and BSTR_ERR is returned.
+ */
+int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh) {
+int i, oldSlen;
+
+	if (NULL == bu || NULL == b || 0 > len || 0 > (oldSlen = blengthe (b, -1))) return BSTR_ERR;
+	if (!isLegalUnicodeCodePoint (errCh)) errCh = ~0; /* ~0 marks "no usable replacement" */
+
+	for (i=0; i < len; i++) {
+		unsigned char c[6]; /* only c[0..3] are written while the #if 0 branches stay disabled */
+		cpUcs4 v = bu[i];
+
+		if (!isLegalUnicodeCodePoint (v)) {
+			if (~0 == errCh) {
+				b->slen = oldSlen; /* undo everything appended by this call */
+				return BSTR_ERR;
+			}
+			v = errCh;
+		}
+
+		if (v < 0x80) { /* 1 byte: 0xxxxxxx */
+			if (BSTR_OK != bconchar (b, (char) v)) {
+				b->slen = oldSlen;
+				return BSTR_ERR;
+			}
+		} else if (v < 0x800) { /* 2 bytes: 110xxxxx 10xxxxxx */
+			c[0] = (unsigned char) ( (v >>  6)         + 0xc0);
+			c[1] = (unsigned char) ((        v & 0x3f) + 0x80);
+			if (BSTR_OK != bcatblk (b, c, 2)) {
+				b->slen = oldSlen;
+				return BSTR_ERR;
+			}
+		} else if (v < 0x10000) { /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
+			c[0] = (unsigned char) ( (v >> 12)         + 0xe0);
+			c[1] = (unsigned char) (((v >>  6) & 0x3f) + 0x80);
+			c[2] = (unsigned char) ((        v & 0x3f) + 0x80);
+			if (BSTR_OK != bcatblk (b, c, 3)) {
+				b->slen = oldSlen;
+				return BSTR_ERR;
+			}
+		} else 
+#if 0
+			if (v < 0x200000)
+#endif
+		{ /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+			c[0] = (unsigned char) ( (v >> 18)         + 0xf0);
+			c[1] = (unsigned char) (((v >> 12) & 0x3f) + 0x80);
+			c[2] = (unsigned char) (((v >>  6) & 0x3f) + 0x80);
+			c[3] = (unsigned char) ((        v & 0x3f) + 0x80);
+			if (BSTR_OK != bcatblk (b, c, 4)) {
+				b->slen = oldSlen;
+				return BSTR_ERR;
+			}
+		} 
+#if 0		
+		else if (v < 0x4000000) {
+			c[0] = (unsigned char) ( (v >> 24)         + 0xf8);
+			c[1] = (unsigned char) (((v >> 18) & 0x3f) + 0x80);
+			c[2] = (unsigned char) (((v >> 12) & 0x3f) + 0x80);
+			c[3] = (unsigned char) (((v >>  6) & 0x3f) + 0x80);
+			c[4] = (unsigned char) ((        v & 0x3f) + 0x80);
+			if (BSTR_OK != bcatblk (b, c, 5)) {
+				b->slen = oldSlen;
+				return BSTR_ERR;
+			}
+		} else {
+			c[0] = (unsigned char) ( (v >> 30)         + 0xfc);
+			c[1] = (unsigned char) (((v >> 24) & 0x3f) + 0x80);
+			c[2] = (unsigned char) (((v >> 18) & 0x3f) + 0x80);
+			c[3] = (unsigned char) (((v >> 12) & 0x3f) + 0x80);
+			c[4] = (unsigned char) (((v >>  6) & 0x3f) + 0x80);
+			c[5] = (unsigned char) ((        v & 0x3f) + 0x80);
+			if (BSTR_OK != bcatblk (b, c, 6)) {
+				b->slen = oldSlen;
+				return BSTR_ERR;
+			}
+		}
+#endif
+	}
+	return BSTR_OK;
+}
+
+#define endSwap(cs,mode) ((mode) ? ((((cs) & 0xFF) << 8) | (((cs) >> 8) & 0xFF)) : (cs))
+#define TEMP_UCS4_BUFFER_SIZE (64)
+
+/*  int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len,
+ *                        cpUcs2* bom, cpUcs4 errCh)
+ *
+ *  Append an array of len UTF16 words (utf16) to bu as UTF8 code points.  Any
+ *  invalid or unpaired surrogate is replaced by errCh.  If errCh is itself
+ *  not a valid code point, then this translation will halt upon the first
+ *  error, reset bu->slen and return BSTR_ERR.  Otherwise BSTR_OK is returned.
+ *  If a byte order mark has been previously read, it may be passed in as
+ *  bom, otherwise if *bom is set to 0, it will be filled in with the BOM as
+ *  read from the first character if it is a BOM.
+ */
+int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, cpUcs2* bom, cpUcs4 errCh) {
+cpUcs4 buff[TEMP_UCS4_BUFFER_SIZE];
+int cc, i, sm, oldSlen;
+
+	if (NULL == bdata(bu) || NULL == utf16 || len < 0) return BSTR_ERR;
+	if (!isLegalUnicodeCodePoint (errCh)) errCh = ~0; /* ~0 marks "no usable replacement" */
+	if (len == 0) return BSTR_OK;
+
+	oldSlen = bu->slen;
+	i = 0;
+
+	/* Check for BOM character and select endianess.  Also remove the
+	   BOM from the stream, since there is no need for it in a UTF-8 encoding. */
+	if (bom && (cpUcs2) 0xFFFE == *bom) {
+		sm = 8; /* any non-zero sm makes endSwap exchange the two bytes */
+	} else if (bom && (cpUcs2) 0xFEFF == *bom) {
+		sm = 0;
+	} else if (utf16[i] == (cpUcs2) 0xFFFE) {
+		if (bom) *bom = utf16[i];
+		sm = 8;
+		i++;
+	} else if (utf16[i] == (cpUcs2) 0xFEFF) {
+		if (bom) *bom = utf16[i];
+		sm = 0;
+		i++;
+	} else {
+		sm = 0; /* Assume local endianness. */
+	}
+
+	cc = 0;
+	for (;i < len; i++) {
+		cpUcs4 c, v = endSwap (utf16[i], sm);
+		if ((v | 0x7FF) == 0xDFFF) { /* Deal with surrogate pairs */
+			if (v >= 0xDC00 || i + 1 >= len) { /* lone low half, or a high half with no word after it */
+				ErrMode:;
+				if (~0 == errCh) {
+					ErrReturn:;
+					bu->slen = oldSlen; /* undo everything appended by this call */
+					return BSTR_ERR;
+				}
+				v = errCh;
+			} else {
+				i++;
+				if ((0x3FF | (c = endSwap (utf16[i], sm))) != 0xDFFF) goto ErrMode; /* second word must be DC00..DFFF */
+				v = ((v - 0xD800) << 10) + (c - 0xDC00) + 0x10000;
+			}
+		}
+		buff[cc] = v;
+		cc++;
+		if (cc >= TEMP_UCS4_BUFFER_SIZE) { /* flush the staging buffer when full */
+			if (0 > buAppendBlkUcs4 (bu, buff, cc, errCh)) goto ErrReturn;
+			cc = 0;
+		}
+	}
+	if (cc > 0 && 0 > buAppendBlkUcs4 (bu, buff, cc, errCh)) goto ErrReturn;
+
+	return BSTR_OK;
+}
diff --git a/buniutil.h b/buniutil.h
new file mode 100644
index 0000000..1017212
--- /dev/null
+++ b/buniutil.h
@@ -0,0 +1,37 @@
+/*
+ * This source file is part of the bstring string library.  This code was
+ * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
+ * license and the GPL. Refer to the accompanying documentation for details
+ * on usage and license.
+ */
+
+/*
+ * buniutil.h
+ *
+ * This file is the interface for the buniutil basic "Unicode for bstrings"
+ * functions.  Note that there are dependencies on bstrlib.h and utf8util.h .
+ */
+
+#ifndef BSTRLIB_UNICODE_UTILITIES
+#define BSTRLIB_UNICODE_UTILITIES
+
+#include "utf8util.h"
+#include "bstrlib.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int buIsUTF8Content (const_bstring bu);
+extern int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh);
+
+/* For those unfortunate enough to be stuck supporting UTF16. */
+extern int buGetBlkUTF16 (/* @out */ cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu, int pos);
+extern int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, cpUcs2* bom, cpUcs4 errCh);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BSTRLIB_UNICODE_UTILITIES */
+
diff --git a/test.cpp b/test.cpp
index 2f15ac8..0976667 100644
--- a/test.cpp
+++ b/test.cpp
@@ -1,11 +1,11 @@
 //
 // This source file is part of the bstring string library.  This code was
-// written by Paul Hsieh in 2002-2006, and is covered by the BSD open source 
-// license. Refer to the accompanying documentation for details on usage and 
+// written by Paul Hsieh in 2002-2006, and is covered by the BSD open source
+// license. Refer to the accompanying documentation for details on usage and
 // license.
 //
 
-// 
+//
 // test.cpp
 //
 // This file is the C++ unit test for Bstrlib
@@ -16,7 +16,7 @@
 #include "bstrlib.h"
 #include "bstrwrap.h"
 
-// Exceptions must be turned on in the compiler to successfully run 
+// Exceptions must be turned on in the compiler to successfully run
 // this test.  The compiler must also support STL.
 
 #define dumpOutQty (32)
@@ -1559,6 +1559,69 @@ int ret = 0;
 	return ret;
 }
 
+/*  int bMultiCountConcat (bstring dst, int n, ...)
+ *
+ *  Append the n bstring arguments after n to dst; stop on the first failure.
+ */
+int bMultiCountConcat (bstring dst, int n, ...) {
+va_list ap;
+int left, status = 0;
+	va_start (ap, n);
+	for (left = n; left > 0 && status >= 0; left--) {
+		bstring piece = va_arg (ap, bstring);
+		status = bconcat (dst, piece);
+	}
+	va_end (ap);
+	return status;
+}
+
+/*  int bMultiCountCatCstr (bstring dst, int n, ...)
+ *
+ *  Append the n char * arguments after n to dst; stop on the first failure.
+ */
+int bMultiCountCatCstr (bstring dst, int n, ...) {
+va_list ap;
+int left, status = 0;
+	va_start (ap, n);
+	for (left = n; left > 0 && status >= 0; left--) {
+		char * piece = va_arg (ap, char *);
+		status = bcatcstr (dst, piece);
+	}
+	va_end (ap);
+	return status;
+}
+
+/*
+ *  These can be dangerous because there is no compile-time type checking
+ *  on the arguments.
+ */
+
+extern int bMultiCountConcat (bstring dst, int n, ...);
+extern int bMultiCountCatCstr (bstring dst, int n, ...);
+
+#if defined(__GNUC__)
+# define COUNT_ARGS(...) COUNT_ARGS_(,##__VA_ARGS__,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)
+# define COUNT_ARGS_(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_10,_11,_12,_13,_14,_15,_16,_17,_18,_19,_20,_21,_22,_23,_24,_25,_26,_27,_28,_29,_30,_31,_32,_33,_34,_35,_36,_37,_38,_39,_40,_41,_42,_43,_44,_45,_46,_47,_48,_49,_50,_51,_52,_53,_54,_55,_56,_57,_58,_59,_60,_61,_62,_63,_64,_65,_66,_67,_68,_69,_70,_71,_72,_73,_74,_75,_76,_77,_78,_79,_80,_81,_82,_83,_84,_85,_86,_87,_88,_89,_90,_91,_92,_93,_94,_95,_96,_97,_98,_99,cnt,...) cnt
+#else
+# if defined(__WATCOMC__) || defined(_MSC_VER)
+#  define COUNT_ARGS(...) ARGCNT_ARGINDEX100((ARGCNT_0_LENGTH_ ## __VA_ARGS__ ## _SPECIAL_CASE,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1))
+#  define ARGCNT_ARGINDEX100(__args) ARGCNT_ARGINDEX100_RAW __args
+#  define ARGCNT_0_LENGTH__SPECIAL_CASE ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
+#  define ARGCNT_ARGINDEX100_RAW(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_10,_11,_12,_13,_14,_15,_16,_17,_18,_19,_20,_21,_22,_23,_24,_25,_26,_27,_28,_29,_30,_31,_32,_33,_34,_35,_36,_37,_38,_39,_40,_41,_42,_43,_44,_45,_46,_47,_48,_49,_50,_51,_52,_53,_54,_55,_56,_57,_58,_59,_60,_61,_62,_63,_64,_65,_66,_67,_68,_69,_70,_71,_72,_73,_74,_75,_76,_77,_78,_79,_80,_81,_82,_83,_84,_85,_86,_87,_88,_89,_90,_91,_92,_93,_94,_95,_96,_97,_98,_99,n,...) n
+# endif
+#endif
+
+/*
+ * The following macros are only available on more recent compilers.
+ * They can process up to 100 arguments.  These can also be dangerous because
+ * there is no compile-time type checking on the arguments.
+ */
+
+#if defined(COUNT_ARGS)
+# define bMultiConcat(dst,...)  bMultiCountConcat((dst),COUNT_ARGS(__VA_ARGS__),##__VA_ARGS__)
+# define bMultiCatCstr(dst,...) bMultiCountCatCstr((dst),COUNT_ARGS(__VA_ARGS__),##__VA_ARGS__)
+#endif
+
 int main () {
 int ret = 0;