/*
 * This source file is part of the bstring string library.  This code was
 * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
 * license and the GPL. Refer to the accompanying documentation for details
 * on usage and license.
 */

/*
 * buniutil.c
 *
 * This file is not necessarily part of the core bstring library itself, but
 * is just an implementation of basic utf8 processing for bstrlib.  Note that
 * this module is dependent upon bstrlib.c and utf8util.c
 */

#include "bstrlib.h"
#include "buniutil.h"

#define UNICODE__CODE_POINT__REPLACEMENT_CHARACTER (0xFFFDL)

/*  int buIsUTF8Content (const_bstring bu)
 *
 *  Walk the whole of bu with a UTF8 iterator and report whether every
 *  decoded code point was accepted.  Returns 1 when the scan reaches the end
 *  of the string, 0 otherwise.  A bstring without data yields 0.
 *
 *  Note that the iterator is asked to report failures as -1 and that any
 *  result <= 0 stops the scan, so a decoded value of 0 (U+0000) is also
 *  reported as "not UTF8 content".
 */
int buIsUTF8Content (const_bstring bu) {
    struct utf8Iterator it;

    if (NULL == bdata (bu)) return 0;

    utf8IteratorInit (&it, bu->data, bu->slen);
    while (it.next < it.slen) {
        cpUcs4 cp = utf8IteratorGetNextCodePoint (&it, -1);

        /* -1 is the error marker passed above; 0 is rejected as well. */
        if (cp <= 0) return 0;
    }
    return 1;
}

/*  int buGetBlkUTF16 (cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu,
 *                     int pos)
 *
 *  Convert a string of UTF8 codepoints (bu) skipping the first pos, into a
 *  sequence of UTF16 encoded code points.  Returns the number of UCS2 16-bit
 *  words written to the output.  No more than len words are written to the
 *  target array ucs2.  If any code point in bu is unparsable, it will be
 *  translated to errCh.
*/
int buGetBlkUTF16 (/* @out */ cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu, int pos) {
    struct utf8Iterator it;
    cpUcs4 cp = BSTR_ERR;   /* stays negative if nothing is ever decoded */
    int skipped = 0;        /* code points skipped so far                */
    int start = 0;          /* byte offset at which decoding begins      */
    int written = 0;        /* 16-bit words stored in the output         */

    /* An unusable error character falls back to U+FFFD. */
    if (!isLegalUnicodeCodePoint (errCh)) errCh = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER;
    if (NULL == ucs2 || 0 >= len || NULL == bdata (bu) || 0 > pos) return BSTR_ERR;

    /* Locate the byte offset of code point number pos: every byte that is
       not a 10xxxxxx continuation byte is counted as the start of one. */
    while (start < bu->slen) {
        if (0x80 != (0xC0 & bu->data[start])) {
            if (skipped >= pos) break;
            skipped++;
        }
        start++;
    }

    utf8IteratorInit (&it, bu->data + start, bu->slen - start);

    while (0 < len && it.next < it.slen) {
        cp = utf8IteratorGetNextCodePoint (&it, errCh);
        if (0 > cp) break;

        if (cp >= 0x10000 && len >= 2) {
            /* Outside the BMP with room for both halves: surrogate pair. */
            long off = cp - 0x10000;

            *ucs2++ = (cpUcs2) (0xD800 | (off >> 10));
            *ucs2++ = (cpUcs2) (0xDC00 | (off & 0x03FF));
            len -= 2;
            written += 2;
        } else {
            /* Either a BMP code point, or a supplementary one with only a
               single output slot left; the latter becomes U+FFFD. */
            *ucs2++ = (cp < 0x10000) ? (cpUcs2) cp
                                     : (cpUcs2) UNICODE__CODE_POINT__REPLACEMENT_CHARACTER;
            len--;
            written++;
        }
    }

    /* Whatever is left of the output array is zero filled. */
    while (0 < len) {
        *ucs2++ = 0;
        len--;
    }

    utf8IteratorUninit (&it);

    /* A negative cp means the last decode failed or none took place at all
       (for example when no input remains past pos). */
    if (0 > cp) return BSTR_ERR;
    return written;
}

/*
Unicode                   UTF-8
-------                   -----
U-00000000 - U-0000007F:  0xxxxxxx
U-00000080 - U-000007FF:  110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx
U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

UTF-32: U-000000 - U-10FFFF
*/

/*  int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh)
 *
 *  Convert an array of UCS4 code points (bu) to UTF8 codepoints b.  Any
 *  invalid code point is replaced by errCh.  If errCh is itself not a
 *  valid code point, then this translation will halt upon the first error
 *  and return BSTR_ERR.  Otherwise BSTR_OK is returned.
*/
int buAppendBlkUcs4 (bstring b, const cpUcs4* bu, int len, cpUcs4 errCh) {
    int i, oldSlen;

    /*
     * Parameters:
     *   b     - destination bstring; UTF8 bytes are appended to it.
     *   bu    - array of len code points to encode.
     *   len   - number of entries in bu; must not be negative.
     *   errCh - substitute for illegal entries of bu.
     *
     * Returns BSTR_OK on success.  Returns BSTR_ERR on bad arguments, on an
     * illegal code point when errCh is itself illegal, or when an append
     * fails; in the latter two cases everything appended by this call is
     * removed again and b is left '\0' terminated at its original length.
     */
    if (NULL == bu || NULL == b || 0 > len || 0 > (oldSlen = blengthe (b, -1))) return BSTR_ERR;

    /* ~0 marks "no usable substitute": the first bad input aborts. */
    if (!isLegalUnicodeCodePoint (errCh)) errCh = ~0;

    for (i = 0; i < len; i++) {
        unsigned char c[4];
        int n;
        cpUcs4 v = bu[i];

        if (!isLegalUnicodeCodePoint (v)) {
            if (~0 == errCh) goto ErrReturn;
            v = errCh;
        }

        /* Only the 1 to 4 byte forms are produced.  This relies on
           isLegalUnicodeCodePoint rejecting anything above U+10FFFF
           (defined outside this file -- confirm in utf8util.h), so the
           historical 5 and 6 byte forms can never be needed. */
        if (v < 0x80) {
            c[0] = (unsigned char) v;
            n = 1;
        } else if (v < 0x800) {
            c[0] = (unsigned char) ( (v >>  6)         + 0xc0);
            c[1] = (unsigned char) (( v        & 0x3f) + 0x80);
            n = 2;
        } else if (v < 0x10000) {
            c[0] = (unsigned char) ( (v >> 12)         + 0xe0);
            c[1] = (unsigned char) (((v >>  6) & 0x3f) + 0x80);
            c[2] = (unsigned char) (( v        & 0x3f) + 0x80);
            n = 3;
        } else {
            c[0] = (unsigned char) ( (v >> 18)         + 0xf0);
            c[1] = (unsigned char) (((v >> 12) & 0x3f) + 0x80);
            c[2] = (unsigned char) (((v >>  6) & 0x3f) + 0x80);
            c[3] = (unsigned char) (( v        & 0x3f) + 0x80);
            n = 4;
        }

        if (BSTR_OK != bcatblk (b, c, n)) goto ErrReturn;
    }

    return BSTR_OK;

ErrReturn:
    /* Undo the partial append.  Resetting slen alone would leave the first
       appended byte sitting where the terminator belongs, so the '\0' is put
       back too.  The store only happens if something was appended, which
       means b->data is valid and writable at that index. */
    if (b->slen != oldSlen) {
        b->slen = oldSlen;
        b->data[oldSlen] = (unsigned char) '\0';
    }
    return BSTR_ERR;
}
#define endSwap(cs,mode) ((mode) ? ((((cs) & 0xFF) << 8) | (((cs) >> 8) & 0xFF)) : (cs)) #define TEMP_UCS4_BUFFER_SIZE (64) /* int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, * cpUcs2* bom, cpUcs4 errCh) * * Append an array of UCS2 code points (utf16) to UTF8 codepoints (bu). Any * invalid code point is replaced by errCh. If errCh is itself not a * valid code point, then this translation will halt upon the first error * and return BSTR_ERR. Otherwise BSTR_OK is returned. If a byte order mark * has been previously read, it may be passed in as bom, otherwise if *bom is * set to 0, it will be filled in with the BOM as read from the first * character if it is a BOM. */ int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, cpUcs2* bom, cpUcs4 errCh) { cpUcs4 buff[TEMP_UCS4_BUFFER_SIZE]; int cc, i, sm, oldSlen; if (NULL == bdata(bu) || NULL == utf16 || len < 0) return BSTR_ERR; if (!isLegalUnicodeCodePoint (errCh)) errCh = ~0; if (len == 0) return BSTR_OK; oldSlen = bu->slen; i = 0; /* Check for BOM character and select endianess. Also remove the BOM from the stream, since there is no need for it in a UTF-8 encoding. */ if (bom && (cpUcs2) 0xFFFE == *bom) { sm = 8; } else if (bom && (cpUcs2) 0xFEFF == *bom) { sm = 0; } else if (utf16[i] == (cpUcs2) 0xFFFE) { if (bom) *bom = utf16[i]; sm = 8; i++; } else if (utf16[i] == (cpUcs2) 0xFEFF) { if (bom) *bom = utf16[i]; sm = 0; i++; } else { sm = 0; /* Assume local endianness. 
*/ } cc = 0; for (;i < len; i++) { cpUcs4 c, v; v = endSwap (utf16[i], sm); if ((v | 0x7FF) == 0xDFFF) { /* Deal with surrogate pairs */ if (v >= 0xDC00 || i >= len) { ErrMode:; if (~0 == errCh) { ErrReturn:; bu->slen = oldSlen; return BSTR_ERR; } v = errCh; } else { i++; if ((c = endSwap (utf16[i], sm) - 0xDC00) > 0x3FF) goto ErrMode; v = ((v - 0xD800) << 10) + c + 0x10000; } } buff[cc] = v; cc++; if (cc >= TEMP_UCS4_BUFFER_SIZE) { if (0 > buAppendBlkUcs4 (bu, buff, cc, errCh)) goto ErrReturn; cc = 0; } } if (cc > 0 && 0 > buAppendBlkUcs4 (bu, buff, cc, errCh)) goto ErrReturn; return BSTR_OK; }