Last part of fixes for handling UTF-8 coded strings (Bugs 572865 & 628893)

At the end of a double quoted string, readstrexp() was passing byte count
instead of character count to mk_counted_string(). Cleaned up basic_inchar().
It ignores bad UTF-8 coded characters when reading from file or buffer.
This commit is contained in:
Kevin Cozens 2010-09-19 17:46:41 -04:00
parent 6855267470
commit ae6670ba8b
1 changed files with 51 additions and 51 deletions

View File

@ -1043,11 +1043,11 @@ static char *store_string(scheme *sc, int char_cnt,
else
len = q2 - str;
q=(gchar*)sc->malloc(len+1);
}
else {
} else {
len = g_unichar_to_utf8(fill, utf8);
q=(gchar*)sc->malloc(char_cnt*len+1);
}
if(q==0) {
sc->no_memory=1;
return sc->strbuff;
@ -1565,82 +1565,81 @@ static void port_close(scheme *sc, pointer p, int flag) {
}
}
/* This routine will ignore byte sequences that are not valid UTF-8 */
static gunichar basic_inchar(port *pt) {
int len;
if(pt->kind & port_file) {
unsigned char utf8[7];
int c;
int i;
c = fgetc(pt->rep.stdio.file);
if (c == EOF) return EOF;
utf8[0] = c;
while (TRUE)
{
if (utf8[0] <= 0x7f)
{
return (gunichar) utf8[0];
}
if (c == EOF) return EOF;
/* Check for valid lead byte per RFC-3629 */
if (utf8[0] >= 0xc2 && utf8[0] <= 0xf4)
if (c <= 0x7f)
return (gunichar) c;
/* Is this byte an invalid lead per RFC-3629? */
if (c < 0xc2 || c > 0xf4)
{
len = utf8_length[utf8[0] & 0x3F];
/* Ignore invalid lead byte and get the next characer */
c = fgetc(pt->rep.stdio.file);
}
else /* Byte is valid lead */
{
unsigned char utf8[7];
int len;
int i;
utf8[0] = c; /* Save the lead byte */
len = utf8_length[c & 0x3F];
for (i = 1; i <= len; i++)
{
c = fgetc(pt->rep.stdio.file);
if (c == EOF) return EOF;
utf8[i] = c;
if ((utf8[i] & 0xc0) != 0x80)
{
/* Stop reading if this is not a continuation character */
if ((c & 0xc0) != 0x80)
break;
}
utf8[i] = c;
}
if (i > len)
if (i > len) /* Read the expected number of bytes? */
{
return g_utf8_get_char ((char *) utf8);
return g_utf8_get_char_validated ((char *) utf8,
sizeof(utf8));
}
/* we did not get enough continuation characters. */
utf8[0] = utf8[i]; /* ignore and restart */
/* Not enough continuation characters so ignore and restart */
}
else
{
/* Everything else is invalid and will be ignored */
c = fgetc(pt->rep.stdio.file);
if (c == EOF) return EOF;
utf8[0] = c;
}
}
} /* end of while (TRUE) */
} else {
if(*pt->rep.string.curr == 0 ||
pt->rep.string.curr == pt->rep.string.past_the_end) {
return EOF;
} else {
gunichar c;
gunichar c;
int len;
while (TRUE)
{
/* Found NUL or at end of input buffer? */
if (*pt->rep.string.curr == 0 ||
pt->rep.string.curr == pt->rep.string.past_the_end) {
return EOF;
}
len = pt->rep.string.past_the_end - pt->rep.string.curr;
c = g_utf8_get_char_validated(pt->rep.string.curr, len);
if (c < 0)
if (c >= 0) /* Valid UTF-8 character? */
{
pt->rep.string.curr = g_utf8_find_next_char(pt->rep.string.curr,
pt->rep.string.past_the_end);
if (pt->rep.string.curr == NULL)
pt->rep.string.curr = pt->rep.string.past_the_end;
c = ' ';
}
else
{
len = g_unichar_to_utf8(c, NULL);
len = g_unichar_to_utf8(c, NULL); /* Length of UTF-8 sequence */
pt->rep.string.curr += len;
return c;
}
return c;
}
/* Look for next valid UTF-8 character in buffer */
pt->rep.string.curr = g_utf8_find_next_char(pt->rep.string.curr,
pt->rep.string.past_the_end);
} /* end of while (TRUE) */
}
}
@ -1813,7 +1812,8 @@ static pointer readstrexp(scheme *sc) {
break;
case '"':
*p=0;
return mk_counted_string(sc,sc->strbuff,p-sc->strbuff);
return mk_counted_string(sc,sc->strbuff,
g_utf8_strlen(sc->strbuff, sizeof(sc->strbuff)));
default:
len = g_unichar_to_utf8(c, p);
p += len;
@ -2855,7 +2855,7 @@ static pointer opexe_0(scheme *sc, enum scheme_opcodes op) {
s_goto(sc,OP_EVAL);
case OP_DEF1: /* define */
x=find_slot_in_env(sc,sc->envir,sc->code,0);
x=find_slot_in_env(sc,sc->envir,sc->code,0);
if (x != sc->NIL) {
set_slot_in_env(sc, x, sc->value);
} else {