Skip to content

Commit

Permalink
* ADDED: Turn UTF-8 characters > 255 into character entities
Browse files Browse the repository at this point in the history
  • Loading branch information
Jan Wielemaker committed Jul 8, 2003
1 parent 3d13035 commit 72f1176
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 3 deletions.
3 changes: 3 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
Jul 8, 2003

* ADDED: handle UTF-8 sequences producing characters that cannot be
represented in the output character set by passing them on as character entities. Suggested by C. M. Sperberg-McQueen.

* FIXED: handling &#X, where 128<=X<256 with UTF-8 decoding enabled.
C. M. Sperberg-McQueen.

Expand Down
2 changes: 2 additions & 0 deletions Test/ok/utf8-cent.ok
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[element(testdoc, [id='t7-20020923', resp='MSM'], ['\n', element(names, [], ['From Española -- a ', entity(8216), test, entity(8217), ' for you.']), '\n', element(nums, [], ['From Española -- a ', entity(8216), test, entity(8217), ' for you.']), '\n', element(names, [], ['From Española -- a ', entity(8216), test, entity(8217), ' for you.']), '\n', element(nums, [], ['From Española -- a ', entity(8216), test, entity(8217), ' for you.']), '\n'])].
[].
7 changes: 7 additions & 0 deletions Test/utf8-cent.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<testdoc id="t7-20020923" resp="MSM">
<names>From Española -- a ‘test’ for you.</names>
<nums>From Española -- a ‘test’ for you.</nums>
<names>From Española -- a ‘test’ for you.</names>
<nums>From Española -- a ‘test’ for you.</nums>
</testdoc>
19 changes: 16 additions & 3 deletions parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,8 @@ expand_entities(dtd_parser *p, const ichar *in, ochar *out, int len)
{ int chr;

in = __utf8_get_char(in, &chr);
if ( chr >= OUTPUT_CHARSET_SIZE )
gripe(ERC_REPRESENTATION, "character");
*out++ = chr;
}
#endif
Expand Down Expand Up @@ -4866,15 +4868,26 @@ putchar_dtd_parser(dtd_parser *p, int chr)
}
#ifdef UTF8
case S_UTF8:
{ if ( (chr & 0xc0) != 0x80 ) /* TBD: recover */
if ( (chr & 0xc0) != 0x80 ) /* TBD: recover */
gripe(ERC_SYNTAX_ERROR, "Bad UTF-8 sequence", "");
p->utf8_char <<= 6;
p->utf8_char |= (chr & ~0xc0);
if ( --p->utf8_left == 0 )
{ add_cdata(p, p->utf8_char); /* verbatim? */
{ if ( p->utf8_char >= OUTPUT_CHARSET_SIZE &&
p->mark_state == MS_INCLUDE )
{ if ( p->on_entity )
{ process_cdata(p, FALSE);
(*p->on_entity)(p, NULL, p->utf8_char);
goto utf8_done;
} else
gripe(ERC_REPRESENTATION, "character");
}
add_cdata(p, p->utf8_char); /* verbatim? */
utf8_done:
p->state = p->utf8_saved_state;
}
}

break;
#endif
}
}
Expand Down

0 comments on commit 72f1176

Please sign in to comment.