diff --git a/cpan/CPAN-Meta-YAML/t/11_read_string.t b/cpan/CPAN-Meta-YAML/t/11_read_string.t index 491fd8e8ee2a..296d22fe9568 100644 --- a/cpan/CPAN-Meta-YAML/t/11_read_string.t +++ b/cpan/CPAN-Meta-YAML/t/11_read_string.t @@ -40,11 +40,11 @@ subtest 'invalid UTF-8' => sub { # get invalid UTF-8 by reading Latin-1 with lax :utf8 layer my $string = do { local $SIG{__WARN__} = sub {}; - slurp( test_data_file('latin1.yml'), ":utf8" ); + slurp( test_data_file('latin1.yml'), ":utf8_lax" ); }; my $obj = eval { CPAN::Meta::YAML->read_string($string); }; is( $obj, undef, "read_string should return undef" ); - error_like( qr/invalid UTF-8 string/, + error_like( qr/UTF-8/, "Got expected error about invalid UTF-8 string" ); }; diff --git a/lib/PerlIO.pm b/lib/PerlIO.pm index 01a02cf6486a..d1dd74761446 100644 --- a/lib/PerlIO.pm +++ b/lib/PerlIO.pm @@ -1,6 +1,6 @@ package PerlIO; -our $VERSION = '1.12'; +our $VERSION = '1.13'; # Map layer name to package that defines it our %alias; @@ -175,17 +175,10 @@ between valid UTF-8 bytes and valid Unicode characters. =item :bytes -This is the inverse of the C<:utf8> pseudo-layer. It turns off the flag -on the layer below so that data read from it is considered to -be Perl's internal downgraded encoding, thus interpreted as the native -single-byte encoding of Latin-1 or EBCDIC. Likewise on output Perl will -warn if a "wide" character (a codepoint not in the range 0..255) is -written to a such a stream. - -This is very dangerous to push on a handle using an C<:encoding> layer, -as such a layer assumes to be working with Perl's internal upgraded -encoding, so you will likely get a mangled result. Instead use C<:raw> or -C<:pop> to remove encoding layers. +This removes all layers that do unicode IO, such as C<:utf8> and +C<:encoding>. It ensures that data read from it is considered to be +"octets" i.e. characters in the range 0..255 only. Likewise on output perl +will warn if a "wide" character is written to a such a stream. =item :raw @@ -203,9 +196,9 @@ to add C<:perlio> to the PERLIO environment variable, or open the handle explicitly with that layer, to replace the platform default of C<:crlf>. The implementation of C<:raw> is as a pseudo-layer which when "pushed" -pops itself and then any layers which would modify the binary data stream. -(Undoing C<:utf8> and C<:crlf> may be implemented by clearing flags -rather than popping layers but that is an implementation detail.) +pops itself and then any layers which do not declare themselves as suitable +for binary data. (Undoing :crlf is implemented by clearing a flag rather +than popping the layer but that is an implementation detail.) As a consequence of the fact that C<:raw> normally pops layers, it usually only makes sense to have it as the only or first element in diff --git a/perlio.c b/perlio.c index 97415f8fa810..85da587ade72 100644 --- a/perlio.c +++ b/perlio.c @@ -1024,7 +1024,7 @@ PERLIO_FUNCS_DECL(PerlIO_remove) = { sizeof(PerlIO_funcs), "pop", 0, - PERLIO_K_DUMMY | PERLIO_K_UTF8, + PERLIO_K_UTF8, PerlIOPop_pushed, NULL, PerlIOBase_open, @@ -1492,6 +1492,7 @@ PerlIO_default_layers(pTHX) PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_stdio)); PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_crlf)); PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_utf8)); + PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_utf8_lax)); PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_remove)); PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_byte)); PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_scalar)); @@ -2239,11 +2240,11 @@ PerlIOUtf8_pushed(pTHX_ PerlIO *f, const char *mode, SV *arg, PerlIO_funcs *tab) return -1; } -PERLIO_FUNCS_DECL(PerlIO_utf8) = { +PERLIO_FUNCS_DECL(PerlIO_utf8_lax) = { sizeof(PerlIO_funcs), - "utf8", + "utf8_lax", 0, - PERLIO_K_DUMMY | PERLIO_K_UTF8 | PERLIO_K_MULTIARG, + PERLIO_K_UTF8 | PERLIO_K_MULTIARG, PerlIOUtf8_pushed, NULL, PerlIOBase_open, @@ -2270,12 +2271,33 @@ PERLIO_FUNCS_DECL(PerlIO_utf8) = { NULL, /* set_ptrcnt */ }; +IV +PerlIOBytes_pushed(pTHX_ PerlIO *f, const char *mode, SV *arg, PerlIO_funcs *tab) +{ + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(mode); + PERL_UNUSED_ARG(arg); + PERL_UNUSED_ARG(tab); + if (PerlIOValid(f)) { + PerlIO* current = f; + while(!(PerlIOBase(current)->tab->kind & PERLIO_K_RAW)) { + /*PerlIO* next = PerlIONext(current);*/ + PerlIO_flush(current); + PerlIO_pop(aTHX_ current); + /*current = next; */ + } + PerlIOBase(f)->flags &= ~PERLIO_F_UTF8; + return 0; + } + return -1; +} + PERLIO_FUNCS_DECL(PerlIO_byte) = { sizeof(PerlIO_funcs), "bytes", 0, - PERLIO_K_DUMMY | PERLIO_K_MULTIARG, - PerlIOUtf8_pushed, + PERLIO_K_MULTIARG, + PerlIOBytes_pushed, NULL, PerlIOBase_open, NULL, @@ -2305,7 +2327,7 @@ PERLIO_FUNCS_DECL(PerlIO_raw) = { sizeof(PerlIO_funcs), "raw", 0, - PERLIO_K_DUMMY, + 0, PerlIORaw_pushed, PerlIOBase_popped, PerlIOBase_open, @@ -5251,6 +5273,344 @@ PERLIO_FUNCS_DECL(PerlIO_crlf) = { PerlIOCrlf_set_ptrcnt, }; +static const U8 xs_utf8_sequence_len[0x100] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */ + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */ + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */ + 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */ +}; + +#define UTF8_MAX_BYTES 4 + +typedef enum { STRICT_UTF8=0, ALLOW_SURROGATES=1, ALLOW_NONCHARACTERS=2, ALLOW_NONSHORTEST=4 } utf8_flags; + +static STRLEN skip_sequence(const U8 *cur, const STRLEN len) { + STRLEN i, n = xs_utf8_sequence_len[*cur]; + + if (n < 1 || len < 2) + return 1; + + switch (cur[0]) { + case 0xE0: if ((cur[1] & 0xE0) != 0xA0) return 1; break; + case 0xED: if ((cur[1] & 0xE0) != 0x80) return 1; break; + case 0xF4: if ((cur[1] & 0xF0) != 0x80) return 1; break; + case 0xF0: if ((cur[1] & 0xF0) == 0x80) return 1; /* FALLTROUGH */ + default: if ((cur[1] & 0xC0) != 0x80) return 1; break; + } + + if (n > len) + n = len; + for (i = 2; i < n; i++) + if ((cur[i] & 0xC0) != 0x80) + break; + return i; +} + +static void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) __attribute__noreturn__; +static void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) { + static const char *hex = "0123456789ABCDEF"; + const char *fmt; + char seq[UTF8_MAX_BYTES * 3]; + char *d = seq; + + if (eof) + fmt = "Can't decode ill-formed UTF-8 octet sequence <%s> at end of file"; + else + fmt = "Can't decode ill-formed UTF-8 octet sequence <%s>"; + + while (len-- > 0) { + const U8 c = *cur++; + *d++ = hex[c >> 4]; + *d++ = hex[c & 15]; + if (len) + *d++ = ' '; + } + *d = 0; + Perl_croak(aTHX_ fmt, seq); +} + +static void report_noncharacter(pTHX_ UV usv) __attribute__noreturn__; +static void report_noncharacter(pTHX_ UV usv) { + static const char *fmt = "Can't interchange noncharacter code point U+%"UVXf; + Perl_croak(aTHX_ fmt, usv); +} + +static STRLEN validate(pTHX_ const U8 *buf, const U8 *end, const int flags, PerlIO* handle) { + const bool eof = PerlIO_eof(handle); + const U8 *cur = buf; + const U8 *end4 = end - 4; + STRLEN skip = 0; + U32 v; + + while (cur < end4) { + while (cur < end4 && *cur < 0x80) + cur++; + + check: + switch (xs_utf8_sequence_len[*cur]) { + case 0: + goto illformed; + case 1: + cur += 1; + break; + case 2: + /* 110xxxxx 10xxxxxx */ + if ((cur[1] & 0xC0) != 0x80) + goto illformed; + cur += 2; + break; + case 3: + v = ((U32)cur[0] << 16) + | ((U32)cur[1] << 8) + | ((U32)cur[2]); + /* 1110xxxx 10xxxxxx 10xxxxxx */ + if ((v & 0x00F0C0C0) != 0x00E08080 || + /* Non-shortest form */ + v < 0x00E0A080) + goto illformed; + /* Surrogates U+D800..U+DFFF */ + if (!(flags & ALLOW_SURROGATES) && (v & 0x00EFA080) == 0x00EDA080) + goto illformed; + /* Non-characters U+FDD0..U+FDEF, U+FFFE..U+FFFF */ + if (!(flags & ALLOW_NONCHARACTERS) && v >= 0x00EFB790 && (v <= 0x00EFB7AF || v >= 0x00EFBFBE)) + goto noncharacter; + cur += 3; + break; + case 4: + v = ((U32)cur[0] << 24) + | ((U32)cur[1] << 16) + | ((U32)cur[2] << 8) + | ((U32)cur[3]); + /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + if ((v & 0xF8C0C0C0) != 0xF0808080 || + /* Non-shortest form */ + v < 0xF0908080 || + /* Greater than U+10FFFF */ + v > 0xF48FBFBF) + goto illformed; + /* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */ + if (!(flags & ALLOW_NONCHARACTERS) && (v & 0x000FBFBE) == 0x000FBFBE) + goto noncharacter; + cur += 4; + break; + } + } + + if (cur < end) { + if (cur + xs_utf8_sequence_len[*cur] <= end) + goto check; + skip = skip_sequence(cur, end - cur); + if (eof || cur + skip < end) + goto illformed; + } + return cur - buf; + + illformed: + if (!skip) + skip = skip_sequence(cur, end - cur); + PerlIOBase(handle)->flags |= PERLIO_F_ERROR; + report_illformed(aTHX_ cur, skip, eof); + + noncharacter: + if (v < 0x10000) + v = (v & 0x3F) | (v & 0x1F00) >> 2; + else + v = (v & 0x3F) | (v & 0x1F00) >> 2 | (v & 0x0F0000) >> 4; + PerlIOBase(handle)->flags |= PERLIO_F_ERROR; + report_noncharacter(aTHX_ v); +} + +typedef struct { + PerlIOBuf buf; + STDCHAR leftovers[UTF8_MAX_BYTES]; + size_t leftover_length; + int flags; +} PerlIOUnicode; + +static struct { + const char* name; + size_t length; + utf8_flags value; +} map[] = { + { STR_WITH_LEN("allow_surrogates"), ALLOW_SURROGATES }, + { STR_WITH_LEN("allow_noncharacters"), ALLOW_NONCHARACTERS }, + { STR_WITH_LEN("allow_nonshortest"), ALLOW_NONSHORTEST }, + { STR_WITH_LEN("strict"), STRICT_UTF8 }, + { STR_WITH_LEN("loose"), (utf8_flags)(ALLOW_SURROGATES | ALLOW_NONCHARACTERS | ALLOW_NONSHORTEST) }, +}; + +static int lookup_parameter(pTHX_ const char* ptr, size_t len) { + unsigned i; + for (i = 0; i < sizeof map / sizeof *map; ++i) { + if (map[i].length == len && memcmp(ptr, map[i].name, len) == 0) + return map[i].value; + } + Perl_croak(aTHX_ "Unknown argument to :utf8: %*s", (int)len, ptr); +} +static int parse_parameters(pTHX_ SV* param) { + STRLEN len; + const char *begin, *delim; + if (!param || !SvOK(param)) + return 0; + + begin = SvPV(param, len); + delim = strchr(begin, ','); + if(delim) { + int ret = 0; + const char* end = begin + len; + do { + ret |= lookup_parameter(aTHX_ begin, delim - begin); + begin = delim + 1; + delim = strchr(begin, ','); + } while (delim); + if (begin < end) + ret |= lookup_parameter(aTHX_ begin, end - begin); + return ret; + } + else { + return lookup_parameter(aTHX_ begin, len); + } +} + +static IV PerlIOUnicode_pushed(pTHX_ PerlIO* f, const char* mode, SV* arg, PerlIO_funcs* tab) { + int flags = parse_parameters(aTHX_ arg); + if (PerlIOBuf_pushed(aTHX_ f, mode, arg, tab) == 0) { + PerlIOBase(f)->flags |= PERLIO_F_UTF8; + PerlIOSelf(f, PerlIOUnicode)->flags = flags; + return 0; + } + return -1; +} + +static IV PerlIOUnicode_fill(pTHX_ PerlIO* f) { + PerlIOUnicode * const u = PerlIOSelf(f, PerlIOUnicode); + PerlIOBuf * const b = &u->buf; + PerlIO *n = PerlIONext(f); + SSize_t avail; + Size_t read_bytes = 0; + STDCHAR *end; + SSize_t fit; + + if (PerlIO_flush(f) != 0) + return -1; + if (PerlIOBase(f)->flags & PERLIO_F_TTY) + PerlIOBase_flush_linebuf(aTHX); + + if (!b->buf) + PerlIO_get_base(f); + + assert(b->buf); + + if (u->leftover_length) { + Copy(u->leftovers, b->buf, u->leftover_length, STDCHAR); + b->end = b->buf + u->leftover_length; + read_bytes = u->leftover_length; + u->leftover_length = 0; + } + else { + b->ptr = b->end = b->buf; + } + fit = (SSize_t)b->bufsiz - (b->end - b->buf); + + if (!PerlIOValid(n)) { + PerlIOBase(f)->flags |= PERLIO_F_EOF; + return -1; + } + + if (PerlIO_fast_gets(n)) { + /* + * Layer below is also buffered. We do _NOT_ want to call its + * ->Read() because that will loop till it gets what we asked for + * which may hang on a pipe etc. Instead take anything it has to + * hand, or ask it to fill _once_. + */ + avail = PerlIO_get_cnt(n); + if (avail <= 0) { + avail = PerlIO_fill(n); + if (avail == 0) + avail = PerlIO_get_cnt(n); + else { + if (!PerlIO_error(n) && PerlIO_eof(n)) + avail = 0; + } + } + if (avail > 0) { + STDCHAR *ptr = PerlIO_get_ptr(n); + const SSize_t cnt = avail; + if (avail > fit) + avail = fit; + Copy(ptr, b->end, avail, STDCHAR); + PerlIO_set_ptrcnt(n, ptr + avail, cnt - avail); + read_bytes += avail; + } + } + else { + avail = PerlIO_read(n, b->end, fit); + if (avail > 0) + read_bytes += avail; + } + if (avail <= 0) { + if (avail < 0 || (read_bytes == 0 && PerlIO_eof(n))) { + PerlIOBase(f)->flags |= (avail == 0) ? PERLIO_F_EOF : PERLIO_F_ERROR; + return -1; + } + } + end = b->buf + read_bytes; + b->end = b->buf; + b->end += validate(aTHX_ (const U8 *)b->end, (const U8 *)end, u->flags, n); + if (b->end < end) { + size_t len = b->buf + read_bytes - b->end; + Copy(b->end, u->leftovers, len, char); + u->leftover_length = len; + } + PerlIOBase(f)->flags |= PERLIO_F_RDBUF; + + return 0; +} + +PERLIO_FUNCS_DECL(PerlIO_utf8) = { + sizeof(PerlIO_funcs), + "utf8", + sizeof(PerlIOUnicode), + PERLIO_K_BUFFERED|PERLIO_K_UTF8, + PerlIOUnicode_pushed, + PerlIOBuf_popped, + PerlIOBuf_open, + PerlIOBase_binmode, + NULL, + PerlIOBase_fileno, + PerlIOBuf_dup, + PerlIOBuf_read, + PerlIOBuf_unread, + PerlIOBuf_write, + PerlIOBuf_seek, + PerlIOBuf_tell, + PerlIOBuf_close, + PerlIOBuf_flush, + PerlIOUnicode_fill, + PerlIOBase_eof, + PerlIOBase_error, + PerlIOBase_clearerr, + PerlIOBase_setlinebuf, + PerlIOBuf_get_base, + PerlIOBuf_bufsiz, + PerlIOBuf_get_ptr, + PerlIOBuf_get_cnt, + PerlIOBuf_set_ptrcnt, +}; + PerlIO * Perl_PerlIO_stdin(pTHX) { diff --git a/perliol.h b/perliol.h index e247ad60dde1..3245cf8e1bba 100644 --- a/perliol.h +++ b/perliol.h @@ -57,7 +57,7 @@ struct _PerlIO_funcs { #define PERLIO_K_BUFFERED 0x00000002 #define PERLIO_K_CANCRLF 0x00000004 #define PERLIO_K_FASTGETS 0x00000008 -#define PERLIO_K_DUMMY 0x00000010 +#define PERLIO_K_DUMMY 0x00000000 /* DEPRECATED */ #define PERLIO_K_UTF8 0x00008000 #define PERLIO_K_DESTRUCT 0x00010000 #define PERLIO_K_MULTIARG 0x00020000 @@ -111,6 +111,7 @@ EXTCONST PerlIO_funcs PerlIO_perlio; EXTCONST PerlIO_funcs PerlIO_stdio; EXTCONST PerlIO_funcs PerlIO_crlf; EXTCONST PerlIO_funcs PerlIO_utf8; +EXTCONST PerlIO_funcs PerlIO_utf8_lax; EXTCONST PerlIO_funcs PerlIO_byte; EXTCONST PerlIO_funcs PerlIO_raw; EXTCONST PerlIO_funcs PerlIO_pending; diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 6c9948f861e9..d8217398a445 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -3879,10 +3879,6 @@ One possible cause is that you set the UTF8 flag yourself for data that you thought to be in UTF-8 but it wasn't (it was for example legacy 8-bit data). To guard against this, you can use C. -If you use the C<:encoding(UTF-8)> PerlIO layer for input, invalid byte -sequences are handled gracefully, but if you use C<:utf8>, the flag is set -without validating the data, possibly resulting in this error message. - See also L. =item Malformed UTF-8 returned by \N{%s} immediately after '%s' @@ -7227,6 +7223,12 @@ C>. (F) The C<(*> was followed by something that the regular expression compiler does not recognize. Check your spelling. +=item Unknown argument to :utf8: %s + +(F) An unknown argument was passed to the utf8 layer. Currently supported +arguments are C, C, C, C, +and C. + =item Unknown error (P) Perl was about to print an error message in C<$@>, but the C<$@> variable diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index 465796bf8453..c491638d3499 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -946,9 +946,6 @@ therefore refers to "layers" rather than to "disciplines". Now back to the regularly scheduled documentation...> To mark FILEHANDLE as UTF-8, use C<:utf8> or C<:encoding(UTF-8)>. -C<:utf8> just marks the data as UTF-8 without further checking, -while C<:encoding(UTF-8)> checks the data for actually being valid -UTF-8. More details can be found in L. In general, C should be called after L|/open FILEHANDLE,MODE,EXPR> but before any I/O is done on the @@ -957,9 +954,7 @@ flushes any pending buffered output data (and perhaps pending input data) on the handle. An exception to this is the C<:encoding> layer that changes the default character encoding of the handle. The C<:encoding> layer sometimes needs to be called in -mid-stream, and it doesn't flush the stream. C<:encoding> -also implicitly pushes on top of itself the C<:utf8> layer because -internally Perl operates on UTF8-encoded Unicode characters. +mid-stream, and it doesn't flush the stream. The operating system, device drivers, C libraries, and Perl run-time system all conspire to let the programmer treat a single @@ -9774,8 +9769,8 @@ string other than the beginning. A negative OFFSET specifies writing that many characters counting backwards from the end of the string. If SCALAR is of length zero, you can only use an OFFSET of 0. -B: If the filehandle is marked C<:utf8>, C will raise an exception. -The C<:encoding(...)> layer implicitly introduces the C<:utf8> layer. +B: If the filehandle is marked as utf8, C will raise an exception. +The C<:encoding(...)> layer implicitly mark a layer as utf8. Alternately, if the handle is not marked with an encoding but you attempt to write characters with code points over 255, raises an exception. See L|/binmode FILEHANDLE, LAYER>, diff --git a/pod/perliol.pod b/pod/perliol.pod index c38f5fcea49b..6b6dc9cbe4ac 100644 --- a/pod/perliol.pod +++ b/pod/perliol.pod @@ -332,7 +332,7 @@ for the layers class. Data written to this layer should be UTF-8 encoded; data provided by this layer should be considered UTF-8 encoded. Can be set on any layer -by ":utf8" dummy layer. Also set on ":encoding" layer. +by ":utf8_lax" dummy layer. =item PERLIO_F_UNBUF @@ -952,9 +952,7 @@ their own Binmode entry. =item "utf8" -Another dummy layer. When pushed it pops itself and sets the -C flag on the layer which was (and now is once more) -the top of the stack. +This layer reads utf-8 encodede unicode data and automatically decodes it. =back diff --git a/pod/perlunifaq.pod b/pod/perlunifaq.pod index 262585d47864..28342581c387 100644 --- a/pod/perlunifaq.pod +++ b/pod/perlunifaq.pod @@ -277,19 +277,10 @@ based on the user's locale, C. =head2 What is the difference between C<:encoding> and C<:utf8>? -Because UTF-8 is one of Perl's internal formats, you can often just skip the -encoding or decoding step, and manipulate the UTF8 flag directly. - -Instead of C<:encoding(UTF-8)>, you can simply use C<:utf8>, which skips the -encoding step if the data was already represented as UTF8 internally. This is -widely accepted as good behavior when you're writing, but it can be dangerous -when reading, because it causes internal inconsistency when you have invalid -byte sequences. Using C<:utf8> for input can sometimes result in security -breaches, so please use C<:encoding(UTF-8)> instead. - -Instead of C and C, you could use C<_utf8_on> and C<_utf8_off>, -but this is considered bad style. Especially C<_utf8_on> can be dangerous, for -the same reason that C<:utf8> can. +C<:encoding> is a generic conversion layer, that converts a file from a variety +of encodings to perl's internal encoding utf8 and vice versa. C<:utf8> is a +validation layer that checks if input data is correct UTF-8 but doesn't change +the bytestream in any way. There are some shortcuts for oneliners; see L<-C in perlrun|perlrun/-C [numberElist]>. diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod index bbb743e5b1d3..466d04bd607e 100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@ -398,10 +398,14 @@ and on already open streams, use C: The matching of encoding names is loose: case does not matter, and many encodings have several aliases. Note that the C<:utf8> layer must always be specified exactly like that; it is I subject to +<<<<<<< HEAD the loose matching of encoding names. Also note that currently C<:utf8> is unsafe for input, because it accepts the data without validating that it is indeed valid UTF-8; you should instead use C<:encoding(UTF-8)> (with or without a hyphen). +======= +the loose matching of encoding names. +>>>>>>> f7eed95508 (Made :utf8 an actual layer) See L for the C<:utf8> layer, L and L for the C<:encoding()> layer, and diff --git a/t/io/crlf.t b/t/io/crlf.t index 3f0d8489a696..0b0d7cda2d7e 100644 --- a/t/io/crlf.t +++ b/t/io/crlf.t @@ -18,7 +18,7 @@ my $crcr = uni_to_native("\x0d\x0d"); my $ungetc_count = 8200; # Somewhat over the likely buffer size { - plan(tests => 21 + 2 * $ungetc_count); + plan(tests => 17 + 2 * $ungetc_count); ok(open(FOO,">:crlf",$file)); ok(print FOO 'a'.((('a' x 14).qq{\n}) x 2000) || close(FOO)); ok(open(FOO,"<:crlf",$file)); @@ -63,14 +63,10 @@ my $ungetc_count = 8200; # Somewhat over the likely buffer size # binmode :crlf should not cumulate. # Try it first once and then twice so that even UNIXy boxes # get to exercise this, for DOSish boxes even once is enough. - # Try also pushing :utf8 first so that there are other layers - # in between (this should not matter: CRLF layers still should - # not accumulate). - for my $utf8 ('', ':utf8') { for my $binmode (1..2) { open(FOO, ">$file"); # require PerlIO; print PerlIO::get_layers(FOO), "\n"; - binmode(FOO, "$utf8:crlf") for 1..$binmode; + binmode(FOO, ":crlf") for 1..$binmode; # require PerlIO; print PerlIO::get_layers(FOO), "\n"; print FOO "Hello\n"; close FOO; @@ -82,7 +78,6 @@ my $ungetc_count = 8200; # Somewhat over the likely buffer size "\n"; like($foo, qr/$crlf$/); unlike($foo, qr/$crcr/); - } } { diff --git a/t/io/layers.t b/t/io/layers.t index f1004c3bd23e..2ba96658db2a 100644 --- a/t/io/layers.t +++ b/t/io/layers.t @@ -34,11 +34,9 @@ if (${^UNICODE} & 1) { } else { $UTF8_STDIN = 0; } -my $NTEST = 62 - (($DOSISH || !$FASTSTDIO) ? 7 : 0) - ($DOSISH ? 7 : 0) +my $NTEST = 57 - (($DOSISH || !$FASTSTDIO) ? 7 : 0) - ($DOSISH ? 7 : 0) + $UTF8_STDIN; -sub PerlIO::F_UTF8 () { 0x00008000 } # from perliol.h - plan tests => $NTEST; print <<__EOH__; @@ -127,13 +125,13 @@ __EOH__ binmode(F, ":encoding(cp1047)"); check([ PerlIO::get_layers(F) ], - [ qw[stdio crlf encoding(cp1047) utf8] ], + [ qw[stdio crlf encoding(cp1047)] ], ":encoding(cp1047)"); binmode(F, ":crlf"); check([ PerlIO::get_layers(F) ], - [ qw[stdio crlf encoding(cp1047) utf8 crlf utf8] ], + [ qw[stdio crlf encoding(cp1047) crlf ] ], ":encoding(cp1047):crlf"); binmode(F, ":pop:pop"); @@ -163,7 +161,7 @@ __EOH__ binmode(F, ":encoding(utf8)"); check([ PerlIO::get_layers(F) ], - [ qw[stdio encoding(utf8) utf8] ], + [ qw[stdio encoding(utf8)] ], ":encoding(utf8)"); binmode(F, ":raw :crlf"); @@ -217,7 +215,7 @@ __EOH__ "use open IN"); check([ PerlIO::get_layers(G, output => 1) ], - [ qw[stdio encoding(cp1252) utf8] ], + [ qw[stdio encoding(cp1252)] ], "use open OUT"); close F; diff --git a/t/io/utf8.t b/t/io/utf8.t index 458880200696..fd589486cbf6 100644 --- a/t/io/utf8.t +++ b/t/io/utf8.t @@ -11,7 +11,7 @@ skip_all_without_perlio(); no utf8; # needed for use utf8 not griping about the raw octets -plan(tests => 62); +plan(tests => 61); $| = 1; @@ -169,13 +169,9 @@ SKIP: { if ($::IS_EBCDIC) { skip("EBCDIC The file isn't deformed in UTF-EBCDIC", 2); } else { - my @warnings; open F, "<:utf8", $a_file or die $!; - $x = ; chomp $x; - local $SIG{__WARN__} = sub { push @warnings, $_[0]; }; - eval { sprintf "%vd\n", $x }; - is (scalar @warnings, 1); - like ($warnings[0], qr/Malformed UTF-8 character: \\x82 \(unexpected continuation byte 0x82, with no preceding start byte/); + eval { $x = }; chomp $x; + like ($@, qr/^Can't decode ill-formed UTF-8 octet sequence <82>/); } } @@ -326,7 +322,6 @@ is($failed, undef); # if it finds bad UTF-8 (:encoding(utf8) works this way) use warnings 'utf8'; undef $@; - local $SIG{__WARN__} = sub { $@ = shift }; open F, ">$a_file"; binmode F; my ($chrE4, $chrF6) = (chr(0xE4), chr(0xF6)); @@ -337,15 +332,15 @@ is($failed, undef); close F; open F, "<:utf8", $a_file; undef $@; - my $line = ; + eval { + my $line = ; + }; my ($chrE4, $chrF6) = ("E4", "F6"); if ($::IS_EBCDIC) { ($chrE4, $chrF6) = ("43", "EC"); } # EBCDIC - like( $@, qr/utf8 "\\x$chrE4" does not map to Unicode .+ line 1/, + like( $@, qr/^Can't decode ill-formed UTF-8 octet sequence /, "<:utf8 readline must warn about bad utf8"); undef $@; - $line .= ; - like( $@, qr/utf8 "\\x$chrF6" does not map to Unicode .+ line 2/, - "<:utf8 rcatline must warn about bad utf8"); + ok(eof F); close F; } @@ -381,9 +376,9 @@ is($failed, undef); open F, "<:utf8", $a_file; undef $@; local $SIG{__WARN__} = sub { $@ = shift }; - $line = ; + $line = eval { }; - like( $@, qr/utf8 "\\xEF" does not map to Unicode .+ chunk 1/, + like( $@, qr/Can\'t decode ill-formed UTF-8 octet sequence at end of file/, "<:utf8 readline (fixed) must warn about bad utf8"); close F; } diff --git a/universal.c b/universal.c index bd9d6397352b..38f780b3db0b 100644 --- a/universal.c +++ b/universal.c @@ -890,14 +890,6 @@ XS(XS_PerlIO_get_layers) else PUSHs(&PL_sv_undef); nitem++; - if (flgok) { - const IV flags = SvIVX(*flgsvp); - - if (flags & PERLIO_F_UTF8) { - PUSHs(newSVpvs_flags("utf8", SVs_TEMP)); - nitem++; - } - } } }