diff --git a/perl.h b/perl.h index 535a80fb4376..8c8ff4dd4d49 100644 --- a/perl.h +++ b/perl.h @@ -1452,18 +1452,26 @@ Use C> to get the largest type available on the platform. =cut */ #ifndef UINT16_C -# if INTSIZE >= 2 -# define UINT16_C(x) ((U16_TYPE)x##U) +# ifdef _MSC_VER +# define UINT16_C(x) ((U16TYPE)x##ui16) # else -# define UINT16_C(x) ((U16_TYPE)x##UL) +# if INTSIZE >= 2 +# define UINT16_C(x) ((U16TYPE)x##U) +# else +# define UINT16_C(x) ((U16TYPE)x##UL) +# endif # endif #endif #ifndef UINT32_C -# if INTSIZE >= 4 -# define UINT32_C(x) ((U32_TYPE)x##U) +# ifdef _MSC_VER +# define UINT32_C(x) ((U32TYPE)x##ui32) # else -# define UINT32_C(x) ((U32_TYPE)x##UL) +# if INTSIZE >= 4 +# define UINT32_C(x) ((U32TYPE)x##U) +# else +# define UINT32_C(x) ((U32TYPE)x##UL) +# endif # endif #endif diff --git a/pp.c b/pp.c index 5c39bbf540f2..507eb3e07bce 100644 --- a/pp.c +++ b/pp.c @@ -6529,6 +6529,19 @@ PP(pp_unshift) return NORMAL; } +#ifdef _MSC_VER +# pragma intrinsic(_byteswap_ushort, _byteswap_ulong, _byteswap_uint64) +# define S_bswap16(_x) _byteswap_ushort(_x) +# define S_bswap32(_x) _byteswap_ulong(_x) +# define S_bswap64(_x) _byteswap_uint64(_x) +PERL_STATIC_FORCE_INLINE void * + S_memcpy(void *dest, const void *src,size_t count); +#else +# define S_bswap16(_x) _swab_16_(_x) +# define S_bswap32(_x) _swab_32_(_x) +# define S_bswap64(_x) _swab_64_(_x) +# define S_memcpy(_d,_s,_n) memcpy((_d),(_s),(_n)) +#endif PP_wrapped(pp_reverse, 0, 1) { @@ -6555,15 +6568,17 @@ PP_wrapped(pp_reverse, 0, 1) SV *begin, *end; if (can_preserve) { - if (!av_exists(av, i)) { - if (av_exists(av, j)) { + bool exi = av_exists(av, i); + bool exj = av_exists(av, j); + if (!exi) { + if (exj) { SV *sv = av_delete(av, j, 0); begin = *av_fetch(av, i, TRUE); sv_setsv_mg(begin, sv); } continue; } - else if (!av_exists(av, j)) { + else if (!exj) { SV *sv = av_delete(av, i, 0); end = *av_fetch(av, j, TRUE); sv_setsv_mg(end, sv); @@ -6644,18 +6659,19 @@ PP_wrapped(pp_reverse, 0, 1) * in a single 
pass, rather than 2-3 passes. */ const char * src = SvPV_const(src_sv, len); + U8* dd; /* Prepare the TARG. */ - if (SvTYPE(TARG) < SVt_PV) { + if (SvTHINKFIRST(TARG)) + SV_CHECK_THINKFIRST_COW_DROP(TARG); /* Drops any buffer or RV */ + if (SvTYPE(TARG) < SVt_PV) SvUPGRADE(TARG, SvTYPE(src_sv)); /* No buffer allocation here */ - } else if(SvTHINKFIRST(TARG)) { - SV_CHECK_THINKFIRST_COW_DROP(TARG); /* Drops any buffer */ - } - SvSETMAGIC(TARG); - SvGROW(TARG, len + 1); + else /* can't have SMG if < PVMG, SvROK/SvAMAGIC doesn't apply */ + SvSETMAGIC(TARG); + dd = (U8*)SvGROW(TARG, len + 1); SvCUR_set(TARG, len); SvPOK_only(TARG); - *SvEND(TARG) = '\0'; + dd[len] = '\0'; if (SvTAINTED(src_sv)) SvTAINT(TARG); @@ -6664,9 +6680,9 @@ PP_wrapped(pp_reverse, 0, 1) SvUTF8_on(TARG); const U8* s = (const U8*)src; - U8* dd = (U8*)(SvPVX(TARG) + len); const U8* send = (const U8*)(s + len); int bytes = 0; + dd = dd + len; while (s < send) { bytes = UTF8SKIP(s); if (bytes == 1) { @@ -6679,10 +6695,69 @@ PP_wrapped(pp_reverse, 0, 1) } } } else { - char * outp= SvPVX(TARG); - const char *p = src + len; - while (p != src) - *outp++ = *--p; + STRLEN i = 0; + STRLEN j = len; + U32 u32_1, u32_2; + U16 u16_1, u16_2; + char * outp = NUM2PTR(char*,dd); + /* Take a chunk of bytes from the front and from the + * back, reverse the bytes in each and swap the + * chunks over. This should have generally good + * performance but also is likely to be optimised + * into bswap instructions by the compiler. 
+ */ +#ifdef HAS_QUAD + U64 u64_1, u64_2; + while (j - i >= 16) { + S_memcpy(&u64_1, src + j - 8, 8); + S_memcpy(&u64_2, src + i, 8); + u64_1 = S_bswap64(u64_1); + u64_2 = S_bswap64(u64_2); + S_memcpy(outp + j - 8, &u64_2, 8); + S_memcpy(outp + i, &u64_1, 8); + i += 8; + j -= 8; + } + + if (j - i >= 8) { + S_memcpy(&u32_1, src + j - 4, 4); + S_memcpy(&u32_2, src + i, 4); + u32_1 = S_bswap32(u32_1); + u32_2 = S_bswap32(u32_2); + S_memcpy(outp + j - 4, &u32_2, 4); + S_memcpy(outp + i, &u32_1, 4); + i += 4; + j -= 4; + } +#else + while (j - i >= 8) { + S_memcpy(&u32_1, src + j - 4, 4); + S_memcpy(&u32_2, src + i, 4); + u32_1 = S_bswap32(u32_1); + u32_2 = S_bswap32(u32_2); + S_memcpy(outp + j - 4, &u32_2, 4); + S_memcpy(outp + i, &u32_1, 4); + i += 4; + j -= 4; + } +#endif + if (j - i >= 4) { + S_memcpy(&u16_1, src + j - 2, 2); + S_memcpy(&u16_2, src + i, 2); + u16_1 = S_bswap16(u16_1); + u16_2 = S_bswap16(u16_2); + S_memcpy(outp + j - 2, &u16_2, 2); + S_memcpy(outp + i, &u16_1, 2); + i += 2; + j -= 2; + } + + /* Swap any remaining bytes one by one. 
*/ + while (i < j) { + outp[i] = src[j - 1]; + outp[j - 1] = src[i]; + i++; j--; + } } RETURN; } @@ -6695,9 +6770,10 @@ PP_wrapped(pp_reverse, 0, 1) if (len > 1) { /* The traditional way, operate on the current byte buffer */ - char *down; if (DO_UTF8(TARG)) { /* first reverse each character */ - U8* s = (U8*)SvPVX(TARG); + char *down; + assert(SvPVX(TARG) == up); + U8* s = (U8*)up; const U8* send = (U8*)(s + len); while (s < send) { if (UTF8_IS_INVARIANT(*s)) { @@ -6720,11 +6796,64 @@ PP_wrapped(pp_reverse, 0, 1) } up = SvPVX(TARG); } - down = SvPVX(TARG) + len - 1; - while (down > up) { - const char tmp = *up; - *up++ = *down; - *down-- = tmp; + STRLEN i = 0; + STRLEN j = len; + U32 u32_1, u32_2; + U16 u16_1, u16_2; + /* Reverse the buffer in place, in chunks where possible */ +#ifdef HAS_QUAD + U64 u64_1, u64_2; + while (j - i >= 16) { + S_memcpy(&u64_1, up + j - 8, 8); + S_memcpy(&u64_2, up + i, 8); + u64_1 = S_bswap64(u64_1); + u64_2 = S_bswap64(u64_2); + S_memcpy(up + j - 8, &u64_2, 8); + S_memcpy(up + i, &u64_1, 8); + i += 8; + j -= 8; + } + + if (j - i >= 8) { + S_memcpy(&u32_1, up + j - 4, 4); + S_memcpy(&u32_2, up + i, 4); + u32_1 = S_bswap32(u32_1); + u32_2 = S_bswap32(u32_2); + S_memcpy(up + j - 4, &u32_2, 4); + S_memcpy(up + i, &u32_1, 4); + i += 4; + j -= 4; + } +#else + while (j - i >= 8) { + S_memcpy(&u32_1, up + j - 4, 4); + S_memcpy(&u32_2, up + i, 4); + u32_1 = S_bswap32(u32_1); + u32_2 = S_bswap32(u32_2); + S_memcpy(up + j - 4, &u32_2, 4); + S_memcpy(up + i, &u32_1, 4); + i += 4; + j -= 4; + } +#endif + if (j - i >= 4) { + S_memcpy(&u16_1, up + j - 2, 2); + S_memcpy(&u16_2, up + i, 2); + u16_1 = S_bswap16(u16_1); + u16_2 = S_bswap16(u16_2); + S_memcpy(up + j - 2, &u16_2, 2); + S_memcpy(up + i, &u16_1, 2); + i += 2; + j -= 2; + } + + /* Finally, swap any remaining bytes one-by-one. 
*/ + while (i < j) { + unsigned char tmp = up[i]; + up[i] = up[j - 1]; + up[j - 1] = tmp; + i++; + j--; } } (void)SvPOK_only_UTF8(TARG); @@ -6732,6 +6861,11 @@ PP_wrapped(pp_reverse, 0, 1) RETURN; } +#undef S_memcpy +#undef S_bswap16 +#undef S_bswap32 +#undef S_bswap64 + PP_wrapped(pp_split, ( (PL_op->op_private & OPpSPLIT_ASSIGN) && (PL_op->op_flags & OPf_STACKED)) @@ -8068,6 +8202,18 @@ PP(pp_is_tainted) return NORMAL; } +#ifdef _MSC_VER +/* this pragma can't be push/pop-ed vs whatever the cmd line to cl.exe was */ +# pragma intrinsic(memcpy) + +void * +S_memcpy(void *dest, const void *src, size_t count) +{ + return memcpy(dest, src, count); +} + +#endif + /* * ex: set ts=8 sts=4 sw=4 et: */