The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.

__UNDEFINED__ SvUTF8 UTF8f UTF8fARG utf8_to_uvchr_buf sv_len_utf8 sv_len_utf8_nomg

#ifdef SVf_UTF8 __UNDEFINED__ SvUTF8(sv) (SvFLAGS(sv) & SVf_UTF8) #endif

#if { VERSION == 5.19.1 } /* 5.19.1 does not have UTF8fARG, only broken UTF8f */ #undef UTF8f #endif

#ifdef SVf_UTF8 __UNDEFINED__ UTF8f SVf __UNDEFINED__ UTF8fARG(u,l,p) newSVpvn_flags((p), (l), ((u) ? SVf_UTF8 : 0) | SVs_TEMP) #endif

#define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b))

__UNDEFINED__ UNICODE_REPLACEMENT 0xFFFD

#ifdef UTF8_MAXLEN __UNDEFINED__ UTF8_MAXBYTES UTF8_MAXLEN #endif

__UNDEF_NOT_PROVIDED__ UTF_START_MARK(len) \ (((len) > 7) ? 0xFF : (0xFF & (0xFE << (7-(len)))))

/* On non-EBCDIC was valid for some releases earlier than this, but easier to * just do one check */ #if { VERSION < 5.018 } # undef UTF8_MAXBYTES_CASE #endif

#if 'A' == 65 # define D_PPP_BYTE_INFO_BITS 6 /* 6 bits meaningful in continuation bytes */ __UNDEFINED__ UTF8_MAXBYTES_CASE 13 #else # define D_PPP_BYTE_INFO_BITS 5 /* 5 bits meaningful in continuation bytes */ __UNDEFINED__ UTF8_MAXBYTES_CASE 15 #endif

__UNDEF_NOT_PROVIDED__ UTF_ACCUMULATION_SHIFT D_PPP_BYTE_INFO_BITS

#ifdef NATIVE_TO_UTF __UNDEF_NOT_PROVIDED__ NATIVE_UTF8_TO_I8(c) NATIVE_TO_UTF(c) #else /* System doesn't support EBCDIC */ __UNDEF_NOT_PROVIDED__ NATIVE_UTF8_TO_I8(c) (c) #endif

#ifdef UTF_TO_NATIVE __UNDEF_NOT_PROVIDED__ I8_TO_NATIVE_UTF8(c) UTF_TO_NATIVE(c) #else /* System doesn't support EBCDIC */ __UNDEF_NOT_PROVIDED__ I8_TO_NATIVE_UTF8(c) (c) #endif

__UNDEF_NOT_PROVIDED__ UTF_START_MASK(len) \ (((len) >= 7) ? 0x00 : (0x1F >> ((len)-2))) __UNDEF_NOT_PROVIDED__ UTF_IS_CONTINUATION_MASK \ ((U8) (0xFF << UTF_ACCUMULATION_SHIFT)) __UNDEF_NOT_PROVIDED__ UTF_CONTINUATION_MARK \ (UTF_IS_CONTINUATION_MASK & 0xB0) __UNDEF_NOT_PROVIDED__ UTF_MIN_START_BYTE \ ((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))

__UNDEF_NOT_PROVIDED__ UTF_MIN_ABOVE_LATIN1_BYTE \ ((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))

#if { VERSION < 5.007 } /* Was the complement of what should have been */ # undef UTF8_IS_DOWNGRADEABLE_START #endif __UNDEF_NOT_PROVIDED__ UTF8_IS_DOWNGRADEABLE_START(c) \ inRANGE(NATIVE_UTF8_TO_I8(c), \ UTF_MIN_START_BYTE, UTF_MIN_ABOVE_LATIN1_BYTE - 1) __UNDEF_NOT_PROVIDED__ UTF_CONTINUATION_MASK \ ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1))

__UNDEF_NOT_PROVIDED__ UTF8_ACCUMULATE(base, added) \ (((base) << UTF_ACCUMULATION_SHIFT) \ | ((NATIVE_UTF8_TO_I8(added)) \ & UTF_CONTINUATION_MASK))

__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_ANYUV 0 __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_EMPTY 0x0001 __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_CONTINUATION 0x0002 __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_NON_CONTINUATION 0x0004 __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_SHORT 0x0008 __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_LONG 0x0010 __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_OVERFLOW 0x0080 __UNDEF_NOT_PROVIDED__ UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \ |UTF8_ALLOW_NON_CONTINUATION \ |UTF8_ALLOW_SHORT \ |UTF8_ALLOW_LONG \ |UTF8_ALLOW_OVERFLOW)

#if defined UTF8SKIP

/* Don't use official versions because they use MIN, which may not be available */ #undef UTF8_SAFE_SKIP #undef UTF8_CHK_SKIP

__UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \ ((((e) - (s)) <= 0) \ ? 0 \ : D_PPP_MIN(((e) - (s)), UTF8SKIP(s))))

__UNDEFINED__ UTF8_CHK_SKIP(s) \ (s[0] == '\0' ? 1 : ((U8) D_PPP_MIN(my_strnlen((char *) (s), UTF8SKIP(s)), \ UTF8SKIP(s)))) /* UTF8_CHK_SKIP depends on my_strnlen */ __UNDEFINED__ UTF8_SKIP(s) UTF8SKIP(s) #endif

#if 'A' == 65 __UNDEFINED__ UTF8_IS_INVARIANT(c) isASCII(c) #else __UNDEFINED__ UTF8_IS_INVARIANT(c) (isASCII(c) || isCNTRL_L1(c)) #endif

__UNDEFINED__ UVCHR_IS_INVARIANT(c) UTF8_IS_INVARIANT(c)

#ifdef UVCHR_IS_INVARIANT # if 'A' != 65 || UVSIZE < 8 /* 32 bit platform, which includes UTF-EBCDIC on the releases this is * backported to */ # define D_PPP_UVCHR_SKIP_UPPER(c) 7 # else # define D_PPP_UVCHR_SKIP_UPPER(c) \ (((WIDEST_UTYPE) (c)) < \ (((WIDEST_UTYPE) 1) << (6 * D_PPP_BYTE_INFO_BITS)) ? 7 : 13) # endif

__UNDEFINED__ UVCHR_SKIP(c) \ UVCHR_IS_INVARIANT(c) ? 1 : \ (WIDEST_UTYPE) (c) < (32 * (1U << ( D_PPP_BYTE_INFO_BITS))) ? 2 : \ (WIDEST_UTYPE) (c) < (16 * (1U << (2 * D_PPP_BYTE_INFO_BITS))) ? 3 : \ (WIDEST_UTYPE) (c) < ( 8 * (1U << (3 * D_PPP_BYTE_INFO_BITS))) ? 4 : \ (WIDEST_UTYPE) (c) < ( 4 * (1U << (4 * D_PPP_BYTE_INFO_BITS))) ? 5 : \ (WIDEST_UTYPE) (c) < ( 2 * (1U << (5 * D_PPP_BYTE_INFO_BITS))) ? 6 : \ D_PPP_UVCHR_SKIP_UPPER(c) #endif

#ifdef is_ascii_string __UNDEFINED__ is_invariant_string(s,l) is_ascii_string(s,l) __UNDEFINED__ is_utf8_invariant_string(s,l) is_ascii_string(s,l)

/* Hint: is_ascii_string, is_invariant_string is_utf8_invariant_string() does the same thing and is preferred because its name is more accurate as to what it does */ #endif

#ifdef ibcmp_utf8 __UNDEFINED__ foldEQ_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2) \ cBOOL(! ibcmp_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2)) #endif

#if defined(is_utf8_string) && defined(UTF8SKIP) __UNDEFINED__ isUTF8_CHAR(s, e) ( \ (e) <= (s) || ! is_utf8_string(s, UTF8_SAFE_SKIP(s, e)) \ ? 0 \ : UTF8SKIP(s)) #endif

#if 'A' == 65 __UNDEFINED__ BOM_UTF8 "\xEF\xBB\xBF" __UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xEF\xBF\xBD" #elif '^' == 95 __UNDEFINED__ BOM_UTF8 "\xDD\x73\x66\x73" __UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x73\x73\x71" #elif '^' == 176 __UNDEFINED__ BOM_UTF8 "\xDD\x72\x65\x72" __UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x72\x72\x70" #else # error Unknown character set #endif

#if { VERSION < 5.35.10 } /* Versions prior to 5.31.4 accepted things that are now considered * malformations, and didn't return -1 on error with warnings enabled. * Versions before 5.35.10 dereferenced empty input without checking */ # undef utf8_to_uvchr_buf #endif

/* This implementation brings modern, generally more restricted standards to * utf8_to_uvchr_buf. Some of these are security related, and clearly must * be done. But its arguable that the others need not, and hence should not. * The reason they're here is that a module that intends to play with the * latest perls should be able to work the same in all releases. An example is * that perl no longer accepts any UV for a code point, but limits them to * IV_MAX or below. This is for future internal use of the larger code points. * If it turns out that some of these changes are breaking code that isn't * intended to work with modern perls, the tighter restrictions could be * relaxed. khw thinks this is unlikely, but has been wrong in the past. */

/* 5.6.0 is the first release with UTF-8, and we don't implement this function * there due to its likely lack of still being in use, and the underlying * implementation is very different from later ones, without the later * safeguards, so would require extra work to deal with */ #if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf) /* Choose which underlying implementation to use. At least one must be * present or the perl is too early to handle this function */ # if defined(utf8n_to_uvchr) || defined(utf8_to_uvchr) || defined(utf8_to_uv) # if defined(utf8n_to_uvchr) /* This is the preferred implementation */ # define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr # elif /* Must be at least 5.6.1 from #if above; \ If have both regular and _simple, regular has all args */ \ defined(utf8_to_uv) && defined(utf8_to_uv_simple) # define D_PPP_utf8_to_uvchr_buf_callee utf8_to_uv # elif defined(utf8_to_uvchr) /* The below won't work well on error input */ # define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) \ utf8_to_uvchr((U8 *)(s), (retlen)) # else # define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) \ utf8_to_uv((U8 *)(s), (retlen)) # endif # endif

# if { NEED utf8_to_uvchr_buf }

UV utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) { # if { VERSION >= 5.31.4 } /* But from above, must be < 5.35.10 */ # if { VERSION != 5.35.9 }

/* Versions less than 5.35.9 could dereference s on zero length, so
 * pass it something where no harm comes from that. */
if (send <= s) s = send = (U8 *) "?";
return Perl_utf8_to_uvchr_buf_helper(aTHX_ s, send, retlen);

# else /* Below is 5.35.9, which also works on non-empty input, but for empty input, can wrongly dereference, and additionally is also just plain broken */ if (send > s) return Perl_utf8_to_uvchr_buf_helper(aTHX_ s, send, retlen); if (! ckWARN_d(WARN_UTF8)) { if (retlen) *retlen = 0; return UNICODE_REPLACEMENT; } else { s = send = (U8 *) "?";

    /* Call just for its warning */
    (void) Perl__utf8n_to_uvchr_msgs_helper(s, 0, NULL, 0, NULL, NULL);
    if (retlen) *retlen = (STRLEN) -1;
    return 0;
}

# endif # else

UV ret;
STRLEN curlen;
bool overflows = 0;
const U8 *cur_s = s;
const bool do_warnings = ckWARN_d(WARN_UTF8);
#    if { VERSION < 5.26.0 } && ! defined(EBCDIC)
STRLEN overflow_length = 0;
#    endif

if (send > s) {
    curlen = send - s;
}
else {
    assert(0);  /* Modern perls die under this circumstance */
    curlen = 0;
    if (! do_warnings) {    /* Handle empty here if no warnings needed */
        if (retlen) *retlen = 0;
        return UNICODE_REPLACEMENT;
    }
}

# if { VERSION < 5.26.0 } && ! defined(EBCDIC)

/* Perl did not properly detect overflow for much of its history on
 * non-EBCDIC platforms, often returning an overlong value which may or may
 * not have been tolerated in the call.  Also, earlier versions, when they
 * did detect overflow, may have disallowed it completely.  Modern ones can
 * replace it with the REPLACEMENT CHARACTER, depending on calling
 * parameters.  Therefore detect it ourselves in  releases it was
 * problematic in. */

if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {

    /* First, on a 32-bit machine the first byte being at least \xFE
     * automatically is overflow, as it indicates something requiring more
     * than 31 bits */
    if (sizeof(ret) < 8) {
        overflows = 1;
        overflow_length = (*s == 0xFE) ? 7 : 13;
    }
    else {
        const U8 highest[] =    /* 2*63-1 */
                    "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
        const U8 *cur_h = highest;

        for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
            if (UNLIKELY(*cur_s == *cur_h)) {
                continue;
            }

            /* If this byte is larger than the corresponding highest UTF-8
             * byte, the sequence overflows; otherwise the byte is less
             * than (as we handled the equality case above), and so the
             * sequence doesn't overflow */
            overflows = *cur_s > *cur_h;
            break;

        }

        /* Here, either we set the bool and broke out of the loop, or got
         * to the end and all bytes are the same which indicates it doesn't
         * overflow.  If it did overflow, it would be this number of bytes
         * */
        overflow_length = 13;
    }
}

if (UNLIKELY(overflows)) {
    ret = 0;

    if (! do_warnings && retlen) {
        *retlen = overflow_length;
    }
}
else

# endif /* < 5.26 */

/* Here, we are either in a release that properly detects overflow, or
 * we have checked for overflow and the next statement is executing as
 * part of the above conditional where we know we don't have overflow.
 *
 * The modern versions allow anything that evaluates to a legal UV, but
 * not overlongs nor an empty input */
ret = D_PPP_utf8_to_uvchr_buf_callee(
      (U8 *) /* Early perls: no const */
            s, curlen, retlen,   (UTF8_ALLOW_ANYUV
                              & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));

# if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }

/* But actually, more modern versions restrict the UV to being no more than
 * what an IV can hold, so it could still have gotten it wrong about
 * overflowing. */
if (UNLIKELY(ret > IV_MAX)) {
    overflows = 1;
}

# endif

if (UNLIKELY(overflows)) {
    if (! do_warnings) {
        if (retlen) {
            *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
            *retlen = D_PPP_MIN(*retlen, curlen);
        }
        return UNICODE_REPLACEMENT;
    }
    else {

        /* We use the error message in use from 5.8-5.26 */
        Perl_warner(aTHX_ packWARN(WARN_UTF8),
            "Malformed UTF-8 character (overflow at 0x%" UVxf
            ", byte 0x%02x, after start byte 0x%02x)",
            ret, *cur_s, *s);
        if (retlen) {
            *retlen = (STRLEN) -1;
        }
        return 0;
    }
}

/* Here, did not overflow, but if it failed for some other reason, and
 * warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
 * try again, allowing anything.  (Note a return of 0 is ok if the input
 * was '\0') */
if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {

    /* If curlen is 0, we already handled the case where warnings are
     * disabled, so this 'if' will be true, and so later on, we know that
     * 's' is dereferencible */
    if (do_warnings) {
        if (retlen) {
            *retlen = (STRLEN) -1;
        }
    }
    else {
        ret = D_PPP_utf8_to_uvchr_buf_callee(
                                 (U8 *) /* Early perls: no const */
                                        s, curlen, retlen, UTF8_ALLOW_ANY);
        /* Override with the REPLACEMENT character, as that is what the
         * modern version of this function returns */
        ret = UNICODE_REPLACEMENT;

# if { VERSION < 5.16.0 }

/* Versions earlier than this don't necessarily return the proper
 * length.  It should not extend past the end of string, nor past
 * what the first byte indicates the length is, nor past the
 * continuation characters */
if (retlen && (IV) *retlen >= 0) {
    unsigned int i = 1;

    *retlen = D_PPP_MIN(*retlen, curlen);
    *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
    do {
#        ifdef UTF8_IS_CONTINUATION
        if (! UTF8_IS_CONTINUATION(s[i]))
#        else       /* Versions without the above don't support EBCDIC anyway */
        if (s[i] < 0x80 || s[i] > 0xBF)
#        endif
        {
            *retlen = i;
            break;
        }
    } while (++i < *retlen);
}

# endif /* end of < 5.16.0 */

    }
}

return ret;

# endif /* end of < 5.31.4 */

}

# endif #endif

#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf) #undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses to read past a NUL, making it much less likely to read off the end of the buffer. A NUL indicates the start of the next character anyway. If the input isn't NUL-terminated, the function remains unsafe, as it always has been. */

__UNDEFINED__ utf8_to_uvchr(s, lp) \ ((*(s) == '\0') \ ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \ : utf8_to_uvchr_buf(s, (s) + UTF8_CHK_SKIP(s), (lp)))

#endif

/* Hint: utf8_to_uvchr Use utf8_to_uvchr_buf() instead. But ONLY if you KNOW the upper bound of the input string (not resorting to using UTF8SKIP, etc., to infer it). The backported utf8_to_uvchr() will do a better job to prevent most cases of trying to read beyond the end of the buffer */

/* Replace utf8_to_uvchr with utf8_to_uvchr_buf */

#ifdef sv_len_utf8 # if { VERSION >= 5.17.5 } # ifndef sv_len_utf8_nomg # if defined(PERL_USE_GCC_BRACE_GROUPS) # define sv_len_utf8_nomg(sv) \ ({ \ SV *sv_ = (sv); \ sv_len_utf8(!SvGMAGICAL(sv_) \ ? sv_ \ : sv_mortalcopy_flags(sv_, SV_NOSTEAL)); \ }) # else PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8_nomg(SV * sv) { dTHX; if (SvGMAGICAL(sv)) return sv_len_utf8(sv_mortalcopy_flags(sv, SV_NOSTEAL)); else return sv_len_utf8(sv); } # define sv_len_utf8_nomg(sv) D_PPP_sv_len_utf8_nomg(sv) # endif # endif # else /* < 5.17.5 */ /* Older Perl versions have broken sv_len_utf8() when passed sv does not * have SVf_UTF8 flag set */ /* Also note that SvGETMAGIC() may change presence of SVf_UTF8 flag */ # undef sv_len_utf8 # if defined(PERL_USE_GCC_BRACE_GROUPS) # define sv_len_utf8_nomg(sv) \ ({ \ SV *sv2 = (sv); \ STRLEN len; \ if (SvUTF8(sv2)) { \ if (SvGMAGICAL(sv2)) \ len = Perl_sv_len_utf8(aTHX_ \ sv_mortalcopy_flags(sv2, \ SV_NOSTEAL));\ else \ len = Perl_sv_len_utf8(aTHX_ sv2); \ } \ else SvPV_nomg(sv2, len); \ len; \ }) # define sv_len_utf8(sv) ({ SV *_sv1 = (sv); \ SvGETMAGIC(_sv1); \ sv_len_utf8_nomg(_sv1); \ }) # else /* Below is no brace groups */ PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8_nomg(SV * sv) { dTHX; STRLEN len; if (SvUTF8(sv)) { if (SvGMAGICAL(sv)) len = Perl_sv_len_utf8(aTHX_ sv_mortalcopy_flags(sv, SV_NOSTEAL)); else len = Perl_sv_len_utf8(aTHX_ sv); } else SvPV_nomg(sv, len); return len; } # define sv_len_utf8_nomg(sv) D_PPP_sv_len_utf8_nomg(sv)

PERL_STATIC_INLINE STRLEN D_PPP_sv_len_utf8(SV * sv)
{
   dTHX;
   SvGETMAGIC(sv);
   return sv_len_utf8_nomg(sv);
}
#      define sv_len_utf8(sv) D_PPP_sv_len_utf8(sv)
#    endif
#  endif    /* End of < 5.17.5 */
#endif

#define NEED_utf8_to_uvchr_buf

#if defined(UTF8f) && defined(newSVpvf)

void UTF8f(x) SV *x PREINIT: U32 u; STRLEN len; char *ptr; INIT: ptr = SvPV(x, len); u = SvUTF8(x); PPCODE: x = sv_2mortal(newSVpvf("[%" UTF8f "]", UTF8fARG(u, len, ptr))); XPUSHs(x); XSRETURN(1);

#endif

#if { VERSION >= 5.006 } /* This is just a helper fcn, not publicized */ \ /* as being available and params not what the */ \ /* API function has; works on EBCDIC too */

SV * uvchr_to_utf8(native)

    UV native
    PREINIT:
        int len;
        U8 string[UTF8_MAXBYTES+1];
        int i;
        UV uni;

    CODE:
	len = UVCHR_SKIP(native);

        for (i = 0; i < len; i++) {
            string[i] = '\0';
        }

        if (len <= 1) {
            string[0] = native;
        }
        else {
            i = len;
            uni = NATIVE_TO_UNI(native);
            while (i-- > 1) {
                string[i] = I8_TO_NATIVE_UTF8((uni & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
                uni >>= UTF_ACCUMULATION_SHIFT;
            }
            string[0] = I8_TO_NATIVE_UTF8((uni & UTF_START_MASK(len)) | UTF_START_MARK(len));
        }

        RETVAL = newSVpvn((char *) string, len);
        SvUTF8_on(RETVAL);
    OUTPUT:
        RETVAL

#endif #if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP)

STRLEN UTF8_SAFE_SKIP(s, adjustment) char * s int adjustment PREINIT: const char *const_s; CODE: const_s = s; /* Instead of passing in an 'e' ptr, use the real end, adjusted */ RETVAL = UTF8_SAFE_SKIP(const_s, s + UTF8SKIP(s) + adjustment); OUTPUT: RETVAL

#endif

#ifdef isUTF8_CHAR

STRLEN isUTF8_CHAR(s, adjustment) unsigned char * s int adjustment PREINIT: const unsigned char *const_s; const unsigned char *const_e; CODE: const_s = s; /* Instead of passing in an 'e' ptr, use the real end, adjusted */ const_e = const_s + UTF8SKIP(const_s) + adjustment; RETVAL = isUTF8_CHAR(const_s, const_e); OUTPUT: RETVAL

#endif

#ifdef foldEQ_utf8

STRLEN foldEQ_utf8(s1, l1, u1, s2, l2, u2) char *s1 UV l1 bool u1 char *s2 UV l2 bool u2 PREINIT: const char *const_s1; const char *const_s2; CODE: const_s1 = s1; const_s2 = s2; RETVAL = foldEQ_utf8(const_s1, NULL, l1, u1, const_s2, NULL, l2, u2); OUTPUT: RETVAL

#endif

#ifdef utf8_to_uvchr_buf

AV * utf8_to_uvchr_buf(s, adjustment) unsigned char *s int adjustment PREINIT: AV *av; STRLEN len; const unsigned char *const_s; CODE: av = newAV(); const_s = s; av_push(av, newSVuv(utf8_to_uvchr_buf(const_s, s + UTF8SKIP(s) + adjustment, &len))); if (len == (STRLEN) -1) { av_push(av, newSViv(-1)); } else { av_push(av, newSVuv(len)); } RETVAL = av; OUTPUT: RETVAL

#endif

#ifdef utf8_to_uvchr

AV * utf8_to_uvchr(s) unsigned char *s PREINIT: AV *av; STRLEN len; const unsigned char *const_s; CODE: av = newAV(); const_s = s; av_push(av, newSVuv(utf8_to_uvchr(const_s, &len))); if (len == (STRLEN) -1) { av_push(av, newSViv(-1)); } else { av_push(av, newSVuv(len)); } RETVAL = av; OUTPUT: RETVAL

#endif

#ifdef sv_len_utf8

STRLEN sv_len_utf8(sv) SV *sv CODE: RETVAL = sv_len_utf8(sv); OUTPUT: RETVAL

#endif

#ifdef sv_len_utf8_nomg

STRLEN sv_len_utf8_nomg(sv) SV *sv CODE: RETVAL = sv_len_utf8_nomg(sv); OUTPUT: RETVAL

#endif

#ifdef UVCHR_IS_INVARIANT

bool UVCHR_IS_INVARIANT(c) unsigned c PREINIT: CODE: RETVAL = UVCHR_IS_INVARIANT(c); OUTPUT: RETVAL

#endif

#ifdef UVCHR_SKIP

STRLEN UVCHR_SKIP(c) UV c PREINIT: CODE: RETVAL = UVCHR_SKIP(c); OUTPUT: RETVAL

#endif

BEGIN { # skip tests on 5.6.0 and earlier, plus 5.7.0 if (ivers($]) <= ivers(5.6) || ivers($]) == ivers(5.7) ) { skip 'skip: broken utf8 support', 98; exit; } require warnings; }

is(Devel::PPPort::UTF8f(42), '[42]'); is(Devel::PPPort::UTF8f('abc'), '[abc]'); is(Devel::PPPort::UTF8f("\x{263a}"), "[\x{263a}]");

my $str = "\x{A8}"; if (ivers($]) >= ivers(5.8)) { eval q{utf8::upgrade($str)} } is(Devel::PPPort::UTF8f($str), "[\x{A8}]"); if (ivers($]) >= ivers(5.8)) { eval q{utf8::downgrade($str)} } is(Devel::PPPort::UTF8f($str), "[\x{A8}]");

is(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1); is(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);

is(&Devel::PPPort::isUTF8_CHAR("A", -1), 0); is(&Devel::PPPort::isUTF8_CHAR("A", 0), 1); is(&Devel::PPPort::isUTF8_CHAR("\x{100}", -1), 0); is(&Devel::PPPort::isUTF8_CHAR("\x{100}", 0), 2);

is(&Devel::PPPort::UVCHR_IS_INVARIANT(ord("A")), 1); ok(! &Devel::PPPort::UVCHR_IS_INVARIANT(0xb6)); ok(! &Devel::PPPort::UVCHR_IS_INVARIANT(0x100));

is(&Devel::PPPort::UVCHR_SKIP(ord("A")), 1); is(&Devel::PPPort::UVCHR_SKIP(0xb6), 2, "This is a test"); is(&Devel::PPPort::UVCHR_SKIP(0x3FF), 2); is(&Devel::PPPort::UVCHR_SKIP(0x3FFF), 3); is(&Devel::PPPort::UVCHR_SKIP(0x3FFFF), 4); is(&Devel::PPPort::UVCHR_SKIP(0x3FFFFF), 5); is(&Devel::PPPort::UVCHR_SKIP(0x3FFFFFF), ord("A") == 65 ? 5 : 6); is(&Devel::PPPort::UVCHR_SKIP(0x4000000), ord("A") == 65 ? 6 : 7); if (ord("A") != 65) { skip("Test not valid on EBCDIC", 1) } else { is(&Devel::PPPort::UVCHR_SKIP(0xFFFFFFFF), 7); }

if (ivers($]) < ivers(5.8)) { skip("Perl version too early", 3); } else { is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{101}", 3, 1), 1); is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{102}", 3, 1), 0); is(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "b\x{101}", 3, 1), 0); }

my $ret = &Devel::PPPort::utf8_to_uvchr("A"); is($ret->[0], ord("A")); is($ret->[1], 1);

$ret = &Devel::PPPort::utf8_to_uvchr("\0"); is($ret->[0], 0); is($ret->[1], 1);

$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0); is($ret->[0], ord("A")); is($ret->[1], 1);

$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0); is($ret->[0], 0); is($ret->[1], 1);

my @buf_tests = ( { input => "A", adjustment => -1, warning => eval "qr/empty/", no_warnings_returned_length => 0, }, { input => "\xc4\xc5", adjustment => 0, warning => eval "qr/non-continuation/", no_warnings_returned_length => 1, }, { input => "\xc4\x80", adjustment => -1, warning => eval "qr/short|1 byte, need 2/", no_warnings_returned_length => 1, }, { input => "\xc0\x81", adjustment => 0, warning => eval "qr/overlong|2 bytes, need 1/", no_warnings_returned_length => 2, }, { input => "\xe0\x80\x81", adjustment => 0, warning => eval "qr/overlong|3 bytes, need 1/", no_warnings_returned_length => 3, }, { input => "\xf0\x80\x80\x81", adjustment => 0, warning => eval "qr/overlong|4 bytes, need 1/", no_warnings_returned_length => 4, }, { # Old algorithm failed to detect this input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", adjustment => 0, warning => eval "qr/overflow/", no_warnings_returned_length => 13, }, );

if (ord("A") != 65) { # tests not valid for EBCDIC skip("Perl version too early", 2 + 4 + (scalar @buf_tests * 5)); } else { $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0); is($ret->[0], 0x100); is($ret->[1], 2);

my @warnings;
local $SIG{__WARN__} = sub { push @warnings, @_; };

{
    use warnings 'utf8';
    $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
    is($ret->[0], 0);
    is($ret->[1], -1);

    no warnings 'utf8';
    $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
    is($ret->[0], 0xFFFD);
    is($ret->[1], 1);
}


# An empty input is an assertion failure on debugging builds.  It is
# deliberately the first test.
require Config; Config->import;
use vars '%Config';

# VMS doesn't put DEBUGGING in ccflags, and Windows doesn't have
# $Config{config_args}.  When 5.14 or later can be assumed, use
# Config::non_bincompat_options(), but for now we're stuck with this.
if (   $Config{ccflags} =~ /-DDEBUGGING/
    || $^O eq 'VMS' && $Config{config_args} =~ /\bDDEBUGGING\b/)
{
    shift @buf_tests;
    skip("Test not valid on DEBUGGING builds", 5);
}

my $test;
for $test (@buf_tests) {
    my $input = $test->{'input'};
    my $adjustment = $test->{'adjustment'};
    my $display = 'utf8_to_uvchr_buf("';
    my $i;
    for ($i = 0; $i < length($input) + $adjustment; $i++) {
        $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
    }

    $display .= '")';
    my $warning = $test->{'warning'};

    undef @warnings;
    use warnings 'utf8';
    $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
    is($ret->[0], 0,  "returned value $display; warnings enabled");
    is($ret->[1], -1, "returned length $display; warnings enabled");
    my $all_warnings = join "; ", @warnings;
    my $contains = grep { $_ =~ $warning } $all_warnings;
    is($contains, 1, $display
                . "; Got: '$all_warnings', which should contain '$warning'");

    undef @warnings;
    no warnings 'utf8';
    $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
    is($ret->[0], 0xFFFD,  "returned value $display; warnings disabled");
    is($ret->[1], $test->{'no_warnings_returned_length'},
                  "returned length $display; warnings disabled");
}
}

if (ivers($]) ge ivers(5.008)) { BEGIN { if (ivers($]) ge ivers(5.008)) { require utf8; "utf8"->import() } }

is(Devel::PPPort::sv_len_utf8("aščť"), 4);
is(Devel::PPPort::sv_len_utf8_nomg("aščť"), 4);

my $str = "áíé";
utf8::downgrade($str);
is(Devel::PPPort::sv_len_utf8($str), 3);
utf8::downgrade($str);
is(Devel::PPPort::sv_len_utf8_nomg($str), 3);
utf8::upgrade($str);
is(Devel::PPPort::sv_len_utf8($str), 3);
utf8::upgrade($str);
is(Devel::PPPort::sv_len_utf8_nomg($str), 3);

tie my $scalar, 'TieScalarCounter', "é";

is(tied($scalar)->{fetch}, 0);
is(tied($scalar)->{store}, 0);
is(Devel::PPPort::sv_len_utf8($scalar), 2);
is(tied($scalar)->{fetch}, 1);
is(tied($scalar)->{store}, 0);
is(Devel::PPPort::sv_len_utf8($scalar), 3);
is(tied($scalar)->{fetch}, 2);
is(tied($scalar)->{store}, 0);
is(Devel::PPPort::sv_len_utf8($scalar), 4);
is(tied($scalar)->{fetch}, 3);
is(tied($scalar)->{store}, 0);
is(Devel::PPPort::sv_len_utf8_nomg($scalar), 4);
is(tied($scalar)->{fetch}, 3);
is(tied($scalar)->{store}, 0);
is(Devel::PPPort::sv_len_utf8_nomg($scalar), 4);
is(tied($scalar)->{fetch}, 3);
is(tied($scalar)->{store}, 0);
} else {
skip 'skip: no utf8::downgrade/utf8::upgrade support', 23;
}

package TieScalarCounter;

sub TIESCALAR { my ($class, $value) = @_; return bless { fetch => 0, store => 0, value => $value }, $class; }

sub FETCH { BEGIN { if (main::ivers($]) ge main::ivers(5.008)) { require utf8; "utf8"->import() } } my ($self) = @_; $self->{fetch}++; return $self->{value} .= "é"; }

sub STORE { my ($self, $value) = @_; $self->{store}++; $self->{value} = $value; }

6 POD Errors

The following errors were encountered while parsing the POD:

Around line 1:

Unknown directive: =provides

Around line 11:

Unknown directive: =implementation

Around line 541:

Unknown directive: =xsinit

Around line 545:

Unknown directive: =xsubs

Around line 768:

Unknown directive: =tests

Around line 960:

Non-ASCII character seen before =encoding in 'is(Devel::PPPort::sv_len_utf8("aščť"),'. Assuming UTF-8