nginx-unit/src/nxt_utf8.c


/*
 * Copyright (C) Igor Sysoev
 * Copyright (C) NGINX, Inc.
 */

#include <nxt_main.h>

/*
 * The nxt_unicode_lowcase.h file is the auto-generated file from
 * the CaseFolding-6.3.0.txt file provided by Unicode, Inc.:
 *
 *   ./lib/src/nxt_unicode_lowcase.pl CaseFolding-6.3.0.txt
 *
 * This file should be copied to system specific nxt_unicode_SYSTEM_lowcase.h
 * file and utf8_file_name_test should be built with this file.
 * Then a correct system specific file should be generated:
 *
 *   ./build/utf8_file_name_test | ./lib/src/nxt_unicode_lowcase.pl
 *
 * Only common and simple case foldings are supported.  Full case foldings
 * is not supported.  Combined characters are also not supported.
 */

#if (NXT_MACOSX)
#include <nxt_unicode_macosx_lowcase.h>

#else
#include <nxt_unicode_lowcase.h>
#endif


u_char *
nxt_utf8_encode(u_char *p, uint32_t u)
{
    if (u < 0x80) {
        *p++ = (u_char) (u & 0xff);
        return p;
    }

    if (u < 0x0800) {
        *p++ = (u_char) (( u >> 6)          | 0xc0);
        *p++ = (u_char) (( u        & 0x3f) | 0x80);
        return p;
    }

    if (u < 0x10000) {
        *p++ = (u_char) ( (u >> 12)         | 0xe0);
        *p++ = (u_char) (((u >>  6) & 0x3f) | 0x80);
        *p++ = (u_char) (( u        & 0x3f) | 0x80);
        return p;
    }

    if (u < 0x110000) {
        *p++ = (u_char) ( (u >> 18)         | 0xf0);
        *p++ = (u_char) (((u >> 12) & 0x3f) | 0x80);
        *p++ = (u_char) (((u >>  6) & 0x3f) | 0x80);
        *p++ = (u_char) (( u        & 0x3f) | 0x80);
        return p;
    }

    return NULL;
}


/*
 * nxt_utf8_decode() decodes UTF-8 sequences and returns a valid
 * character 0x00 - 0x10ffff, or 0xffffffff for invalid or overlong
 * UTF-8 sequence.
 */

uint32_t
nxt_utf8_decode(const u_char **start, const u_char *end)
{
    uint32_t  u;

    u = (uint32_t) **start;

    if (u < 0x80) {
        (*start)++;
        return u;
    }

    return nxt_utf8_decode2(start, end);
}


/*
 * nxt_utf8_decode2() decodes two and more bytes UTF-8 sequences only
 * and returns a valid character 0x80 - 0x10ffff, or 0xffffffff for
 * invalid or overlong UTF-8 sequence.
 */

uint32_t
nxt_utf8_decode2(const u_char **start, const u_char *end)
{
    u_char        c;
    size_t        n;
    uint32_t      u, overlong;
    const u_char  *p;

    p = *start;
    u = (uint32_t) *p;

    if (u >= 0xe0) {

        if (u >= 0xf0) {

            if (nxt_slow_path(u > 0xf4)) {
                /*
                 * The maximum valid Unicode character is 0x10ffff
                 * which is encoded as 0xf4 0x8f 0xbf 0xbf.
                 */
                return 0xffffffff;
            }

            u &= 0x07;
            overlong = 0x00ffff;
            n = 3;

        } else {
            u &= 0x0f;
            overlong = 0x07ff;
            n = 2;
        }

    } else if (u >= 0xc2) {

        /* 0x80 is encoded as 0xc2 0x80. */

        u &= 0x1f;
        overlong = 0x007f;
        n = 1;

    } else {
        /* u <= 0xc2 */
        return 0xffffffff;
    }

    p++;

    if (nxt_fast_path(p + n <= end)) {

        do {
            c = *p++;
            /*
             * The byte must in the 0x80 - 0xbf range.
             * Values below 0x80 become >= 0x80.
             */
            c = c - 0x80;

            if (nxt_slow_path(c > 0x3f)) {
                return 0xffffffff;
            }

            u = (u << 6) | c;
            n--;

        } while (n != 0);

        if (overlong < u && u < 0x110000) {
            *start = p;
            return u;
        }
    }

    return 0xffffffff;
}


/*
 * nxt_utf8_casecmp() tests only up to the minimum of given lengths, but
 * requires lengths of both strings because otherwise nxt_utf8_decode2()
 * may fail due to incomplete sequence.
 */

nxt_int_t
nxt_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1,
    size_t len2)
{
    int32_t       n;
    uint32_t      u1, u2;
    const u_char  *end1, *end2;

    end1 = start1 + len1;
    end2 = start2 + len2;

    while (start1 < end1 && start2 < end2) {

        u1 = nxt_utf8_lowcase(&start1, end1);

        u2 = nxt_utf8_lowcase(&start2, end2);

        if (nxt_slow_path((u1 | u2) == 0xffffffff)) {
            return NXT_UTF8_SORT_INVALID;
        }

        n = u1 - u2;

        if (n != 0) {
            return (nxt_int_t) n;
        }
    }

    return 0;
}


uint32_t
nxt_utf8_lowcase(const u_char **start, const u_char *end)
{
    uint32_t        u;
    const uint32_t  *block;

    u = (uint32_t) **start;

    if (nxt_fast_path(u < 0x80)) {
        (*start)++;

        return nxt_unicode_block_000[u];
    }

    u = nxt_utf8_decode2(start, end);

    if (u <= NXT_UNICODE_MAX_LOWCASE) {
        block = nxt_unicode_blocks[u / NXT_UNICODE_BLOCK_SIZE];

        if (block != NULL) {
            return block[u % NXT_UNICODE_BLOCK_SIZE];
        }
    }

    return u;
}


ssize_t
nxt_utf8_length(const u_char *p, size_t len)
{
    ssize_t       length;
    const u_char  *end;

    length = 0;

    end = p + len;

    while (p < end) {
        if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xffffffff)) {
            return -1;
        }

        length++;
    }

    return length;
}


nxt_bool_t
nxt_utf8_is_valid(const u_char *p, size_t len)
{
    const u_char  *end;

    end = p + len;

    while (p < end) {
        if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xffffffff)) {
            return 0;
        }
    }

    return 1;
}