Annotation of embedaddon/pcre/pcre16_valid_utf16.c, revision 1.1
1.1 ! misho 1: /*************************************************
! 2: * Perl-Compatible Regular Expressions *
! 3: *************************************************/
! 4:
! 5: /* PCRE is a library of functions to support regular expressions whose syntax
! 6: and semantics are as close as possible to those of the Perl 5 language.
! 7:
! 8: Written by Philip Hazel
! 9: Copyright (c) 1997-2012 University of Cambridge
! 10:
! 11: -----------------------------------------------------------------------------
! 12: Redistribution and use in source and binary forms, with or without
! 13: modification, are permitted provided that the following conditions are met:
! 14:
! 15: * Redistributions of source code must retain the above copyright notice,
! 16: this list of conditions and the following disclaimer.
! 17:
! 18: * Redistributions in binary form must reproduce the above copyright
! 19: notice, this list of conditions and the following disclaimer in the
! 20: documentation and/or other materials provided with the distribution.
! 21:
! 22: * Neither the name of the University of Cambridge nor the names of its
! 23: contributors may be used to endorse or promote products derived from
! 24: this software without specific prior written permission.
! 25:
! 26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
! 27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
! 30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! 31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! 32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! 33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! 34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! 35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! 36: POSSIBILITY OF SUCH DAMAGE.
! 37: -----------------------------------------------------------------------------
! 38: */
! 39:
! 40:
! 41: /* This module contains an internal function for validating UTF-16 character
! 42: strings. */
! 43:
! 44:
! 45: #ifdef HAVE_CONFIG_H
! 46: #include "config.h"
! 47: #endif
! 48:
! 49: /* Generate code with 16 bit character support. */
! 50: #define COMPILE_PCRE16
! 51:
! 52: #include "pcre_internal.h"
! 53:
! 54:
! 55: /*************************************************
! 56: * Validate a UTF-16 string *
! 57: *************************************************/
! 58:
! 59: /* This function is called (optionally) at the start of compile or match, to
! 60: check that a supposed UTF-16 string is actually valid. The early check means
! 61: that subsequent code can assume it is dealing with a valid string. The check
! 62: can be turned off for maximum performance, but the consequences of supplying an
! 63: invalid string are then undefined.
! 64:
! 65: From release 8.21 more information about the details of the error are passed
! 66: back in the returned value:
! 67:
! 68: PCRE_UTF16_ERR0 No error
! 69: PCRE_UTF16_ERR1 Missing low surrogate at the end of the string
! 70: PCRE_UTF16_ERR2 Invalid low surrogate
! 71: PCRE_UTF16_ERR3 Isolated low surrogate
! 72: PCRE_UTF16_ERR4 Not allowed character
! 73:
! 74: Arguments:
! 75: string points to the string
! 76: length length of string, or -1 if the string is zero-terminated
! 77: errp pointer to an error position offset variable
! 78:
! 79: Returns: = 0 if the string is a valid UTF-16 string
! 80: > 0 otherwise, setting the offset of the bad character
! 81: */
! 82:
! 83: int
! 84: PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset)
! 85: {
! 86: #ifdef SUPPORT_UTF
! 87: register PCRE_PUCHAR p;
! 88: register pcre_uchar c;
! 89:
! 90: if (length < 0)
! 91: {
! 92: for (p = string; *p != 0; p++);
! 93: length = p - string;
! 94: }
! 95:
! 96: for (p = string; length-- > 0; p++)
! 97: {
! 98: c = *p;
! 99:
! 100: if ((c & 0xf800) != 0xd800)
! 101: {
! 102: /* Normal UTF-16 code point. Neither high nor low surrogate. */
! 103:
! 104: /* This is probably a BOM from a different byte-order.
! 105: Regardless, the string is rejected. */
! 106: if (c == 0xfffe)
! 107: {
! 108: *erroroffset = p - string;
! 109: return PCRE_UTF16_ERR4;
! 110: }
! 111: }
! 112: else if ((c & 0x0400) == 0)
! 113: {
! 114: /* High surrogate. */
! 115:
! 116: /* Must be a followed by a low surrogate. */
! 117: if (length == 0)
! 118: {
! 119: *erroroffset = p - string;
! 120: return PCRE_UTF16_ERR1;
! 121: }
! 122: p++;
! 123: length--;
! 124: if ((*p & 0xfc00) != 0xdc00)
! 125: {
! 126: *erroroffset = p - string;
! 127: return PCRE_UTF16_ERR2;
! 128: }
! 129: }
! 130: else
! 131: {
! 132: /* Isolated low surrogate. Always an error. */
! 133: *erroroffset = p - string;
! 134: return PCRE_UTF16_ERR3;
! 135: }
! 136: }
! 137:
! 138: #else /* SUPPORT_UTF */
! 139: (void)(string); /* Keep picky compilers happy */
! 140: (void)(length);
! 141: #endif /* SUPPORT_UTF */
! 142:
! 143: return PCRE_UTF16_ERR0; /* This indicates success */
! 144: }
! 145:
! 146: /* End of pcre16_valid_utf16.c */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>