embedaddon/pcre/pcre16_valid_utf16.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre / pcre16_valid_utf16.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 23:50:25 2012 UTC (12 years, 4 months ago) by misho
Branches: pcre, MAIN
CVS tags: v8_31, v8_30, HEAD

pcre

1: /************************************************* 2: * Perl-Compatible Regular Expressions * 3: *************************************************/ 4: 5: /* PCRE is a library of functions to support regular expressions whose syntax 6: and semantics are as close as possible to those of the Perl 5 language. 7: 8: Written by Philip Hazel 9: Copyright (c) 1997-2012 University of Cambridge 10: 11: ----------------------------------------------------------------------------- 12: Redistribution and use in source and binary forms, with or without 13: modification, are permitted provided that the following conditions are met: 14: 15: * Redistributions of source code must retain the above copyright notice, 16: this list of conditions and the following disclaimer. 17: 18: * Redistributions in binary form must reproduce the above copyright 19: notice, this list of conditions and the following disclaimer in the 20: documentation and/or other materials provided with the distribution. 21: 22: * Neither the name of the University of Cambridge nor the names of its 23: contributors may be used to endorse or promote products derived from 24: this software without specific prior written permission. 25: 26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36: POSSIBILITY OF SUCH DAMAGE. 37: ----------------------------------------------------------------------------- 38: */ 39: 40: 41: /* This module contains an internal function for validating UTF-16 character 42: strings. */ 43: 44: 45: #ifdef HAVE_CONFIG_H 46: #include "config.h" 47: #endif 48: 49: /* Generate code with 16 bit character support. */ 50: #define COMPILE_PCRE16 51: 52: #include "pcre_internal.h" 53: 54: 55: /************************************************* 56: * Validate a UTF-16 string * 57: *************************************************/ 58: 59: /* This function is called (optionally) at the start of compile or match, to 60: check that a supposed UTF-16 string is actually valid. The early check means 61: that subsequent code can assume it is dealing with a valid string. The check 62: can be turned off for maximum performance, but the consequences of supplying an 63: invalid string are then undefined. 64: 65: From release 8.21 more information about the details of the error are passed 66: back in the returned value: 67: 68: PCRE_UTF16_ERR0 No error 69: PCRE_UTF16_ERR1 Missing low surrogate at the end of the string 70: PCRE_UTF16_ERR2 Invalid low surrogate 71: PCRE_UTF16_ERR3 Isolated low surrogate 72: PCRE_UTF16_ERR4 Not allowed character 73: 74: Arguments: 75: string points to the string 76: length length of string, or -1 if the string is zero-terminated 77: errp pointer to an error position offset variable 78: 79: Returns: = 0 if the string is a valid UTF-16 string 80: > 0 otherwise, setting the offset of the bad character 81: */ 82: 83: int 84: PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset) 85: { 86: #ifdef SUPPORT_UTF 87: register PCRE_PUCHAR p; 88: register pcre_uchar c; 89: 90: if (length < 0) 91: { 92: for (p = string; *p != 0; p++); 93: length = p - string; 94: } 95: 96: for (p = string; length-- > 0; p++) 97: { 98: c = *p; 99: 100: if ((c & 0xf800) != 0xd800) 101: { 102: /* Normal UTF-16 code point. Neither high nor low surrogate. */ 103: 104: /* This is probably a BOM from a different byte-order. 105: Regardless, the string is rejected. */ 106: if (c == 0xfffe) 107: { 108: *erroroffset = p - string; 109: return PCRE_UTF16_ERR4; 110: } 111: } 112: else if ((c & 0x0400) == 0) 113: { 114: /* High surrogate. */ 115: 116: /* Must be a followed by a low surrogate. */ 117: if (length == 0) 118: { 119: *erroroffset = p - string; 120: return PCRE_UTF16_ERR1; 121: } 122: p++; 123: length--; 124: if ((*p & 0xfc00) != 0xdc00) 125: { 126: *erroroffset = p - string; 127: return PCRE_UTF16_ERR2; 128: } 129: } 130: else 131: { 132: /* Isolated low surrogate. Always an error. */ 133: *erroroffset = p - string; 134: return PCRE_UTF16_ERR3; 135: } 136: } 137: 138: #else /* SUPPORT_UTF */ 139: (void)(string); /* Keep picky compilers happy */ 140: (void)(length); 141: #endif /* SUPPORT_UTF */ 142: 143: return PCRE_UTF16_ERR0; /* This indicates success */ 144: } 145: 146: /* End of pcre16_valid_utf16.c */