Annotation of embedaddon/php/ext/sqlite/libsqlite/src/encode.c, revision 1.1
1.1 ! misho 1: /*
! 2: ** 2002 April 25
! 3: **
! 4: ** The author disclaims copyright to this source code. In place of
! 5: ** a legal notice, here is a blessing:
! 6: **
! 7: ** May you do good and not evil.
! 8: ** May you find forgiveness for yourself and forgive others.
! 9: ** May you share freely, never taking more than you give.
! 10: **
! 11: *************************************************************************
! 12: ** This file contains helper routines used to translate binary data into
! 13: ** a null-terminated string (suitable for use in SQLite) and back again.
! 14: ** These are convenience routines for use by people who want to store binary
! 15: ** data in an SQLite database. The code in this file is not used by any other
! 16: ** part of the SQLite library.
! 17: **
! 18: ** $Id: encode.c 225725 2006-12-24 20:50:02Z iliaa $
! 19: */
! 20: #include <string.h>
! 21: #include <assert.h>
! 22:
! 23: /*
! 24: ** How This Encoder Works
! 25: **
! 26: ** The output is allowed to contain any character except 0x27 (') and
! 27: ** 0x00. This is accomplished by using an escape character to encode
! 28: ** 0x27 and 0x00 as a two-byte sequence. The escape character is always
! 29: ** 0x01. An 0x00 is encoded as the two byte sequence 0x01 0x01. The
! 30: ** 0x27 character is encoded as the two byte sequence 0x01 0x28. Finally,
! 31: ** the escape character itself is encoded as the two-character sequence
! 32: ** 0x01 0x02.
! 33: **
! 34: ** To summarize, the encoder works by using escape sequences as follows:
! 35: **
! 36: ** 0x00 -> 0x01 0x01
! 37: ** 0x01 -> 0x01 0x02
! 38: ** 0x27 -> 0x01 0x28
! 39: **
! 40: ** If that were all the encoder did, it would work, but in certain cases
! 41: ** it could double the size of the encoded string. For example, to
! 42: ** encode a string of 100 0x27 characters would require 100 instances of
! 43: ** the 0x01 0x28 escape sequence resulting in a 200-character output.
! 44: ** We would prefer to keep the size of the encoded string smaller than
! 45: ** this.
! 46: **
! 47: ** To minimize the encoding size, we first add a fixed offset value to each
! 48: ** byte in the sequence. The addition is modulo 256. (That is to say, if
! 49: ** the sum of the original character value and the offset exceeds 255, then
! 50: ** the higher order bits are truncated.) The offset is chosen to minimize
! 51: ** the number of characters in the string that need to be escaped. For
! 52: ** example, in the case above where the string was composed of 100 0x27
! 53: ** characters, the offset might be 0x01. Each of the 0x27 characters would
! 54: ** then be converted into an 0x28 character which would not need to be
! 55: ** escaped at all and so the 100 character input string would be converted
! 56: ** into just 100 characters of output. Actually 101 characters of output -
! 57: ** we have to record the offset used as the first byte in the sequence so
! 58: ** that the string can be decoded. Since the offset value is stored as
! 59: ** part of the output string and the output string is not allowed to contain
! 60: ** characters 0x00 or 0x27, the offset cannot be 0x00 or 0x27.
! 61: **
! 62: ** Here, then, are the encoding steps:
! 63: **
! 64: ** (1) Choose an offset value and make it the first character of
! 65: ** output.
! 66: **
! 67: ** (2) Copy each input character into the output buffer, one by
! 68: ** one, adding the offset value as you copy.
! 69: **
! 70: ** (3) If the value of an input character plus offset is 0x00, replace
! 71: ** that one character by the two-character sequence 0x01 0x01.
! 72: ** If the sum is 0x01, replace it with 0x01 0x02. If the sum
! 73: ** is 0x27, replace it with 0x01 0x28.
! 74: **
! 75: ** (4) Put a 0x00 terminator at the end of the output.
! 76: **
! 77: ** Decoding is obvious:
! 78: **
! 79: ** (5) Copy encoded characters except the first into the decode
! 80: ** buffer. Set the first encoded character aside for use as
! 81: ** the offset in step 7 below.
! 82: **
! 83: ** (6) Convert each 0x01 0x01 sequence into a single character 0x00.
! 84: ** Convert 0x01 0x02 into 0x01. Convert 0x01 0x28 into 0x27.
! 85: **
! 86: ** (7) Subtract the offset value that was the first character of
! 87: ** the encoded buffer from all characters in the output buffer.
! 88: **
! 89: ** The only tricky part is step (1) - how to compute an offset value to
! 90: ** minimize the size of the output buffer. This is accomplished by testing
! 91: ** all offset values and picking the one that results in the fewest number
! 92: ** of escapes. To do that, we first scan the entire input and count the
! 93: ** number of occurrences of each character value in the input. Suppose
! 94: ** the number of 0x00 characters is N(0), the number of occurrences of 0x01
! 95: ** is N(1), and so forth up to the number of occurrences of 0xff is N(255).
! 96: ** An offset of 0 is not allowed so we don't have to test it. The number
! 97: ** of escapes required for an offset of 1 is N(1)+N(2)+N(40). The number
! 98: ** of escapes required for an offset of 2 is N(2)+N(3)+N(41). And so forth.
! 99: ** In this way we find the offset that gives the minimum number of escapes,
! 100: ** and thus minimizes the length of the output string.
! 101: */
! 102:
/*
** Encode a binary buffer "in" of size n bytes so that it contains
** no instances of characters '\'' or '\000'.  The output is
** null-terminated and can be used as a string value in an INSERT
** or UPDATE statement.  Use sqlite_decode_binary() to convert the
** string back into its original binary.
**
** The result is written into a preallocated output buffer "out".
** "out" must be able to hold at least 2 +(257*n)/254 bytes.
** In other words, the output will be expanded by as much as 3
** bytes for every 254 bytes of input plus 2 bytes of fixed overhead.
** (This is approximately 2 + 1.0118*n or about a 1.2% size increase.)
**
** Output layout: out[0] is the chosen offset byte "e" (never 0x00 and
** never 0x27).  Each input byte is then emitted as (in[i] - e) modulo
** 256.  If that value is 0x00, 0x01 or 0x27 it is written as the escape
** byte 0x01 followed by the value plus one (0x01, 0x02 or 0x28).
**
** The return value is the number of characters in the encoded
** string, excluding the "\000" terminator.  If n<=0 the encoding is
** the one-character string "x" and 1 is returned.
**
** If out==NULL then no output is generated but the routine still returns
** the number of characters that would have been generated if out had
** not been NULL.
*/
int sqlite_encode_binary(const unsigned char *in, int n, unsigned char *out){
  int i, j, m;
  int e = 1;           /* Offset byte.  Initialized so that it is never read
                       ** indeterminate; the search below always replaces it
                       ** (see the comment on the search loop). */
  unsigned char x;
  int cnt[256];        /* cnt[v] = number of input bytes equal to v */

  if( n<=0 ){
    if( out ){
      out[0] = 'x';
      out[1] = 0;
    }
    return 1;
  }

  /* Histogram of the input bytes. */
  memset(cnt, 0, sizeof(cnt));
  for(i=n-1; i>=0; i--){ cnt[in[i]]++; }

  /* Find the offset that needs the fewest escapes.  With offset i the
  ** bytes that must be escaped are exactly those equal to i, i+1 and
  ** i+0x27 (modulo 256).  The 254 candidate sums add up to at most 3*n,
  ** so for n>0 at least one of them is smaller than n and "e" is always
  ** assigned here.
  */
  m = n;
  for(i=1; i<256; i++){
    int sum;
    if( i=='\'' ) continue;          /* 0x27 may not appear in the output */
    sum = cnt[i] + cnt[(i+1)&0xff] + cnt[(i+'\'')&0xff];
    if( sum<m ){
      m = sum;
      e = i;
      if( m==0 ) break;              /* Cannot do better than zero escapes */
    }
  }

  /* Size query only: n data bytes + m escape bytes + 1 offset byte. */
  if( out==0 ){
    return n+m+1;
  }

  out[0] = e;
  j = 1;
  for(i=0; i<n; i++){
    x = in[i] - e;                   /* unsigned char arithmetic wraps mod 256 */
    if( x==0 || x==1 || x=='\''){
      out[j++] = 1;                  /* escape marker */
      x++;                           /* 0x00->0x01, 0x01->0x02, 0x27->0x28 */
    }
    out[j++] = x;
  }
  out[j] = 0;
  assert( j==n+m+1 );
  return j;
}
! 164:
/*
** Decode the string "in" into binary data and write it into "out".
** This routine reverses the encoding created by sqlite_encode_binary().
** The output will always be a few bytes less than the input.  The number
** of bytes of output is returned.
**
** The first byte of "in" is the offset that is added (modulo 256) to
** every decoded byte.  A 0x01 byte is an escape marker: the byte that
** follows it, minus one, is the encoded value.
**
** An empty input string decodes to zero bytes and 0 is returned.  If the
** input ends immediately after an escape marker it is not a well-formed
** encoding and -1 is returned; in that case the bytes already written to
** "out" are left in place.
**
** The "in" and "out" parameters may point to the same buffer in order
** to decode a string in place.
*/
int sqlite_decode_binary(const unsigned char *in, unsigned char *out){
  int i, e;
  unsigned char c;
  e = *(in++);
  if( e==0 ){
    return 0;
  }
  i = 0;
  while( (c = *(in++))!=0 ){
    if( c==1 ){
      c = *(in++);
      if( c==0 ){
        /* Truncated escape: the terminator must not be consumed as data,
        ** otherwise the loop would run past the end of the string. */
        return -1;
      }
      c--;
    }
    out[i++] = c + e;   /* stored into unsigned char, so wraps modulo 256 */
  }
  return i;
}
! 191:
#ifdef ENCODER_TEST
#include <stdio.h>
#include <stdlib.h>   /* rand(), exit() */
/*
** The subroutines above are not tested by the usual test suite.  To test
** these routines, compile just this one file with a -DENCODER_TEST=1 option
** and run the result.
**
** Each iteration builds an input of random length (every 100th iteration
** uses a repeating 0..255 ramp, the others use random bytes), encodes it,
** checks the reported length against strlen() and against the size-only
** query, checks the size bound and the absence of '\'' in the output, then
** decodes in place and compares with the original input.  The process
** exits with status 1 on the first failure.
*/
int main(int argc, char **argv){
  int i, j, n, m, nOut, nLen, nByteIn, nByteOut;
  unsigned char in[30000];
  unsigned char out[33000];

  (void)argc;
  (void)argv;

  nByteIn = nByteOut = 0;
  for(i=0; i<(int)sizeof(in); i++){
    printf("Test %d: ", i+1);
    n = rand() % (i+1);
    if( i%100==0 ){
      int k;
      for(j=k=0; j<n; j++){
        in[j] = k;
        k = (k+1)&0xff;
      }
    }else{
      for(j=0; j<n; j++) in[j] = rand() & 0xff;
    }
    nByteIn += n;
    nOut = sqlite_encode_binary(in, n, out);
    nByteOut += nOut;
    /* The encoded string fits in out[], so its length fits in an int. */
    nLen = (int)strlen((const char*)out);
    if( nOut!=nLen ){
      printf(" ERROR return value is %d instead of %d\n", nOut, nLen);
      exit(1);
    }
    if( nOut!=sqlite_encode_binary(in, n, 0) ){
      printf(" ERROR actual output size disagrees with predicted size\n");
      exit(1);
    }
    m = (256*n + 1262)/253;
    printf("size %d->%d (max %d)", n, nLen+1, m);
    if( nLen+1>m ){
      printf(" ERROR output too big\n");
      exit(1);
    }
    for(j=0; out[j]; j++){
      if( out[j]=='\'' ){
        printf(" ERROR contains (')\n");
        exit(1);
      }
    }
    j = sqlite_decode_binary(out, out);
    if( j!=n ){
      printf(" ERROR decode size %d\n", j);
      exit(1);
    }
    if( memcmp(in, out, n)!=0 ){
      printf(" ERROR decode mismatch\n");
      exit(1);
    }
    printf(" OK\n");
  }
  fprintf(stderr,"Finished.  Total encoding: %d->%d bytes\n",
          nByteIn, nByteOut);
  fprintf(stderr,"Avg size increase: %.3f%%\n",
    (nByteOut-nByteIn)*100.0/(double)nByteIn);
  return 0;
}
#endif /* ENCODER_TEST */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>