Annotation of embedaddon/php/ext/sqlite/libsqlite/src/encode.c, revision 1.1

1.1     ! misho       1: /*
        !             2: ** 2002 April 25
        !             3: **
        !             4: ** The author disclaims copyright to this source code.  In place of
        !             5: ** a legal notice, here is a blessing:
        !             6: **
        !             7: **    May you do good and not evil.
        !             8: **    May you find forgiveness for yourself and forgive others.
        !             9: **    May you share freely, never taking more than you give.
        !            10: **
        !            11: *************************************************************************
        !            12: ** This file contains helper routines used to translate binary data into
        !            13: ** a null-terminated string (suitable for use in SQLite) and back again.
        !            14: ** These are convenience routines for use by people who want to store binary
        !            15: ** data in an SQLite database.  The code in this file is not used by any other
        !            16: ** part of the SQLite library.
        !            17: **
        !            18: ** $Id: encode.c 225725 2006-12-24 20:50:02Z iliaa $
        !            19: */
        !            20: #include <string.h>
        !            21: #include <assert.h>
        !            22: 
        !            23: /*
        !            24: ** How This Encoder Works
        !            25: **
        !            26: ** The output is allowed to contain any character except 0x27 (') and
        !            27: ** 0x00.  This is accomplished by using an escape character to encode
        !            28: ** 0x27 and 0x00 as a two-byte sequence.  The escape character is always
        !            29: ** 0x01.  An 0x00 is encoded as the two byte sequence 0x01 0x01.  The
        !            30: ** 0x27 character is encoded as the two byte sequence 0x01 0x28.  Finally,
        !            31: ** the escape character itself is encoded as the two-character sequence
        !            32: ** 0x01 0x02.
        !            33: **
        !            34: ** To summarize, the encoder works by using escape sequences as follows:
        !            35: **
        !            36: **       0x00  ->  0x01 0x01
        !            37: **       0x01  ->  0x01 0x02
        !            38: **       0x27  ->  0x01 0x28
        !            39: **
        !            40: ** If that were all the encoder did, it would work, but in certain cases
        !            41: ** it could double the size of the encoded string.  For example, to
        !            42: ** encode a string of 100 0x27 characters would require 100 instances of
        !            43: ** the 0x01 0x28 escape sequence resulting in a 200-character output.
        !            44: ** We would prefer to keep the size of the encoded string smaller than
        !            45: ** this.
        !            46: **
        !            47: ** To minimize the encoding size, we first add a fixed offset value to each 
        !            48: ** byte in the sequence.  The addition is modulo 256.  (That is to say, if
        !            49: ** the sum of the original character value and the offset exceeds 255, then
        !            50: ** the higher order bits are truncated.)  The offset is chosen to minimize
        !            51: ** the number of characters in the string that need to be escaped.  For
        !            52: ** example, in the case above where the string was composed of 100 0x27
        !            53: ** characters, the offset might be 0x01.  Each of the 0x27 characters would
        !            54: ** then be converted into an 0x28 character which would not need to be
        !            55: ** escaped at all and so the 100 character input string would be converted
        !            56: ** into just 100 characters of output.  Actually 101 characters of output - 
        !            57: ** we have to record the offset used as the first byte in the sequence so
        !            58: ** that the string can be decoded.  Since the offset value is stored as
        !            59: ** part of the output string and the output string is not allowed to contain
        !            60: ** characters 0x00 or 0x27, the offset cannot be 0x00 or 0x27.
        !            61: **
        !            62: ** Here, then, are the encoding steps:
        !            63: **
        !            64: **     (1)   Choose an offset value and make it the first character of
        !            65: **           output.
        !            66: **
        !            67: **     (2)   Copy each input character into the output buffer, one by
        !            68: **           one, adding the offset value as you copy.
        !            69: **
        !            70: **     (3)   If the value of an input character plus offset is 0x00, replace
        !            71: **           that one character by the two-character sequence 0x01 0x01.
        !            72: **           If the sum is 0x01, replace it with 0x01 0x02.  If the sum
        !            73: **           is 0x27, replace it with 0x01 0x28.
        !            74: **
        !            75: **     (4)   Put a 0x00 terminator at the end of the output.
        !            76: **
        !            77: ** Decoding is obvious:
        !            78: **
        !            79: **     (5)   Copy encoded characters except the first into the decode 
        !            80: **           buffer.  Set the first encoded character aside for use as
        !            81: **           the offset in step 7 below.
        !            82: **
        !            83: **     (6)   Convert each 0x01 0x01 sequence into a single character 0x00.
        !            84: **           Convert 0x01 0x02 into 0x01.  Convert 0x01 0x28 into 0x27.
        !            85: **
        !            86: **     (7)   Subtract the offset value that was the first character of
        !            87: **           the encoded buffer from all characters in the output buffer.
        !            88: **
        !            89: ** The only tricky part is step (1) - how to compute an offset value to
        !            90: ** minimize the size of the output buffer.  This is accomplished by testing
        !            91: ** all offset values and picking the one that results in the fewest number
        !            92: ** of escapes.  To do that, we first scan the entire input and count the
        !            93: ** number of occurrences of each character value in the input.  Suppose
        !            94: ** the number of 0x00 characters is N(0), the number of occurrences of 0x01
        !            95: ** is N(1), and so forth up to the number of occurrences of 0xff is N(255).
        !            96: ** An offset of 0 is not allowed so we don't have to test it.  The number
        !            97: ** of escapes required for an offset of 1 is N(1)+N(2)+N(40).  The number
        !            98: ** of escapes required for an offset of 2 is N(2)+N(3)+N(41).  And so forth.
        !            99: ** In this way we find the offset that gives the minimum number of escapes,
        !           100: ** and thus minimizes the length of the output string.
        !           101: */
        !           102: 
        /*
        ** Translate the n-byte binary buffer "in" into a string that holds
        ** neither a '\'' byte nor a '\000' byte, so that it can be embedded as
        ** a string value in an INSERT or UPDATE statement.
        ** sqlite_decode_binary() performs the reverse translation.
        **
        ** Layout of the result:
        **
        **     byte 0      the offset value chosen for this input (never 0x00
        **                 or 0x27)
        **     bytes 1..   every input byte minus the offset (modulo 256); a
        **                 difference of 0x00, 0x01 or 0x27 is written as the
        **                 escape byte 0x01 followed by the difference plus one
        **     last byte   a 0x00 terminator
        **
        ** The offset is the first candidate, scanning upward from 1, that
        ** needs the fewest escapes.
        **
        ** "out" is a caller-supplied buffer that must have room for at least
        ** 2 + (257*n)/254 bytes.  If out is NULL nothing is written and only
        ** the length is computed.
        **
        ** When n<=0 the encoding is the one-character string "x".
        **
        ** The return value is the length of the encoded string, not counting
        ** the terminator.
        */
        int sqlite_encode_binary(const unsigned char *in, int n, unsigned char *out){
          int freq[256];            /* freq[b] = how many times byte b occurs in "in" */
          int pos;                  /* Read position in "in" */
          int len;                  /* Number of bytes written to "out" so far */
          int cand;                 /* Offset currently being evaluated */
          int offset = 0;           /* Best offset found so far */
          int best;                 /* Escapes needed when using "offset" */
          unsigned char ch;

          /* Nothing to encode: emit the fixed one-character encoding. */
          if( n<=0 ){
            if( out ){
              out[0] = 'x';
              out[1] = 0;
            }
            return 1;
          }

          /* Build a histogram of the input bytes. */
          memset(freq, 0, sizeof(freq));
          for(pos=0; pos<n; pos++){
            freq[in[pos]] += 1;
          }

          /* Evaluate every legal offset.  With offset "cand", the input bytes
          ** cand, cand+1 and cand+0x27 are the ones that have to be escaped.
          ** Offset 0x27 is skipped because it may not appear in the output. */
          best = n;
          for(cand=1; cand<256; cand++){
            int cost;
            if( cand!='\'' ){
              cost = freq[cand] + freq[(cand+1)&0xff] + freq[(cand+'\'')&0xff];
              if( cost<best ){
                best = cost;
                offset = cand;
                if( best==0 ) break;      /* Cannot do better than no escapes */
              }
            }
          }

          /* Size-only request. */
          if( out==0 ){
            return n+best+1;
          }

          /* Emit the offset, then the shifted (and possibly escaped) bytes. */
          out[0] = offset;
          len = 1;
          for(pos=0; pos<n; pos++){
            ch = in[pos] - offset;
            switch( ch ){
              case 0:
              case 1:
              case '\'':
                out[len++] = 1;
                ch++;
                break;
              default:
                break;
            }
            out[len++] = ch;
          }
          out[len] = 0;
          assert( len==n+best+1 );
          return len;
        }
        !           164: 
        /*
        ** Decode the null-terminated string "in" into binary data and write it
        ** into "out".  This routine reverses the encoding created by
        ** sqlite_encode_binary().
        **
        ** The first byte of "in" is the offset that is added back (modulo 256)
        ** to every decoded byte.  Within the remainder of the string, a 0x01
        ** byte is an escape: the byte that follows it, minus one, is the
        ** encoded value.
        **
        ** The output is always shorter than the input, so "out" never needs
        ** to be larger than "in".  "in" and "out" may point to the same buffer
        ** in order to decode a string in place.
        **
        ** The return value is the number of bytes written to "out".  An empty
        ** input string decodes to zero bytes.  If the input ends in the middle
        ** of an escape sequence (a 0x01 byte immediately followed by the
        ** terminator) the encoding is malformed and -1 is returned; in that
        ** case "out" may already have been partially written.  Other forms of
        ** malformed input are not detected.
        */
        int sqlite_decode_binary(const unsigned char *in, unsigned char *out){
          int i, e;
          unsigned char c;
          e = *(in++);
          if( e==0 ){
            /* Empty string: there is no offset byte and nothing to decode. */
            return 0;
          }
          i = 0;
          while( (c = *(in++))!=0 ){
            if( c==1 ){
              c = *(in++);
              if( c==0 ){
                /* The string ended right after the escape byte.  Treating the
                ** terminator as data would make the loop run off the end of
                ** the input buffer, so report the malformed encoding. */
                return -1;
              }
              c--;
            }
            out[i++] = (unsigned char)(c + e);
          }
          return i;
        }
        !           191: 
        #ifdef ENCODER_TEST
        #include <stdio.h>
        #include <stdlib.h>
        /*
        ** The subroutines above are not tested by the usual test suite.  To test
        ** these routines, compile just this one file with a -DENCODER_TEST=1 option
        ** and run the result.
        **
        ** Each iteration builds an input buffer of random length (every 100th
        ** iteration uses the repeating byte pattern 0,1,2,...,255 instead of
        ** random bytes), encodes it, and verifies that:
        **
        **    *  the returned length equals strlen() of the output,
        **    *  the length predicted with out==NULL matches the real length,
        **    *  the output stays within the size bound computed below,
        **    *  the output contains no '\'' character,
        **    *  decoding in place reproduces the original input.
        **
        ** The program exits with status 1 at the first failed check and prints
        ** summary statistics to stderr when every iteration passes.
        */
        int main(int argc, char **argv){
          int i, j, n, m, nOut, nLen, nByteIn, nByteOut;
          unsigned char in[30000];
          unsigned char out[33000];

          (void)argc;
          (void)argv;
          nByteIn = nByteOut = 0;
          for(i=0; i<(int)sizeof(in); i++){
            printf("Test %d: ", i+1);
            n = rand() % (i+1);
            if( i%100==0 ){
              int k;
              for(j=k=0; j<n; j++){
                in[j] = k;
                k = (k+1)&0xff;
              }
            }else{
              for(j=0; j<n; j++) in[j] = rand() & 0xff;
            }
            nByteIn += n;
            nOut = sqlite_encode_binary(in, n, out);
            nByteOut += nOut;
            /* Measure the encoded string once, before it is decoded in place
            ** further down.  The value is kept in an int so that it matches
            ** the %d conversions used in the messages. */
            nLen = (int)strlen((const char*)out);
            if( nOut!=nLen ){
              printf(" ERROR return value is %d instead of %d\n", nOut, nLen);
              exit(1);
            }
            if( nOut!=sqlite_encode_binary(in, n, 0) ){
              printf(" ERROR actual output size disagrees with predicted size\n");
              exit(1);
            }
            m = (256*n + 1262)/253;
            printf("size %d->%d (max %d)", n, nLen+1, m);
            if( nLen+1>m ){
              printf(" ERROR output too big\n");
              exit(1);
            }
            for(j=0; out[j]; j++){
              if( out[j]=='\'' ){
                printf(" ERROR contains (')\n");
                exit(1);
              }
            }
            j = sqlite_decode_binary(out, out);
            if( j!=n ){
              printf(" ERROR decode size %d\n", j);
              exit(1);
            }
            if( memcmp(in, out, n)!=0 ){
              printf(" ERROR decode mismatch\n");
              exit(1);
            }
            printf(" OK\n");
          }
          fprintf(stderr,"Finished.  Total encoding: %d->%d bytes\n",
                  nByteIn, nByteOut);
          fprintf(stderr,"Avg size increase: %.3f%%\n",
            (nByteOut-nByteIn)*100.0/(double)nByteIn);
          return 0;
        }
        #endif /* ENCODER_TEST */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>