Annotation of embedaddon/rsync/patches/link-by-hash.diff, revision 1.1.1.1

1.1       misho       1: Jason M. Felice wrote:
                      2: 
                      3: This patch adds the --link-by-hash=DIR option, which hard links received files
                      4: in a link farm arranged by MD4 or MD5 file hash.  The result is that the system
                      5: will only store one copy of the unique contents of each file, regardless of the
                      6: file's name.
                      7: 
                      8: To use this patch, run these commands for a successful build:
                      9: 
                     10:     patch -p1 <patches/link-by-hash.diff
                     11:     ./prepare-source
                     12:     ./configure
                     13:     make
                     14: 
                     15: based-on: e94bad1c156fc3910f24e2b3b71a81b0b0bdeb70
                     16: diff --git a/Makefile.in b/Makefile.in
                     17: --- a/Makefile.in
                     18: +++ b/Makefile.in
                     19: @@ -44,7 +44,7 @@ OBJS1=flist.o rsync.o generator.o receiver.o cleanup.o sender.o exclude.o \
                     20:        util.o util2.o main.o checksum.o match.o syscall.o log.o backup.o delete.o
                     21:  OBJS2=options.o io.o compat.o hlink.o token.o uidlist.o socket.o hashtable.o \
                     22:        usage.o fileio.o batch.o clientname.o chmod.o acls.o xattrs.o
                     23: -OBJS3=progress.o pipe.o @ASM@
                     24: +OBJS3=progress.o pipe.o hashlink.o @ASM@
                     25:  DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
                     26:  popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
                     27:        popt/popthelp.o popt/poptparse.o
                     28: diff --git a/checksum.c b/checksum.c
                     29: --- a/checksum.c
                     30: +++ b/checksum.c
                     31: @@ -40,6 +40,8 @@ extern int whole_file;
                     32:  extern int checksum_seed;
                     33:  extern int protocol_version;
                     34:  extern int proper_seed_order;
                     35: +extern char *link_by_hash_dir;
                     36: +extern char link_by_hash_extra_sum[MAX_DIGEST_LEN];
                     37:  extern const char *checksum_choice;
                     38:  
                     39:  struct name_num_obj valid_checksums = {
                     40: @@ -444,7 +446,7 @@ static union {
                     41:        MD4_CTX m4;
                     42:  #endif
                     43:        MD5_CTX m5;
                     44: -} ctx;
                     45: +} ctx, ctx2;
                     46:  #ifdef SUPPORT_XXHASH
                     47:  static XXH64_state_t* xxh64_state;
                     48:  #endif
                     49: @@ -483,6 +485,8 @@ void sum_init(int csum_type, int seed)
                     50:  #endif
                     51:          case CSUM_MD5:
                     52:                MD5_Init(&ctx.m5);
                     53: +              if (link_by_hash_dir)
                     54: +                      MD5_Init(&ctx2.m5);
                     55:                break;
                     56:          case CSUM_MD4:
                     57:  #ifdef USE_OPENSSL
                     58: @@ -533,6 +537,8 @@ void sum_update(const char *p, int32 len)
                     59:  #endif
                     60:          case CSUM_MD5:
                     61:                MD5_Update(&ctx.m5, (uchar *)p, len);
                     62: +              if (link_by_hash_dir)
                     63: +                      MD5_Update(&ctx2.m5, (uchar *)p, len);
                     64:                break;
                     65:          case CSUM_MD4:
                     66:  #ifdef USE_OPENSSL
                     67: @@ -598,6 +604,8 @@ int sum_end(char *sum)
                     68:  #endif
                     69:          case CSUM_MD5:
                     70:                MD5_Final((uchar *)sum, &ctx.m5);
                     71: +              if (link_by_hash_dir)
                     72: +                      MD5_Final((uchar *)link_by_hash_extra_sum, &ctx2.m5);
                     73:                break;
                     74:          case CSUM_MD4:
                     75:  #ifdef USE_OPENSSL
                     76: diff --git a/clientserver.c b/clientserver.c
                     77: --- a/clientserver.c
                     78: +++ b/clientserver.c
                     79: @@ -52,6 +52,7 @@ extern int logfile_format_has_i;
                     80:  extern int logfile_format_has_o_or_i;
                     81:  extern char *bind_address;
                     82:  extern char *config_file;
                     83: +extern char *link_by_hash_dir;
                     84:  extern char *logfile_format;
                     85:  extern char *files_from;
                     86:  extern char *tmpdir;
                     87: @@ -665,6 +666,9 @@ static int rsync_module(int f_in, int f_out, int i, const char *addr, const char
                     88:                return -1;
                     89:        }
                     90:  
                     91: +      if (*lp_link_by_hash_dir(i))
                     92: +              link_by_hash_dir = lp_link_by_hash_dir(i);
                     93: +
                     94:        if (am_daemon > 0) {
                     95:                rprintf(FLOG, "rsync allowed access on module %s from %s (%s)\n",
                     96:                        name, host, addr);
                     97: diff --git a/daemon-parm.txt b/daemon-parm.txt
                     98: --- a/daemon-parm.txt
                     99: +++ b/daemon-parm.txt
                    100: @@ -29,6 +29,7 @@ STRING       hosts_deny              NULL
                    101:  STRING        include                 NULL
                    102:  STRING        include_from            NULL
                    103:  STRING        incoming_chmod          NULL
                    104: +STRING        link_by_hash_dir        NULL
                    105:  STRING        lock_file               DEFAULT_LOCK_FILE
                    106:  STRING        log_file                NULL
                    107:  STRING        log_format              "%o %h [%a] %m (%u) %f %l"
                    108: diff --git a/hashlink.c b/hashlink.c
                    109: new file mode 100644
                    110: --- /dev/null
                    111: +++ b/hashlink.c
                    112: @@ -0,0 +1,92 @@
                    113: +/*
                    114: +   Copyright (C) Cronosys, LLC 2004
                    115: +
                    116: +   This program is free software; you can redistribute it and/or modify
                    117: +   it under the terms of the GNU General Public License as published by
                    118: +   the Free Software Foundation; either version 2 of the License, or
                    119: +   (at your option) any later version.
                    120: +
                    121: +   This program is distributed in the hope that it will be useful,
                    122: +   but WITHOUT ANY WARRANTY; without even the implied warranty of
                    123: +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                    124: +   GNU General Public License for more details.
                    125: +
                    126: +   You should have received a copy of the GNU General Public License
                    127: +   along with this program; if not, write to the Free Software
                    128: +   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
                    129: +*/
                    130: +
                    131: +/* This file contains code used by the --link-by-hash option. */
                    132: +
                    133: +#include "rsync.h"
                    134: +#include "inums.h"
                    135: +
                    136: +extern int protocol_version;
                    137: +extern char *link_by_hash_dir;
                    138: +extern char sender_file_sum[MAX_DIGEST_LEN];
                    139: +
                    140: +char link_by_hash_extra_sum[MAX_DIGEST_LEN]; /* Only used when md4 sums are in the transfer */
                    141: +
                    142: +#ifdef HAVE_LINK
                    143: +
                    144: +/* This function is always called after a file is received, so the
                    145: + * sender_file_sum buffer has whatever the last checksum was for the
                    146: + * transferred file. */
                    147: +void link_by_hash(const char *fname, const char *fnametmp, struct file_struct *file)
                    148: +{
                    149: +      STRUCT_STAT st;
                    150: +      char *hashname, *last_slash, *num_str;
                    151: +      const char *hex;
                    152: +      int num = 0;
                    153: +
                    154: +      /* We don't bother to hard-link 0-length files. */
                    155: +      if (F_LENGTH(file) == 0)
                    156: +              return;
                    157: +
                    158: +      hex = sum_as_hex(5, protocol_version >= 30 ? sender_file_sum : link_by_hash_extra_sum, 0);
                    159: +      if (asprintf(&hashname, "%s/%.3s/%.3s/%.3s/%s.%s.000000",
                    160: +                   link_by_hash_dir, hex, hex+3, hex+6, hex+9, big_num(F_LENGTH(file))) < 0)
                    161: +      {
                    162: +              out_of_memory("make_hash_name");
                    163: +      }
                    164: +
                    165: +      last_slash = strrchr(hashname, '/');
                    166: +      num_str = strrchr(last_slash, '.') + 1;
                    167: +
                    168: +      while (1) {
                    169: +              if (num >= 999999) { /* Surely we'll never reach this... */
                    170: +                      if (DEBUG_GTE(HASHLINK, 1))
                    171: +                              rprintf(FINFO, "link-by-hash: giving up after \"%s\".\n", hashname);
                    172: +                      goto cleanup;
                    173: +              }
                    174: +              if (num > 0 && DEBUG_GTE(HASHLINK, 1))
                    175: +                      rprintf(FINFO, "link-by-hash: max link count exceeded, starting new file \"%s\".\n", hashname);
                    176: +
                    177: +              snprintf(num_str, 7, "%d", num++);
                    178: +              if (do_stat(hashname, &st) < 0)
                    179: +                      break;
                    180: +
                    181: +              if (do_link(hashname, fnametmp) < 0) {
                    182: +                      if (errno == EMLINK)
                    183: +                              continue;
                    184: +                      rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", hashname, full_fname(fname));
                    185: +              } else {
                    186: +                      if (DEBUG_GTE(HASHLINK, 2))
                    187: +                              rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n", hashname, full_fname(fname));
                    188: +                      robust_rename(fnametmp, fname, NULL, 0644);
                    189: +              }
                    190: +
                    191: +              goto cleanup;
                    192: +      }
                    193: +
                    194: +      if (DEBUG_GTE(HASHLINK, 2))
                    195: +              rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n", full_fname(fname), hashname);
                    196: +
                    197: +      if (do_link(fname, hashname) < 0
                    198: +       && (errno != ENOENT || make_path(hashname, MKP_DROP_NAME) < 0 || do_link(fname, hashname) < 0))
                    199: +              rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", full_fname(fname), hashname);
                    200: +
                    201: +  cleanup:
                    202: +      free(hashname);
                    203: +}
                    204: +#endif
                    205: diff --git a/options.c b/options.c
                    206: --- a/options.c
                    207: +++ b/options.c
                    208: @@ -164,6 +164,7 @@ char *backup_suffix = NULL;
                    209:  char *tmpdir = NULL;
                    210:  char *partial_dir = NULL;
                    211:  char *basis_dir[MAX_BASIS_DIRS+1];
                    212: +char *link_by_hash_dir = NULL;
                    213:  char *config_file = NULL;
                    214:  char *shell_cmd = NULL;
                    215:  char *logfile_name = NULL;
                    216: @@ -221,7 +222,7 @@ static const char *debug_verbosity[] = {
                    217:        /*2*/ "BIND,CMD,CONNECT,DEL,DELTASUM,DUP,FILTER,FLIST,ICONV",
                    218:        /*3*/ "ACL,BACKUP,CONNECT2,DELTASUM2,DEL2,EXIT,FILTER2,FLIST2,FUZZY,GENR,OWN,RECV,SEND,TIME",
                    219:        /*4*/ "CMD2,DELTASUM3,DEL3,EXIT2,FLIST3,ICONV2,OWN2,PROTO,TIME2",
                    220: -      /*5*/ "CHDIR,DELTASUM4,FLIST4,FUZZY2,HASH,HLINK",
                    221: +      /*5*/ "CHDIR,DELTASUM4,FLIST4,FUZZY2,HASH,HASHLINK,HLINK",
                    222:  };
                    223:  
                    224:  #define MAX_VERBOSITY ((int)(sizeof debug_verbosity / sizeof debug_verbosity[0]) - 1)
                    225: @@ -291,6 +292,7 @@ static struct output_struct debug_words[COUNT_DEBUG+1] = {
                    226:        DEBUG_WORD(FUZZY, W_REC, "Debug fuzzy scoring (levels 1-2)"),
                    227:        DEBUG_WORD(GENR, W_REC, "Debug generator functions"),
                    228:        DEBUG_WORD(HASH, W_SND|W_REC, "Debug hashtable code"),
                    229: +      DEBUG_WORD(HASHLINK, W_REC, "Debug hashlink code (levels 1-2)"),
                    230:        DEBUG_WORD(HLINK, W_SND|W_REC, "Debug hard-link actions (levels 1-3)"),
                    231:        DEBUG_WORD(ICONV, W_CLI|W_SRV, "Debug iconv character conversions (levels 1-2)"),
                    232:        DEBUG_WORD(IO, W_CLI|W_SRV, "Debug I/O routines (levels 1-4)"),
                    233: @@ -573,7 +575,7 @@ enum {OPT_SERVER = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
                    234:        OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
                    235:        OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
                    236:        OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG, OPT_BLOCK_SIZE,
                    237: -      OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN, OPT_BWLIMIT, OPT_STDERR,
                    238: +      OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN, OPT_BWLIMIT, OPT_STDERR, OPT_LINK_BY_HASH,
                    239:        OPT_OLD_COMPRESS, OPT_NEW_COMPRESS, OPT_NO_COMPRESS,
                    240:        OPT_STOP_AFTER, OPT_STOP_AT,
                    241:        OPT_REFUSED_BASE = 9000};
                    242: @@ -733,6 +735,7 @@ static struct poptOption long_options[] = {
                    243:    {"compare-dest",     0,  POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
                    244:    {"copy-dest",        0,  POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
                    245:    {"link-dest",        0,  POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
                    246: +  {"link-by-hash",     0,  POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
                    247:    {"fuzzy",           'y', POPT_ARG_NONE,   0, 'y', 0, 0 },
                    248:    {"no-fuzzy",         0,  POPT_ARG_VAL,    &fuzzy_basis, 0, 0, 0 },
                    249:    {"no-y",             0,  POPT_ARG_VAL,    &fuzzy_basis, 0, 0, 0 },
                    250: @@ -972,6 +975,9 @@ static void set_refuse_options(void)
                    251:                ref = cp + 1;
                    252:        }
                    253:  
                    254: +      if (*lp_link_by_hash_dir(module_id))
                    255: +              parse_one_refuse_match(0, "link-by-hash", list_end);
                    256: +
                    257:        if (am_daemon) {
                    258:  #ifdef ICONV_OPTION
                    259:                if (!*lp_charset(module_id))
                    260: @@ -1834,6 +1840,20 @@ int parse_arguments(int *argc_p, const char ***argv_p)
                    261:                        return 0;
                    262:  #endif
                    263:  
                    264: +                case OPT_LINK_BY_HASH:
                    265: +#ifdef HAVE_LINK
                    266: +                      arg = poptGetOptArg(pc);
                    267: +                      if (sanitize_paths)
                    268: +                              arg = sanitize_path(NULL, arg, NULL, 0, SP_DEFAULT);
                    269: +                      link_by_hash_dir = (char *)arg;
                    270: +                      break;
                    271: +#else
                    272: +                      snprintf(err_buf, sizeof err_buf,
                    273: +                               "hard links are not supported on this %s\n",
                    274: +                               am_server ? "server" : "client");
                    275: +                      return 0;
                    276: +#endif
                    277: +
                    278:                case OPT_STOP_AFTER: {
                    279:                        long val;
                    280:                        arg = poptGetOptArg(pc);
                    281: @@ -2186,6 +2206,8 @@ int parse_arguments(int *argc_p, const char ***argv_p)
                    282:                        tmpdir = sanitize_path(NULL, tmpdir, NULL, 0, SP_DEFAULT);
                    283:                if (backup_dir)
                    284:                        backup_dir = sanitize_path(NULL, backup_dir, NULL, 0, SP_DEFAULT);
                    285: +              if (link_by_hash_dir)
                    286: +                      link_by_hash_dir = sanitize_path(NULL, link_by_hash_dir, NULL, 0, SP_DEFAULT);
                    287:        }
                    288:        if (daemon_filter_list.head && !am_sender) {
                    289:                filter_rule_list *elp = &daemon_filter_list;
                    290: @@ -2870,6 +2892,12 @@ void server_options(char **args, int *argc_p)
                    291:        } else if (inplace)
                    292:                args[ac++] = "--inplace";
                    293:  
                    294: +      if (link_by_hash_dir && am_sender) {
                    295: +              args[ac++] = "--link-by-hash";
                    296: +              args[ac++] = link_by_hash_dir;
                    297: +              link_by_hash_dir = NULL; /* optimize sending-side checksums */
                    298: +      }
                    299: +
                    300:        if (files_from && (!am_sender || filesfrom_host)) {
                    301:                if (filesfrom_host) {
                    302:                        args[ac++] = "--files-from";
                    303: diff --git a/rsync.1.md b/rsync.1.md
                    304: --- a/rsync.1.md
                    305: +++ b/rsync.1.md
                    306: @@ -424,6 +424,7 @@ detailed description below for a complete description.
                    307:  --compare-dest=DIR       also compare destination files relative to DIR
                    308:  --copy-dest=DIR          ... and include copies of unchanged files
                    309:  --link-dest=DIR          hardlink to files in DIR when unchanged
                    310: +--link-by-hash=DIR       create hardlinks by hash into DIR
                    311:  --compress, -z           compress file data during the transfer
                    312:  --compress-choice=STR    choose the compression algorithm (aka --zc)
                    313:  --compress-level=NUM     explicitly set compression level (aka --zl)
                    314: @@ -2331,6 +2332,50 @@ your home directory (remove the '=' for that).
                    315:      specified (or implied by `-a`).  You can work-around this bug by avoiding
                    316:      the `-o` option when sending to an old rsync.
                    317:  
                    318: +0.  `--link-by-hash=DIR`
                    319: +
                    320: +    This option hard links the destination files into `DIR`, a link farm
                    321: +    arranged by MD5 file hash. The result is that the system will only store
                    322: +    (usually) one copy of the unique contents of each file, regardless of the
                    323: +    file's name (it will use extra files if the links overflow the available
                    324: +    maximum).
                    325: +
                    326: +    This patch does not take into account file permissions, extended
                    327: +    attributes, or ACLs when linking things together, so you should only use
                    328: +    this if you don't care about preserving those extra file attributes (or if
                    329: +    they are always the same for identical files).
                    330: +
                    331: +    The DIR is relative to the destination directory, so either specify a full
                    332: +    path to the hash hierarchy, or specify a relative path that puts the links
                    333: +    outside the destination (e.g. "../links").
                    334: +
                    335: +    Keep in mind that the hierarchy is never pruned, so if you need to reclaim
                    336: +    space, you should remove any files that have just one link (since they are
                    337: +    not linked into any destination dirs anymore):
                    338: +
                    339: +    >     find $DIR -links 1 -delete
                    340: +
                    341: +    The link farm's directory hierarchy is determined by the file's (32-char)
                    342: +    MD5 hash and the file-length.  The hash is split up into directory shards.
                    343: +    For example, if a file is 54321 bytes long, it could be stored like this:
                    344: +
                    345: +    >     $DIR/123/456/789/01234567890123456789012.54321.0
                    346: +
                    347: +    Note that the directory layout in this patch was modified for version
                    348: +    3.1.0, so anyone using an older version of this patch should move their
                    349: +    existing link hierarchy out of the way and then use the newer rsync to copy
                    350: +    the saved hierarchy into its new layout.  Assuming that no files have
                    351: +    overflowed their link limits, this would work:
                    352: +
                    353: +    >     mv $DIR $DIR.old
                    354: +    >     rsync -aiv --link-by-hash=$DIR $DIR.old/ $DIR.tmp/
                    355: +    >     rm -rf $DIR.tmp
                    356: +    >     rm -rf $DIR.old
                    357: +
                    358: +    If some of your files are at their link limit, you'd be better off using a
                    359: +    script to calculate the md5 sum of each file in the hierarchy and move it
                    360: +    to its new location.
                    361: +
                    362:  0.  `--compress`, `-z`
                    363:  
                    364:      With this option, rsync compresses the file data as it is sent to the
                    365: diff --git a/rsync.c b/rsync.c
                    366: --- a/rsync.c
                    367: +++ b/rsync.c
                    368: @@ -50,6 +50,7 @@ extern int flist_eof;
                    369:  extern int file_old_total;
                    370:  extern int keep_dirlinks;
                    371:  extern int make_backups;
                    372: +extern char *link_by_hash_dir;
                    373:  extern int sanitize_paths;
                    374:  extern struct file_list *cur_flist, *first_flist, *dir_flist;
                    375:  extern struct chmod_mode_struct *daemon_chmod_modes;
                    376: @@ -748,6 +749,10 @@ int finish_transfer(const char *fname, const char *fnametmp,
                    377:        }
                    378:        if (ret == 0) {
                    379:                /* The file was moved into place (not copied), so it's done. */
                    380: +#ifdef HAVE_LINK
                    381: +              if (link_by_hash_dir)
                    382: +                      link_by_hash(fname, fnametmp, file);
                    383: +#endif
                    384:                return 1;
                    385:        }
                    386:        /* The file was copied, so tweak the perms of the copied file.  If it
                    387: diff --git a/rsync.h b/rsync.h
                    388: --- a/rsync.h
                    389: +++ b/rsync.h
                    390: @@ -1428,7 +1428,8 @@ extern short info_levels[], debug_levels[];
                    391:  #define DEBUG_FUZZY (DEBUG_FLIST+1)
                    392:  #define DEBUG_GENR (DEBUG_FUZZY+1)
                    393:  #define DEBUG_HASH (DEBUG_GENR+1)
                    394: -#define DEBUG_HLINK (DEBUG_HASH+1)
                    395: +#define DEBUG_HASHLINK (DEBUG_HASH+1)
                    396: +#define DEBUG_HLINK (DEBUG_HASHLINK+1)
                    397:  #define DEBUG_ICONV (DEBUG_HLINK+1)
                    398:  #define DEBUG_IO (DEBUG_ICONV+1)
                    399:  #define DEBUG_NSTR (DEBUG_IO+1)
                    400: diff --git a/rsyncd.conf.5.md b/rsyncd.conf.5.md
                    401: --- a/rsyncd.conf.5.md
                    402: +++ b/rsyncd.conf.5.md
                    403: @@ -354,6 +354,23 @@ the values of parameters.  See the GLOBAL PARAMETERS section for more details.
                    404:      is 0, which means no limit.  A negative value disables the module.  See
                    405:      also the "lock file" parameter.
                    406:  
                    407: +0.  `link by hash dir`
                    408: +
                    409: +    When the "link by hash dir" parameter is set to a non-empty string,
                    410: +    received files will be hard linked into **DIR**, a link farm arranged by
                    411: +    MD5 file hash. See the `--link-by-hash` option for a full explanation.
                    412: +
                    413: +    The **DIR** must be accessible inside any chroot restrictions for the
                    414: +    module, but can exist outside the transfer location if there is an
                    415: +    inside-the-chroot path to the module (see "use chroot").  Note that a
                    416: +    user-specified option does not allow this outside-the-transfer-area
                    417: +    placement.
                    418: +
                    419: +    If this parameter is set, it will disable the `--link-by-hash` command-line
                    420: +    option for copies into the module.
                    421: +
                    422: +    The default is for this parameter to be unset.
                    423: +
                    424:  0.  `log file`
                    425:  
                    426:      When the "log file" parameter is set to a non-empty string, the rsync
                    427: diff -Nurp a/rsync.1 b/rsync.1
                    428: --- a/rsync.1
                    429: +++ b/rsync.1
                    430: @@ -500,6 +500,7 @@ detailed description below for a complet
                    431:  --compare-dest=DIR       also compare destination files relative to DIR
                    432:  --copy-dest=DIR          ... and include copies of unchanged files
                    433:  --link-dest=DIR          hardlink to files in DIR when unchanged
                    434: +--link-by-hash=DIR       create hardlinks by hash into DIR
                    435:  --compress, -z           compress file data during the transfer
                    436:  --compress-choice=STR    choose the compression algorithm (aka --zc)
                    437:  --compress-level=NUM     explicitly set compression level (aka --zl)
                    438: @@ -2372,6 +2373,60 @@ Note that rsync versions prior to 2.6.1
                    439:  \fB\-\-link-dest\fP from working properly for a non-super-user when \fB\-o\fP was
                    440:  specified (or implied by \fB\-a\fP).  You can work-around this bug by avoiding
                    441:  the \fB\-o\fP option when sending to an old rsync.
                    442: +.IP "\fB\-\-link-by-hash=DIR\fP"
                    443: +This option hard links the destination files into \fBDIR\fP, a link farm
                    444: +arranged by MD5 file hash. The result is that the system will only store
                    445: +(usually) one copy of the unique contents of each file, regardless of the
                    446: +file's name (it will use extra files if the links overflow the available
                    447: +maximum).
                    448: +.IP
                    449: +This patch does not take into account file permissions, extended
                    450: +attributes, or ACLs when linking things together, so you should only use
                    451: +this if you don't care about preserving those extra file attributes (or if
                    452: +they are always the same for identical files).
                    453: +.IP
                    454: +The DIR is relative to the destination directory, so either specify a full
                    455: +path to the hash hierarchy, or specify a relative path that puts the links
                    456: +outside the destination (e.g. "../links").
                    457: +.IP
                    458: +Keep in mind that the hierarchy is never pruned, so if you need to reclaim
                    459: +space, you should remove any files that have just one link (since they are
                    460: +not linked into any destination dirs anymore):
                    461: +.RS 4
                    462: +.IP
                    463: +.nf
                    464: +find $DIR -links 1 -delete
                    465: +.fi
                    466: +.RE
                    467: +.IP
                    468: +The link farm's directory hierarchy is determined by the file's (32-char)
                    469: +MD5 hash and the file-length.  The hash is split up into directory shards.
                    470: +For example, if a file is 54321 bytes long, it could be stored like this:
                    471: +.RS 4
                    472: +.IP
                    473: +.nf
                    474: +$DIR/123/456/789/01234567890123456789012.54321.0
                    475: +.fi
                    476: +.RE
                    477: +.IP
                    478: +Note that the directory layout in this patch was modified for version
                    479: +3.1.0, so anyone using an older version of this patch should move their
                    480: +existing link hierarchy out of the way and then use the newer rsync to copy
                    481: +the saved hierarchy into its new layout.  Assuming that no files have
                    482: +overflowed their link limits, this would work:
                    483: +.RS 4
                    484: +.IP
                    485: +.nf
                    486: +mv $DIR $DIR.old
                    487: +rsync -aiv --link-by-hash=$DIR $DIR.old/ $DIR.tmp/
                    488: +rm -rf $DIR.tmp
                    489: +rm -rf $DIR.old
                    490: +.fi
                    491: +.RE
                    492: +.IP
                    493: +If some of your files are at their link limit, you'd be better off using a
                    494: +script to calculate the md5 sum of each file in the hierarchy and move it
                    495: +to its new location.
                    496:  .IP "\fB\-\-compress\fP, \fB\-z\fP"
                    497:  With this option, rsync compresses the file data as it is sent to the
                    498:  destination machine, which reduces the amount of data being transmitted\ \-\-
                    499: diff -Nurp a/rsync.1.html b/rsync.1.html
                    500: --- a/rsync.1.html
                    501: +++ b/rsync.1.html
                    502: @@ -415,6 +415,7 @@ detailed description below for a complet
                    503:  --compare-dest=DIR       also compare destination files relative to DIR
                    504:  --copy-dest=DIR          ... and include copies of unchanged files
                    505:  --link-dest=DIR          hardlink to files in DIR when unchanged
                    506: +--link-by-hash=DIR       create hardlinks by hash into DIR
                    507:  --compress, -z           compress file data during the transfer
                    508:  --compress-choice=STR    choose the compression algorithm (aka --zc)
                    509:  --compress-level=NUM     explicitly set compression level (aka --zl)
                    510: @@ -2210,6 +2211,50 @@ specified (or implied by <code>-a</code>
                    511:  the <code>-o</code> option when sending to an old rsync.</p>
                    512:  </dd>
                    513:  
                    514: +<dt><code>--link-by-hash=DIR</code></dt><dd>
                    515: +<p>This option hard links the destination files into <code>DIR</code>, a link farm
                    516: +arranged by MD5 file hash. The result is that the system will only store
                    517: +(usually) one copy of the unique contents of each file, regardless of the
                    518: +file's name (it will use extra files if the links overflow the available
                    519: +maximum).</p>
                    520: +<p>This patch does not take into account file permissions, extended
                    521: +attributes, or ACLs when linking things together, so you should only use
                    522: +this if you don't care about preserving those extra file attributes (or if
                    523: +they are always the same for identical files).</p>
                    524: +<p>The DIR is relative to the destination directory, so either specify a full
                    525: +path to the hash hierarchy, or specify a relative path that puts the links
                    526: +outside the destination (e.g. &quot;../links&quot;).</p>
                    527: +<p>Keep in mind that the hierarchy is never pruned, so if you need to reclaim
                    528: +space, you should remove any files that have just one link (since they are
                    529: +not linked into any destination dirs anymore):</p>
                    530: +<blockquote>
                    531: +<pre><code>find $DIR -links 1 -delete
                    532: +</code></pre>
                    533: +</blockquote>
                    534: +<p>The link farm's directory hierarchy is determined by the file's (32-char)
                    535: +MD5 hash and the file-length.  The hash is split up into directory shards.
                    536: +For example, if a file is 54321 bytes long, it could be stored like this:</p>
                    537: +<blockquote>
                    538: +<pre><code>$DIR/123/456/789/01234567890123456789012.54321.0
                    539: +</code></pre>
                    540: +</blockquote>
                    541: +<p>Note that the directory layout in this patch was modified for version
                    542: +3.1.0, so anyone using an older version of this patch should move their
                    543: +existing link hierarchy out of the way and then use the newer rsync to copy
                    544: +the saved hierarchy into its new layout.  Assuming that no files have
                    545: +overflowed their link limits, this would work:</p>
                    546: +<blockquote>
                    547: +<pre><code>mv $DIR $DIR.old
                    548: +rsync -aiv --link-by-hash=$DIR $DIR.old/ $DIR.tmp/
                    549: +rm -rf $DIR.tmp
                    550: +rm -rf $DIR.old
                    551: +</code></pre>
                    552: +</blockquote>
                    553: +<p>If some of your files are at their link limit, you'd be better off using a
                    554: +script to calculate the md5 sum of each file in the hierarchy and move it
                    555: +to its new location.</p>
                    556: +</dd>
                    557: +
                    558:  <dt><code>--compress</code>, <code>-z</code></dt><dd>
                    559:  <p>With this option, rsync compresses the file data as it is sent to the
                    560:  destination machine, which reduces the amount of data being transmitted&nbsp;-&#8288;-&#8288;
                    561: diff -Nurp a/rsyncd.conf.5 b/rsyncd.conf.5
                    562: --- a/rsyncd.conf.5
                    563: +++ b/rsyncd.conf.5
                    564: @@ -335,6 +335,22 @@ connections you will allow.  Any clients
                    565:  been reached will receive a message telling them to try later.  The default
                    566:  is 0, which means no limit.  A negative value disables the module.  See
                    567:  also the "lock file" parameter.
                    568: +.IP "\fBlink\ by\ hash\ dir\fP"
                    569: +When the "link by hash dir" parameter is set to a non-empty string,
                    570: +received files will be hard linked into \fBDIR\fP, a link farm arranged by
                    571: +MD5 file hash. See the \fB\-\-link-by-hash\fP option for a full explanation.
                    572: +.IP
                    573: +The \fBDIR\fP must be accessible inside any chroot restrictions for the
                    574: +module, but can exist outside the transfer location if there is an
                    575: +inside-the-chroot path to the module (see "use chroot").  Note that a
                    576: +user-specified option does not allow this outside-the-transfer-area
                    577: +placement.
                    578: +.IP
                    579: +If this parameter is set, it will disable the \fB\-\-link-by-hash\fP command-line
                    580: +option for copies into the module.
                    581: +.P
                    582: +The default is for this parameter to be unset.
                    583: +.P
                    584:  .IP "\fBlog\ file\fP"
                    585:  When the "log file" parameter is set to a non-empty string, the rsync
                    586:  daemon will log messages to the indicated file rather than using syslog.
                    587: diff -Nurp a/rsyncd.conf.5.html b/rsyncd.conf.5.html
                    588: --- a/rsyncd.conf.5.html
                    589: +++ b/rsyncd.conf.5.html
                    590: @@ -342,6 +342,22 @@ is 0, which means no limit.  A negative
                    591:  also the &quot;lock file&quot; parameter.</p>
                    592:  </dd>
                    593:  
                    594: +<dt><code>link by hash dir</code></dt><dd>
                    595: +<p>When the &quot;link by hash dir&quot; parameter is set to a non-empty string,
                    596: +received files will be hard linked into <strong>DIR</strong>, a link farm arranged by
                    597: +MD5 file hash. See the <code>--link-by-hash</code> option for a full explanation.</p>
                    598: +<p>The <strong>DIR</strong> must be accessible inside any chroot restrictions for the
                    599: +module, but can exist outside the transfer location if there is an
                    600: +inside-the-chroot path to the module (see &quot;use chroot&quot;).  Note that a
                    601: +user-specified option does not allow this outside-the-transfer-area
                    602: +placement.</p>
                    603: +<p>If this parameter is set, it will disable the <code>--link-by-hash</code> command-line
                    604: +option for copies into the module.</p>
                    605: +</dd>
                    606: +</dl>
                    607: +<p>The default is for this parameter to be unset.</p>
                    608: +<dl>
                    609: +
                    610:  <dt><code>log file</code></dt><dd>
                    611:  <p>When the &quot;log file&quot; parameter is set to a non-empty string, the rsync
                    612:  daemon will log messages to the indicated file rather than using syslog.

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>