File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / rsync / patches / link-by-hash.diff
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Wed Mar 17 00:32:36 2021 UTC (4 years ago) by misho
Branches: rsync, MAIN
CVS tags: v3_2_3, HEAD
rsync 3.2.3

    1: Jason M. Felice wrote:
    2: 
    3: This patch adds the --link-by-hash=DIR option, which hard links received files
    4: in a link farm arranged by MD4 or MD5 file hash.  The result is that the system
    5: will only store one copy of the unique contents of each file, regardless of the
    6: file's name.
    7: 
    8: To use this patch, run these commands for a successful build:
    9: 
   10:     patch -p1 <patches/link-by-hash.diff
   11:     ./prepare-source
   12:     ./configure
   13:     make
   14: 
   15: based-on: e94bad1c156fc3910f24e2b3b71a81b0b0bdeb70
   16: diff --git a/Makefile.in b/Makefile.in
   17: --- a/Makefile.in
   18: +++ b/Makefile.in
   19: @@ -44,7 +44,7 @@ OBJS1=flist.o rsync.o generator.o receiver.o cleanup.o sender.o exclude.o \
   20:  	util.o util2.o main.o checksum.o match.o syscall.o log.o backup.o delete.o
   21:  OBJS2=options.o io.o compat.o hlink.o token.o uidlist.o socket.o hashtable.o \
   22:  	usage.o fileio.o batch.o clientname.o chmod.o acls.o xattrs.o
   23: -OBJS3=progress.o pipe.o @ASM@
   24: +OBJS3=progress.o pipe.o hashlink.o @ASM@
   25:  DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
   26:  popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
   27:  	popt/popthelp.o popt/poptparse.o
   28: diff --git a/checksum.c b/checksum.c
   29: --- a/checksum.c
   30: +++ b/checksum.c
   31: @@ -40,6 +40,8 @@ extern int whole_file;
   32:  extern int checksum_seed;
   33:  extern int protocol_version;
   34:  extern int proper_seed_order;
   35: +extern char *link_by_hash_dir;
   36: +extern char link_by_hash_extra_sum[MAX_DIGEST_LEN];
   37:  extern const char *checksum_choice;
   38:  
   39:  struct name_num_obj valid_checksums = {
   40: @@ -444,7 +446,7 @@ static union {
   41:  	MD4_CTX m4;
   42:  #endif
   43:  	MD5_CTX m5;
   44: -} ctx;
   45: +} ctx, ctx2;
   46:  #ifdef SUPPORT_XXHASH
   47:  static XXH64_state_t* xxh64_state;
   48:  #endif
   49: @@ -483,6 +485,8 @@ void sum_init(int csum_type, int seed)
   50:  #endif
   51:  	  case CSUM_MD5:
   52:  		MD5_Init(&ctx.m5);
   53: +		if (link_by_hash_dir)
   54: +			MD5_Init(&ctx2.m5);
   55:  		break;
   56:  	  case CSUM_MD4:
   57:  #ifdef USE_OPENSSL
   58: @@ -533,6 +537,8 @@ void sum_update(const char *p, int32 len)
   59:  #endif
   60:  	  case CSUM_MD5:
   61:  		MD5_Update(&ctx.m5, (uchar *)p, len);
   62: +		if (link_by_hash_dir)
   63: +			MD5_Update(&ctx2.m5, (uchar *)p, len);
   64:  		break;
   65:  	  case CSUM_MD4:
   66:  #ifdef USE_OPENSSL
   67: @@ -598,6 +604,8 @@ int sum_end(char *sum)
   68:  #endif
   69:  	  case CSUM_MD5:
   70:  		MD5_Final((uchar *)sum, &ctx.m5);
   71: +		if (link_by_hash_dir)
   72: +			MD5_Final((uchar *)link_by_hash_extra_sum, &ctx2.m5);
   73:  		break;
   74:  	  case CSUM_MD4:
   75:  #ifdef USE_OPENSSL
   76: diff --git a/clientserver.c b/clientserver.c
   77: --- a/clientserver.c
   78: +++ b/clientserver.c
   79: @@ -52,6 +52,7 @@ extern int logfile_format_has_i;
   80:  extern int logfile_format_has_o_or_i;
   81:  extern char *bind_address;
   82:  extern char *config_file;
   83: +extern char *link_by_hash_dir;
   84:  extern char *logfile_format;
   85:  extern char *files_from;
   86:  extern char *tmpdir;
   87: @@ -665,6 +666,9 @@ static int rsync_module(int f_in, int f_out, int i, const char *addr, const char
   88:  		return -1;
   89:  	}
   90:  
   91: +	if (*lp_link_by_hash_dir(i))
   92: +		link_by_hash_dir = lp_link_by_hash_dir(i);
   93: +
   94:  	if (am_daemon > 0) {
   95:  		rprintf(FLOG, "rsync allowed access on module %s from %s (%s)\n",
   96:  			name, host, addr);
   97: diff --git a/daemon-parm.txt b/daemon-parm.txt
   98: --- a/daemon-parm.txt
   99: +++ b/daemon-parm.txt
  100: @@ -29,6 +29,7 @@ STRING	hosts_deny		NULL
  101:  STRING	include			NULL
  102:  STRING	include_from		NULL
  103:  STRING	incoming_chmod		NULL
  104: +STRING	link_by_hash_dir	NULL
  105:  STRING	lock_file		DEFAULT_LOCK_FILE
  106:  STRING	log_file		NULL
  107:  STRING	log_format		"%o %h [%a] %m (%u) %f %l"
  108: diff --git a/hashlink.c b/hashlink.c
  109: new file mode 100644
  110: --- /dev/null
  111: +++ b/hashlink.c
  112: @@ -0,0 +1,92 @@
  113: +/*
  114: +   Copyright (C) Cronosys, LLC 2004
  115: +
  116: +   This program is free software; you can redistribute it and/or modify
  117: +   it under the terms of the GNU General Public License as published by
  118: +   the Free Software Foundation; either version 2 of the License, or
  119: +   (at your option) any later version.
  120: +
  121: +   This program is distributed in the hope that it will be useful,
  122: +   but WITHOUT ANY WARRANTY; without even the implied warranty of
  123: +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  124: +   GNU General Public License for more details.
  125: +
  126: +   You should have received a copy of the GNU General Public License
  127: +   along with this program; if not, write to the Free Software
  128: +   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  129: +*/
  130: +
  131: +/* This file contains code used by the --link-by-hash option. */
  132: +
  133: +#include "rsync.h"
  134: +#include "inums.h"
  135: +
  136: +extern int protocol_version;
  137: +extern char *link_by_hash_dir;
  138: +extern char sender_file_sum[MAX_DIGEST_LEN];
  139: +
  140: +char link_by_hash_extra_sum[MAX_DIGEST_LEN]; /* Only used when md4 sums are in the transfer */
  141: +
  142: +#ifdef HAVE_LINK
  143: +
  144: +/* This function is always called after a file is received, so the
  145: + * sender_file_sum buffer has whatever the last checksum was for the
  146: + * transferred file. */
  147: +void link_by_hash(const char *fname, const char *fnametmp, struct file_struct *file)
  148: +{
  149: +	STRUCT_STAT st;
  150: +	char *hashname, *last_slash, *num_str;
  151: +	const char *hex;
  152: +	int num = 0;
  153: +
  154: +	/* We don't bother to hard-link 0-length files. */
  155: +	if (F_LENGTH(file) == 0)
  156: +		return;
  157: +
  158: +	hex = sum_as_hex(5, protocol_version >= 30 ? sender_file_sum : link_by_hash_extra_sum, 0);
  159: +	if (asprintf(&hashname, "%s/%.3s/%.3s/%.3s/%s.%s.000000",
  160: +		     link_by_hash_dir, hex, hex+3, hex+6, hex+9, big_num(F_LENGTH(file))) < 0)
  161: +	{
  162: +		out_of_memory("make_hash_name");
  163: +	}
  164: +
  165: +	last_slash = strrchr(hashname, '/');
  166: +	num_str = strrchr(last_slash, '.') + 1;
  167: +
  168: +	while (1) {
  169: +		if (num >= 999999) { /* Surely we'll never reach this... */
  170: +			if (DEBUG_GTE(HASHLINK, 1))
  171: +				rprintf(FINFO, "link-by-hash: giving up after \"%s\".\n", hashname);
  172: +			goto cleanup;
  173: +		}
  174: +		if (num > 0 && DEBUG_GTE(HASHLINK, 1))
  175: +			rprintf(FINFO, "link-by-hash: max link count exceeded, starting new file \"%s\".\n", hashname);
  176: +
  177: +		snprintf(num_str, 7, "%d", num++);
  178: +		if (do_stat(hashname, &st) < 0)
  179: +			break;
  180: +
  181: +		if (do_link(hashname, fnametmp) < 0) {
  182: +			if (errno == EMLINK)
  183: +				continue;
  184: +			rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", hashname, full_fname(fname));
  185: +		} else {
  186: +			if (DEBUG_GTE(HASHLINK, 2))
  187: +				rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n", hashname, full_fname(fname));
  188: +			robust_rename(fnametmp, fname, NULL, 0644);
  189: +		}
  190: +
  191: +		goto cleanup;
  192: +	}
  193: +
  194: +	if (DEBUG_GTE(HASHLINK, 2))
  195: +		rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n", full_fname(fname), hashname);
  196: +
  197: +	if (do_link(fname, hashname) < 0
  198: +	 && (errno != ENOENT || make_path(hashname, MKP_DROP_NAME) < 0 || do_link(fname, hashname) < 0))
  199: +		rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", full_fname(fname), hashname);
  200: +
  201: +  cleanup:
  202: +	free(hashname);
  203: +}
  204: +#endif
  205: diff --git a/options.c b/options.c
  206: --- a/options.c
  207: +++ b/options.c
  208: @@ -164,6 +164,7 @@ char *backup_suffix = NULL;
  209:  char *tmpdir = NULL;
  210:  char *partial_dir = NULL;
  211:  char *basis_dir[MAX_BASIS_DIRS+1];
  212: +char *link_by_hash_dir = NULL;
  213:  char *config_file = NULL;
  214:  char *shell_cmd = NULL;
  215:  char *logfile_name = NULL;
  216: @@ -221,7 +222,7 @@ static const char *debug_verbosity[] = {
  217:  	/*2*/ "BIND,CMD,CONNECT,DEL,DELTASUM,DUP,FILTER,FLIST,ICONV",
  218:  	/*3*/ "ACL,BACKUP,CONNECT2,DELTASUM2,DEL2,EXIT,FILTER2,FLIST2,FUZZY,GENR,OWN,RECV,SEND,TIME",
  219:  	/*4*/ "CMD2,DELTASUM3,DEL3,EXIT2,FLIST3,ICONV2,OWN2,PROTO,TIME2",
  220: -	/*5*/ "CHDIR,DELTASUM4,FLIST4,FUZZY2,HASH,HLINK",
  221: +	/*5*/ "CHDIR,DELTASUM4,FLIST4,FUZZY2,HASH,HASHLINK,HLINK",
  222:  };
  223:  
  224:  #define MAX_VERBOSITY ((int)(sizeof debug_verbosity / sizeof debug_verbosity[0]) - 1)
  225: @@ -291,6 +292,7 @@ static struct output_struct debug_words[COUNT_DEBUG+1] = {
  226:  	DEBUG_WORD(FUZZY, W_REC, "Debug fuzzy scoring (levels 1-2)"),
  227:  	DEBUG_WORD(GENR, W_REC, "Debug generator functions"),
  228:  	DEBUG_WORD(HASH, W_SND|W_REC, "Debug hashtable code"),
  229: +	DEBUG_WORD(HASHLINK, W_REC, "Debug hashlink code (levels 1-2)"),
  230:  	DEBUG_WORD(HLINK, W_SND|W_REC, "Debug hard-link actions (levels 1-3)"),
  231:  	DEBUG_WORD(ICONV, W_CLI|W_SRV, "Debug iconv character conversions (levels 1-2)"),
  232:  	DEBUG_WORD(IO, W_CLI|W_SRV, "Debug I/O routines (levels 1-4)"),
  233: @@ -573,7 +575,7 @@ enum {OPT_SERVER = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
  234:        OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
  235:        OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
  236:        OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG, OPT_BLOCK_SIZE,
  237: -      OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN, OPT_BWLIMIT, OPT_STDERR,
  238: +      OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN, OPT_BWLIMIT, OPT_STDERR, OPT_LINK_BY_HASH,
  239:        OPT_OLD_COMPRESS, OPT_NEW_COMPRESS, OPT_NO_COMPRESS,
  240:        OPT_STOP_AFTER, OPT_STOP_AT,
  241:        OPT_REFUSED_BASE = 9000};
  242: @@ -733,6 +735,7 @@ static struct poptOption long_options[] = {
  243:    {"compare-dest",     0,  POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
  244:    {"copy-dest",        0,  POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
  245:    {"link-dest",        0,  POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
  246: +  {"link-by-hash",     0,  POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
  247:    {"fuzzy",           'y', POPT_ARG_NONE,   0, 'y', 0, 0 },
  248:    {"no-fuzzy",         0,  POPT_ARG_VAL,    &fuzzy_basis, 0, 0, 0 },
  249:    {"no-y",             0,  POPT_ARG_VAL,    &fuzzy_basis, 0, 0, 0 },
  250: @@ -972,6 +975,9 @@ static void set_refuse_options(void)
  251:  		ref = cp + 1;
  252:  	}
  253:  
  254: +	if (*lp_link_by_hash_dir(module_id))
  255: +		parse_one_refuse_match(0, "link-by-hash", list_end);
  256: +
  257:  	if (am_daemon) {
  258:  #ifdef ICONV_OPTION
  259:  		if (!*lp_charset(module_id))
  260: @@ -1834,6 +1840,20 @@ int parse_arguments(int *argc_p, const char ***argv_p)
  261:  			return 0;
  262:  #endif
  263:  
  264: +                case OPT_LINK_BY_HASH:
  265: +#ifdef HAVE_LINK
  266: +			arg = poptGetOptArg(pc);
  267: +			if (sanitize_paths)
  268: +				arg = sanitize_path(NULL, arg, NULL, 0, SP_DEFAULT);
  269: +			link_by_hash_dir = (char *)arg;
  270: +			break;
  271: +#else
  272: +			snprintf(err_buf, sizeof err_buf,
  273: +				 "hard links are not supported on this %s\n",
  274: +				 am_server ? "server" : "client");
  275: +			return 0;
  276: +#endif
  277: +
  278:  		case OPT_STOP_AFTER: {
  279:  			long val;
  280:  			arg = poptGetOptArg(pc);
  281: @@ -2186,6 +2206,8 @@ int parse_arguments(int *argc_p, const char ***argv_p)
  282:  			tmpdir = sanitize_path(NULL, tmpdir, NULL, 0, SP_DEFAULT);
  283:  		if (backup_dir)
  284:  			backup_dir = sanitize_path(NULL, backup_dir, NULL, 0, SP_DEFAULT);
  285: +		if (link_by_hash_dir)
  286: +			link_by_hash_dir = sanitize_path(NULL, link_by_hash_dir, NULL, 0, SP_DEFAULT);
  287:  	}
  288:  	if (daemon_filter_list.head && !am_sender) {
  289:  		filter_rule_list *elp = &daemon_filter_list;
  290: @@ -2870,6 +2892,12 @@ void server_options(char **args, int *argc_p)
  291:  	} else if (inplace)
  292:  		args[ac++] = "--inplace";
  293:  
  294: +	if (link_by_hash_dir && am_sender) {
  295: +		args[ac++] = "--link-by-hash";
  296: +		args[ac++] = link_by_hash_dir;
  297: +		link_by_hash_dir = NULL; /* optimize sending-side checksums */
  298: +	}
  299: +
  300:  	if (files_from && (!am_sender || filesfrom_host)) {
  301:  		if (filesfrom_host) {
  302:  			args[ac++] = "--files-from";
  303: diff --git a/rsync.1.md b/rsync.1.md
  304: --- a/rsync.1.md
  305: +++ b/rsync.1.md
  306: @@ -424,6 +424,7 @@ detailed description below for a complete description.
  307:  --compare-dest=DIR       also compare destination files relative to DIR
  308:  --copy-dest=DIR          ... and include copies of unchanged files
  309:  --link-dest=DIR          hardlink to files in DIR when unchanged
  310: +--link-by-hash=DIR       create hardlinks by hash into DIR
  311:  --compress, -z           compress file data during the transfer
  312:  --compress-choice=STR    choose the compression algorithm (aka --zc)
  313:  --compress-level=NUM     explicitly set compression level (aka --zl)
  314: @@ -2331,6 +2332,50 @@ your home directory (remove the '=' for that).
  315:      specified (or implied by `-a`).  You can work-around this bug by avoiding
  316:      the `-o` option when sending to an old rsync.
  317:  
  318: +0.  `--link-by-hash=DIR`
  319: +
  320: +    This option hard links the destination files into `DIR`, a link farm
  321: +    arranged by MD5 file hash. The result is that the system will only store
  322: +    (usually) one copy of the unique contents of each file, regardless of the
  323: +    file's name (it will use extra files if the links overflow the available
  324: +    maximum).
  325: +
  326: +    This patch does not take into account file permissions, extended
  327: +    attributes, or ACLs when linking things together, so you should only use
  328: +    this if you don't care about preserving those extra file attributes (or if
  329: +    they are always the same for identical files).
  330: +
  331: +    The DIR is relative to the destination directory, so either specify a full
  332: +    path to the hash hierarchy, or specify a relative path that puts the links
  333: +    outside the destination (e.g. "../links").
  334: +
  335: +    Keep in mind that the hierarchy is never pruned, so if you need to reclaim
  336: +    space, you should remove any files that have just one link (since they are
  337: +    not linked into any destination dirs anymore):
  338: +
  339: +    >     find $DIR -links 1 -delete
  340: +
  341: +    The link farm's directory hierarchy is determined by the file's (32-char)
  342: +    MD5 hash and the file-length.  The hash is split up into directory shards.
  343: +    For example, if a file is 54321 bytes long, it could be stored like this:
  344: +
  345: +    >     $DIR/123/456/789/01234567890123456789012.54321.0
  346: +
  347: +    Note that the directory layout in this patch was modified for version
  348: +    3.1.0, so anyone using an older version of this patch should move their
  349: +    existing link hierarchy out of the way and then use the newer rsync to copy
  350: +    the saved hierarchy into its new layout.  Assuming that no files have
  351: +    overflowed their link limits, this would work:
  352: +
  353: +    >     mv $DIR $DIR.old
  354: +    >     rsync -aiv --link-by-hash=$DIR $DIR.old/ $DIR.tmp/
  355: +    >     rm -rf $DIR.tmp
  356: +    >     rm -rf $DIR.old
  357: +
  358: +    If some of your files are at their link limit, you'd be better off using a
  359: +    script to calculate the md5 sum of each file in the hierarchy and move it
  360: +    to its new location.
  361: +
  362:  0.  `--compress`, `-z`
  363:  
  364:      With this option, rsync compresses the file data as it is sent to the
  365: diff --git a/rsync.c b/rsync.c
  366: --- a/rsync.c
  367: +++ b/rsync.c
  368: @@ -50,6 +50,7 @@ extern int flist_eof;
  369:  extern int file_old_total;
  370:  extern int keep_dirlinks;
  371:  extern int make_backups;
  372: +extern char *link_by_hash_dir;
  373:  extern int sanitize_paths;
  374:  extern struct file_list *cur_flist, *first_flist, *dir_flist;
  375:  extern struct chmod_mode_struct *daemon_chmod_modes;
  376: @@ -748,6 +749,10 @@ int finish_transfer(const char *fname, const char *fnametmp,
  377:  	}
  378:  	if (ret == 0) {
  379:  		/* The file was moved into place (not copied), so it's done. */
  380: +#ifdef HAVE_LINK
  381: +		if (link_by_hash_dir)
  382: +			link_by_hash(fname, fnametmp, file);
  383: +#endif
  384:  		return 1;
  385:  	}
  386:  	/* The file was copied, so tweak the perms of the copied file.  If it
  387: diff --git a/rsync.h b/rsync.h
  388: --- a/rsync.h
  389: +++ b/rsync.h
  390: @@ -1428,7 +1428,8 @@ extern short info_levels[], debug_levels[];
  391:  #define DEBUG_FUZZY (DEBUG_FLIST+1)
  392:  #define DEBUG_GENR (DEBUG_FUZZY+1)
  393:  #define DEBUG_HASH (DEBUG_GENR+1)
  394: -#define DEBUG_HLINK (DEBUG_HASH+1)
  395: +#define DEBUG_HASHLINK (DEBUG_HASH+1)
  396: +#define DEBUG_HLINK (DEBUG_HASHLINK+1)
  397:  #define DEBUG_ICONV (DEBUG_HLINK+1)
  398:  #define DEBUG_IO (DEBUG_ICONV+1)
  399:  #define DEBUG_NSTR (DEBUG_IO+1)
  400: diff --git a/rsyncd.conf.5.md b/rsyncd.conf.5.md
  401: --- a/rsyncd.conf.5.md
  402: +++ b/rsyncd.conf.5.md
  403: @@ -354,6 +354,23 @@ the values of parameters.  See the GLOBAL PARAMETERS section for more details.
  404:      is 0, which means no limit.  A negative value disables the module.  See
  405:      also the "lock file" parameter.
  406:  
  407: +0.  `link by hash dir`
  408: +
  409: +    When the "link by hash dir" parameter is set to a non-empty string,
  410: +    received files will be hard linked into **DIR**, a link farm arranged by
  411: +    MD5 file hash. See the `--link-by-hash` option for a full explanation.
  412: +
  413: +    The **DIR** must be accessible inside any chroot restrictions for the
  414: +    module, but can exist outside the transfer location if there is an
  415: +    inside-the-chroot path to the module (see "use chroot").  Note that a
  416: +    user-specified option does not allow this outside-the-transfer-area
  417: +    placement.
  418: +
  419: +    If this parameter is set, it will disable the `--link-by-hash` command-line
  420: +    option for copies into the module.
  421: +
  422: +    The default is for this parameter to be unset.
  423: +
  424:  0.  `log file`
  425:  
  426:      When the "log file" parameter is set to a non-empty string, the rsync
  427: diff -Nurp a/rsync.1 b/rsync.1
  428: --- a/rsync.1
  429: +++ b/rsync.1
  430: @@ -500,6 +500,7 @@ detailed description below for a complet
  431:  --compare-dest=DIR       also compare destination files relative to DIR
  432:  --copy-dest=DIR          ... and include copies of unchanged files
  433:  --link-dest=DIR          hardlink to files in DIR when unchanged
  434: +--link-by-hash=DIR       create hardlinks by hash into DIR
  435:  --compress, -z           compress file data during the transfer
  436:  --compress-choice=STR    choose the compression algorithm (aka --zc)
  437:  --compress-level=NUM     explicitly set compression level (aka --zl)
  438: @@ -2372,6 +2373,60 @@ Note that rsync versions prior to 2.6.1
  439:  \fB\-\-link-dest\fP from working properly for a non-super-user when \fB\-o\fP was
  440:  specified (or implied by \fB\-a\fP).  You can work-around this bug by avoiding
  441:  the \fB\-o\fP option when sending to an old rsync.
  442: +.IP "\fB\-\-link-by-hash=DIR\fP"
  443: +This option hard links the destination files into \fBDIR\fP, a link farm
  444: +arranged by MD5 file hash. The result is that the system will only store
  445: +(usually) one copy of the unique contents of each file, regardless of the
  446: +file's name (it will use extra files if the links overflow the available
  447: +maximum).
  448: +.IP
  449: +This patch does not take into account file permissions, extended
  450: +attributes, or ACLs when linking things together, so you should only use
  451: +this if you don't care about preserving those extra file attributes (or if
  452: +they are always the same for identical files).
  453: +.IP
  454: +The DIR is relative to the destination directory, so either specify a full
  455: +path to the hash hierarchy, or specify a relative path that puts the links
  456: +outside the destination (e.g. "../links").
  457: +.IP
  458: +Keep in mind that the hierarchy is never pruned, so if you need to reclaim
  459: +space, you should remove any files that have just one link (since they are
  460: +not linked into any destination dirs anymore):
  461: +.RS 4
  462: +.IP
  463: +.nf
  464: +find $DIR -links 1 -delete
  465: +.fi
  466: +.RE
  467: +.IP
  468: +The link farm's directory hierarchy is determined by the file's (32-char)
  469: +MD5 hash and the file-length.  The hash is split up into directory shards.
  470: +For example, if a file is 54321 bytes long, it could be stored like this:
  471: +.RS 4
  472: +.IP
  473: +.nf
  474: +$DIR/123/456/789/01234567890123456789012.54321.0
  475: +.fi
  476: +.RE
  477: +.IP
  478: +Note that the directory layout in this patch was modified for version
  479: +3.1.0, so anyone using an older version of this patch should move their
  480: +existing link hierarchy out of the way and then use the newer rsync to copy
  481: +the saved hierarchy into its new layout.  Assuming that no files have
  482: +overflowed their link limits, this would work:
  483: +.RS 4
  484: +.IP
  485: +.nf
  486: +mv $DIR $DIR.old
  487: +rsync -aiv --link-by-hash=$DIR $DIR.old/ $DIR.tmp/
  488: +rm -rf $DIR.tmp
  489: +rm -rf $DIR.old
  490: +.fi
  491: +.RE
  492: +.IP
  493: +If some of your files are at their link limit, you'd be better off using a
  494: +script to calculate the md5 sum of each file in the hierarchy and move it
  495: +to its new location.
  496:  .IP "\fB\-\-compress\fP, \fB\-z\fP"
  497:  With this option, rsync compresses the file data as it is sent to the
  498:  destination machine, which reduces the amount of data being transmitted\ \-\-
  499: diff -Nurp a/rsync.1.html b/rsync.1.html
  500: --- a/rsync.1.html
  501: +++ b/rsync.1.html
  502: @@ -415,6 +415,7 @@ detailed description below for a complet
  503:  --compare-dest=DIR       also compare destination files relative to DIR
  504:  --copy-dest=DIR          ... and include copies of unchanged files
  505:  --link-dest=DIR          hardlink to files in DIR when unchanged
  506: +--link-by-hash=DIR       create hardlinks by hash into DIR
  507:  --compress, -z           compress file data during the transfer
  508:  --compress-choice=STR    choose the compression algorithm (aka --zc)
  509:  --compress-level=NUM     explicitly set compression level (aka --zl)
  510: @@ -2210,6 +2211,50 @@ specified (or implied by <code>-a</code>
  511:  the <code>-o</code> option when sending to an old rsync.</p>
  512:  </dd>
  513:  
  514: +<dt><code>--link-by-hash=DIR</code></dt><dd>
  515: +<p>This option hard links the destination files into <code>DIR</code>, a link farm
  516: +arranged by MD5 file hash. The result is that the system will only store
  517: +(usually) one copy of the unique contents of each file, regardless of the
  518: +file's name (it will use extra files if the links overflow the available
  519: +maximum).</p>
  520: +<p>This patch does not take into account file permissions, extended
  521: +attributes, or ACLs when linking things together, so you should only use
  522: +this if you don't care about preserving those extra file attributes (or if
  523: +they are always the same for identical files).</p>
  524: +<p>The DIR is relative to the destination directory, so either specify a full
  525: +path to the hash hierarchy, or specify a relative path that puts the links
  526: +outside the destination (e.g. &quot;../links&quot;).</p>
  527: +<p>Keep in mind that the hierarchy is never pruned, so if you need to reclaim
  528: +space, you should remove any files that have just one link (since they are
  529: +not linked into any destination dirs anymore):</p>
  530: +<blockquote>
  531: +<pre><code>find $DIR -links 1 -delete
  532: +</code></pre>
  533: +</blockquote>
  534: +<p>The link farm's directory hierarchy is determined by the file's (32-char)
  535: +MD5 hash and the file-length.  The hash is split up into directory shards.
  536: +For example, if a file is 54321 bytes long, it could be stored like this:</p>
  537: +<blockquote>
  538: +<pre><code>$DIR/123/456/789/01234567890123456789012.54321.0
  539: +</code></pre>
  540: +</blockquote>
  541: +<p>Note that the directory layout in this patch was modified for version
  542: +3.1.0, so anyone using an older version of this patch should move their
  543: +existing link hierarchy out of the way and then use the newer rsync to copy
  544: +the saved hierarchy into its new layout.  Assuming that no files have
  545: +overflowed their link limits, this would work:</p>
  546: +<blockquote>
  547: +<pre><code>mv $DIR $DIR.old
  548: +rsync -aiv --link-by-hash=$DIR $DIR.old/ $DIR.tmp/
  549: +rm -rf $DIR.tmp
  550: +rm -rf $DIR.old
  551: +</code></pre>
  552: +</blockquote>
  553: +<p>If some of your files are at their link limit, you'd be better off using a
  554: +script to calculate the md5 sum of each file in the hierarchy and move it
  555: +to its new location.</p>
  556: +</dd>
  557: +
  558:  <dt><code>--compress</code>, <code>-z</code></dt><dd>
  559:  <p>With this option, rsync compresses the file data as it is sent to the
  560:  destination machine, which reduces the amount of data being transmitted&nbsp;-&#8288;-&#8288;
  561: diff -Nurp a/rsyncd.conf.5 b/rsyncd.conf.5
  562: --- a/rsyncd.conf.5
  563: +++ b/rsyncd.conf.5
  564: @@ -335,6 +335,22 @@ connections you will allow.  Any clients
  565:  been reached will receive a message telling them to try later.  The default
  566:  is 0, which means no limit.  A negative value disables the module.  See
  567:  also the "lock file" parameter.
  568: +.IP "\fBlink\ by\ hash\ dir\fP"
  569: +When the "link by hash dir" parameter is set to a non-empty string,
  570: +received files will be hard linked into \fBDIR\fP, a link farm arranged by
  571: +MD5 file hash. See the \fB\-\-link-by-hash\fP option for a full explanation.
  572: +.IP
  573: +The \fBDIR\fP must be accessible inside any chroot restrictions for the
  574: +module, but can exist outside the transfer location if there is an
  575: +inside-the-chroot path to the module (see "use chroot").  Note that a
  576: +user-specified option does not allow this outside-the-transfer-area
  577: +placement.
  578: +.IP
  579: +If this parameter is set, it will disable the \fB\-\-link-by-hash\fP command-line
  580: +option for copies into the module.
  581: +.P
  582: +The default is for this parameter to be unset.
  583: +.P
  584:  .IP "\fBlog\ file\fP"
  585:  When the "log file" parameter is set to a non-empty string, the rsync
  586:  daemon will log messages to the indicated file rather than using syslog.
  587: diff -Nurp a/rsyncd.conf.5.html b/rsyncd.conf.5.html
  588: --- a/rsyncd.conf.5.html
  589: +++ b/rsyncd.conf.5.html
  590: @@ -342,6 +342,22 @@ is 0, which means no limit.  A negative
  591:  also the &quot;lock file&quot; parameter.</p>
  592:  </dd>
  593:  
  594: +<dt><code>link by hash dir</code></dt><dd>
  595: +<p>When the &quot;link by hash dir&quot; parameter is set to a non-empty string,
  596: +received files will be hard linked into <strong>DIR</strong>, a link farm arranged by
  597: +MD5 file hash. See the <code>--link-by-hash</code> option for a full explanation.</p>
  598: +<p>The <strong>DIR</strong> must be accessible inside any chroot restrictions for the
  599: +module, but can exist outside the transfer location if there is an
  600: +inside-the-chroot path to the module (see &quot;use chroot&quot;).  Note that a
  601: +user-specified option does not allow this outside-the-transfer-area
  602: +placement.</p>
  603: +<p>If this parameter is set, it will disable the <code>--link-by-hash</code> command-line
  604: +option for copies into the module.</p>
  605: +</dd>
  606: +</dl>
  607: +<p>The default is for this parameter to be unset.</p>
  608: +<dl>
  609: +
  610:  <dt><code>log file</code></dt><dd>
  611:  <p>When the &quot;log file&quot; parameter is set to a non-empty string, the rsync
  612:  daemon will log messages to the indicated file rather than using syslog.

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>