File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / bird / proto / bgp / attrs.c
Revision 1.1.1.2 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Wed Mar 17 19:50:23 2021 UTC (3 years, 3 months ago) by misho
Branches: bird, MAIN
CVS tags: v1_6_8p3, HEAD
bird 1.6.8

/*
 *	BIRD -- BGP Attributes
 *
 *	(c) 2000 Martin Mares <mj@ucw.cz>
 *
 *	Can be freely distributed and used under the terms of the GNU GPL.
 */

#undef LOCAL_DEBUG

#include <stdlib.h>

#include "nest/bird.h"
#include "nest/iface.h"
#include "nest/protocol.h"
#include "nest/route.h"
#include "nest/attrs.h"
#include "conf/conf.h"
#include "lib/resource.h"
#include "lib/string.h"
#include "lib/unaligned.h"

#include "bgp.h"

/*
 *   UPDATE message error handling
 *
 * All checks from RFC 4271 6.3 are done as specified with these exceptions:
 *  - The semantic check of an IP address from NEXT_HOP attribute is missing.
 *  - Checks of some optional attribute values are missing.
 *  - Syntactic and semantic checks of NLRIs (done in DECODE_PREFIX())
 *    are probably inadequate.
 *
 * Loop detection based on AS_PATH causes updates to be withdrawn. RFC
 * 4271 does not explicitly specifiy the behavior in that case.
 *
 * Loop detection related to route reflection (based on ORIGINATOR_ID
 * and CLUSTER_LIST) causes updates to be withdrawn. RFC 4456 8
 * specifies that such updates should be ignored, but that is generally
 * a bad idea.
 *
 * Error checking of optional transitive attributes is done according to
 * draft-ietf-idr-optional-transitive-03, but errors are handled always
 * as withdraws.
 *
 * Unexpected AS_CONFED_* segments in AS_PATH are logged and removed,
 * but unknown segments cause a session drop with Malformed AS_PATH
 * error (see validate_path()). The behavior in such case is not
 * explicitly specified by RFC 4271. RFC 5065 specifies that
 * inconsistent AS_CONFED_* segments should cause a session drop, but
 * implementations that pass invalid AS_CONFED_* segments are
 * widespread.
 *
 * Error handling of AS4_* attributes is done as specified by RFC 6793. There
 * are several possible inconsistencies between AGGREGATOR and AS4_AGGREGATOR
 * that are not handled by that RFC, these are logged and ignored (see
 * bgp_reconstruct_4b_attrs()).
 */


static byte bgp_mandatory_attrs[] = { BA_ORIGIN, BA_AS_PATH
#ifndef IPV6
,BA_NEXT_HOP
#endif
};

struct attr_desc {
  char *name;
  int expected_length;
  int expected_flags;
  int type;
  int allow_in_ebgp;
  int (*validate)(struct bgp_proto *p, byte *attr, int len);
  void (*format)(eattr *ea, byte *buf, int buflen);
};

#define IGNORE -1
#define WITHDRAW -2

static int
bgp_check_origin(struct bgp_proto *p UNUSED, byte *a, int len UNUSED)
{
  if (*a > 2)
    return 6;
  return 0;
}

static void
bgp_format_origin(eattr *a, byte *buf, int buflen UNUSED)
{
  static char *bgp_origin_names[] = { "IGP", "EGP", "Incomplete" };

  bsprintf(buf, bgp_origin_names[a->u.data]);
}

static int
path_segment_contains(byte *p, int bs, u32 asn)
{
  int i;
  int len = p[1];
  p += 2;

  for(i=0; i<len; i++)
    {
      u32 asn2 = (bs == 4) ? get_u32(p) : get_u16(p);
      if (asn2 == asn)
	return 1;
      p += bs;
    }

  return 0;
}

/* Validates path attribute, removes AS_CONFED_* segments, and also returns path length */
static int
validate_path(struct bgp_proto *p, int as_path, int bs, byte *idata, uint *ilength)
{
  int res = 0;
  u8 *a, *dst;
  int len, plen;

  dst = a = idata;
  len = *ilength;

  while (len)
    {
      if (len < 2)
	return -1;

      plen = 2 + bs * a[1];
      if (len < plen)
	return -1;

      if (a[1] == 0)
        {
	  log(L_WARN "%s: %s_PATH attribute contains empty segment, skipping it",
	      p->p.name, as_path ? "AS" : "AS4");
	  goto skip;
	}

      switch (a[0])
	{
	case AS_PATH_SET:
	  res++;
	  break;

	case AS_PATH_SEQUENCE:
	  res += a[1];
	  break;

	case AS_PATH_CONFED_SEQUENCE:
	case AS_PATH_CONFED_SET:
	  if (as_path && path_segment_contains(a, bs, p->remote_as))
	    {
	      log(L_WARN "%s: AS_CONFED_* segment with peer ASN found, misconfigured confederation?", p->p.name);
	      return -1;
	    }

	  log(L_WARN "%s: %s_PATH attribute contains AS_CONFED_* segment, skipping segment",
	      p->p.name, as_path ? "AS" : "AS4");
	  goto skip;

	default:
	  return -1;
	}

      if (dst != a)
	memmove(dst, a, plen);
      dst += plen;

    skip:
      len -= plen;
      a += plen;
    }

  *ilength = dst - idata;
  return res;
}

static inline int
validate_as_path(struct bgp_proto *p, byte *a, int *len)
{
  return validate_path(p, 1, p->as4_session ? 4 : 2, a, len);
}

static inline int
validate_as4_path(struct bgp_proto *p, struct adata *path)
{
  return validate_path(p, 0, 4, path->data, &path->length);
}

static int
bgp_check_next_hop(struct bgp_proto *p UNUSED, byte *a UNUSED6, int len UNUSED6)
{
#ifdef IPV6
  return IGNORE;
#else
  ip_addr addr;

  memcpy(&addr, a, len);
  ipa_ntoh(addr);
  if (ipa_classify(addr) & IADDR_HOST)
    return 0;
  else
    return 8;
#endif
}

static void
bgp_format_next_hop(eattr *a, byte *buf, int buflen UNUSED)
{
  ip_addr *ipp = (ip_addr *) a->u.ptr->data;
#ifdef IPV6
  /* in IPv6, we might have two addresses in NEXT HOP */
  if ((a->u.ptr->length == NEXT_HOP_LENGTH) && ipa_nonzero(ipp[1]))
    {
      bsprintf(buf, "%I %I", ipp[0], ipp[1]);
      return;
    }
#endif

  bsprintf(buf, "%I", ipp[0]);
}

static int
bgp_check_aggregator(struct bgp_proto *p, byte *a UNUSED, int len)
{
  int exp_len = p->as4_session ? 8 : 6;
  
  return (len == exp_len) ? 0 : WITHDRAW;
}

static void
bgp_format_aggregator(eattr *a, byte *buf, int buflen UNUSED)
{
  struct adata *ad =  a->u.ptr;
  byte *data = ad->data;
  u32 as;

  as = get_u32(data);
  data += 4;

  bsprintf(buf, "%d.%d.%d.%d AS%u", data[0], data[1], data[2], data[3], as);
}

static int
bgp_check_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
{
  return ((len % 4) == 0) ? 0 : WITHDRAW;
}

static int
bgp_check_cluster_list(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
{
  return ((len % 4) == 0) ? 0 : 5;
}

static void
bgp_format_cluster_list(eattr *a, byte *buf, int buflen)
{
  /* Truncates cluster lists larger than buflen, probably not a problem */
  int_set_format(a->u.ptr, 0, -1, buf, buflen);
}

static int
bgp_check_reach_nlri(struct bgp_proto *p UNUSED, byte *a UNUSED, int len UNUSED)
{
#ifdef IPV6
  p->mp_reach_start = a;
  p->mp_reach_len = len;
#endif
  return IGNORE;
}

static int
bgp_check_unreach_nlri(struct bgp_proto *p UNUSED, byte *a UNUSED, int len UNUSED)
{
#ifdef IPV6
  p->mp_unreach_start = a;
  p->mp_unreach_len = len;
#endif
  return IGNORE;
}

static int
bgp_check_ext_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
{
  return ((len % 8) == 0) ? 0 : WITHDRAW;
}

static int
bgp_check_large_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
{
  return ((len % 12) == 0) ? 0 : WITHDRAW;
}


static struct attr_desc bgp_attr_table[] = {
  { NULL, -1, 0, 0, 0,								/* Undefined */
    NULL, NULL },
  { "origin", 1, BAF_TRANSITIVE, EAF_TYPE_INT, 1,				/* BA_ORIGIN */
    bgp_check_origin, bgp_format_origin },
  { "as_path", -1, BAF_TRANSITIVE, EAF_TYPE_AS_PATH, 1,				/* BA_AS_PATH */
    NULL, NULL }, /* is checked by validate_as_path() as a special case */
  { "next_hop", 4, BAF_TRANSITIVE, EAF_TYPE_IP_ADDRESS, 1,			/* BA_NEXT_HOP */
    bgp_check_next_hop, bgp_format_next_hop },
  { "med", 4, BAF_OPTIONAL, EAF_TYPE_INT, 1,					/* BA_MULTI_EXIT_DISC */
    NULL, NULL },
  { "local_pref", 4, BAF_TRANSITIVE, EAF_TYPE_INT, 1,				/* BA_LOCAL_PREF */
    NULL, NULL },
  { "atomic_aggr", 0, BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1,			/* BA_ATOMIC_AGGR */
    NULL, NULL },
  { "aggregator", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1,	/* BA_AGGREGATOR */
    bgp_check_aggregator, bgp_format_aggregator },
  { "community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_INT_SET, 1,	/* BA_COMMUNITY */
    bgp_check_community, NULL },
  { "originator_id", 4, BAF_OPTIONAL, EAF_TYPE_ROUTER_ID, 0,			/* BA_ORIGINATOR_ID */
    NULL, NULL },
  { "cluster_list", -1, BAF_OPTIONAL, EAF_TYPE_INT_SET, 0,			/* BA_CLUSTER_LIST */
    bgp_check_cluster_list, bgp_format_cluster_list }, 
  { .name = NULL },								/* BA_DPA */
  { .name = NULL },								/* BA_ADVERTISER */
  { .name = NULL },								/* BA_RCID_PATH */
  { "mp_reach_nlri", -1, BAF_OPTIONAL, EAF_TYPE_OPAQUE, 1,			/* BA_MP_REACH_NLRI */
    bgp_check_reach_nlri, NULL },
  { "mp_unreach_nlri", -1, BAF_OPTIONAL, EAF_TYPE_OPAQUE, 1,			/* BA_MP_UNREACH_NLRI */
    bgp_check_unreach_nlri, NULL },
  { "ext_community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_EC_SET, 1,	/* BA_EXT_COMMUNITY */
    bgp_check_ext_community, NULL },
  { "as4_path", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1,		/* BA_AS4_PATH */
    NULL, NULL },
  { "as4_aggregator", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1,	/* BA_AS4_PATH */
    NULL, NULL },
  [BA_LARGE_COMMUNITY] =
  { "large_community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_LC_SET, 1,
    bgp_check_large_community, NULL }
};

/* BA_AS4_PATH is type EAF_TYPE_OPAQUE and not type EAF_TYPE_AS_PATH.
 * It does not matter as this attribute does not appear on routes in the routing table.
 */

#define ATTR_KNOWN(code) ((code) < ARRAY_SIZE(bgp_attr_table) && bgp_attr_table[code].name)

static inline struct adata *
bgp_alloc_adata(struct linpool *pool, unsigned len)
{
  struct adata *ad = lp_alloc(pool, sizeof(struct adata) + len);
  ad->length = len;
  return ad;
}

static void
bgp_set_attr(eattr *e, unsigned attr, uintptr_t val)
{
  ASSERT(ATTR_KNOWN(attr));
  e->id = EA_CODE(EAP_BGP, attr);
  e->type = bgp_attr_table[attr].type;
  e->flags = bgp_attr_table[attr].expected_flags;
  if (e->type & EAF_EMBEDDED)
    e->u.data = val;
  else
    e->u.ptr = (struct adata *) val;
}

static byte *
bgp_set_attr_wa(eattr *e, struct linpool *pool, unsigned attr, unsigned len)
{
  struct adata *ad = bgp_alloc_adata(pool, len);
  bgp_set_attr(e, attr, (uintptr_t) ad);
  return ad->data;
}

void
bgp_attach_attr(ea_list **to, struct linpool *pool, unsigned attr, uintptr_t val)
{
  ea_list *a = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr));
  a->next = *to;
  *to = a;
  a->flags = EALF_SORTED;
  a->count = 1;
  bgp_set_attr(a->attrs, attr, val);
}

byte *
bgp_attach_attr_wa(ea_list **to, struct linpool *pool, unsigned attr, unsigned len)
{
  struct adata *ad = bgp_alloc_adata(pool, len);
  bgp_attach_attr(to, pool, attr, (uintptr_t) ad);
  return ad->data;
}

static int
bgp_encode_attr_hdr(byte *dst, uint flags, unsigned code, int len)
{
  int wlen;

  DBG("\tAttribute %02x (%d bytes, flags %02x)\n", code, len, flags);

  if (len < 256)
    {
      *dst++ = flags;
      *dst++ = code;
      *dst++ = len;
      wlen = 3;
    }
  else
    {
      *dst++ = flags | BAF_EXT_LEN;
      *dst++ = code;
      put_u16(dst, len);
      wlen = 4;
    }

  return wlen;
}

static void
aggregator_convert_to_old(struct adata *aggr, byte *dst, int *new_used)
{
  byte *src = aggr->data;
  *new_used = 0;

  u32 as = get_u32(src);
  if (as > 0xFFFF) 
    {
      as = AS_TRANS;
      *new_used = 1;
    }
  put_u16(dst, as);

  /* Copy IPv4 address */
  memcpy(dst + 2, src + 4, 4);
}

static void
aggregator_convert_to_new(struct adata *aggr, byte *dst)
{
  byte *src = aggr->data;

  u32 as   = get_u16(src);
  put_u32(dst, as);

  /* Copy IPv4 address */
  memcpy(dst + 4, src + 2, 4);
}

static int
bgp_get_attr_len(eattr *a)
{
  int len;
  if (ATTR_KNOWN(EA_ID(a->id)))
    {
      int code = EA_ID(a->id);
      struct attr_desc *desc = &bgp_attr_table[code];
      len = desc->expected_length;
      if (len < 0)
	{
	  ASSERT(!(a->type & EAF_EMBEDDED));
	  len = a->u.ptr->length;
	}
    }
  else
    {
      ASSERT((a->type & EAF_TYPE_MASK) == EAF_TYPE_OPAQUE);
      len = a->u.ptr->length;
    }
  
  return len;
}

/**
 * bgp_encode_attrs - encode BGP attributes
 * @p: BGP instance (or NULL)
 * @w: buffer
 * @attrs: a list of extended attributes
 * @remains: remaining space in the buffer
 *
 * The bgp_encode_attrs() function takes a list of extended attributes
 * and converts it to its BGP representation (a part of an Update message).
 *
 * Result: Length of the attribute block generated or -1 if not enough space.
 */
uint
bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains)
{
  uint i, code, type, flags;
  int as4_session = p ? p->as4_session : 1;
  byte *start = w;
  int len, rv;

  for(i=0; i<attrs->count; i++)
    {
      eattr *a = &attrs->attrs[i];
      ASSERT(EA_PROTO(a->id) == EAP_BGP);
      code = EA_ID(a->id);

#ifdef IPV6
      /* When talking multiprotocol BGP, the NEXT_HOP attributes are used only temporarily. */
      if (code == BA_NEXT_HOP)
	continue;
#endif

      /* When AS4-aware BGP speaker is talking to non-AS4-aware BGP speaker,
       * we have to convert our 4B AS_PATH to 2B AS_PATH and send our AS_PATH 
       * as optional AS4_PATH attribute.
       */
      if ((code == BA_AS_PATH) && !as4_session)
	{
	  len = a->u.ptr->length;

	  if (remains < (len + 4))
	    goto err_no_buffer;

	  /* Using temporary buffer because don't know a length of created attr
	   * and therefore a length of a header. Perhaps i should better always
	   * use BAF_EXT_LEN. */
	  
	  byte buf[len];
	  int new_used;
	  int nl = as_path_convert_to_old(a->u.ptr, buf, &new_used);

	  DBG("BGP: Encoding old AS_PATH\n");
	  rv = bgp_encode_attr_hdr(w, BAF_TRANSITIVE, BA_AS_PATH, nl);
	  ADVANCE(w, remains, rv);
	  memcpy(w, buf, nl);
	  ADVANCE(w, remains, nl);

	  if (! new_used)
	    continue;

	  if (remains < (len + 4))
	    goto err_no_buffer;

	  /* We should discard AS_CONFED_SEQUENCE or AS_CONFED_SET path segments 
	   * here but we don't support confederations and such paths we already
	   * discarded in bgp_check_as_path().
	   */

	  DBG("BGP: Encoding AS4_PATH\n");
	  rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AS4_PATH, len);
	  ADVANCE(w, remains, rv);
	  memcpy(w, a->u.ptr->data, len);
	  ADVANCE(w, remains, len);

	  continue;
	}

      /* The same issue with AGGREGATOR attribute */
      if ((code == BA_AGGREGATOR) && !as4_session)
	{
	  int new_used;

	  len = 6;
	  if (remains < (len + 3))
	    goto err_no_buffer;

	  rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AGGREGATOR, len);
	  ADVANCE(w, remains, rv);
	  aggregator_convert_to_old(a->u.ptr, w, &new_used);
	  ADVANCE(w, remains, len);

	  if (! new_used)
	    continue;

	  len = 8;
	  if (remains < (len + 3))
	    goto err_no_buffer;

	  rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AS4_AGGREGATOR, len);
	  ADVANCE(w, remains, rv);
	  memcpy(w, a->u.ptr->data, len);
	  ADVANCE(w, remains, len);

	  continue;
	}

      /* Standard path continues here ... */

      type = a->type & EAF_TYPE_MASK;
      flags = a->flags & (BAF_OPTIONAL | BAF_TRANSITIVE | BAF_PARTIAL);
      len = bgp_get_attr_len(a);

      /* Skip empty sets */ 
      if (((type == EAF_TYPE_INT_SET) || (type == EAF_TYPE_EC_SET) || (type == EAF_TYPE_LC_SET)) && (len == 0))
	continue; 

      if (remains < len + 4)
	goto err_no_buffer;

      rv = bgp_encode_attr_hdr(w, flags, code, len);
      ADVANCE(w, remains, rv);

      switch (type)
	{
	case EAF_TYPE_INT:
	case EAF_TYPE_ROUTER_ID:
	  if (len == 4)
	    put_u32(w, a->u.data);
	  else
	    *w = a->u.data;
	  break;
	case EAF_TYPE_IP_ADDRESS:
	  {
	    ip_addr ip = *(ip_addr *)a->u.ptr->data;
	    ipa_hton(ip);
	    memcpy(w, &ip, len);
	    break;
	  }
	case EAF_TYPE_INT_SET:
	case EAF_TYPE_LC_SET:
	case EAF_TYPE_EC_SET:
	  {
	    u32 *z = int_set_get_data(a->u.ptr);
	    int i;
	    for(i=0; i<len; i+=4)
	      put_u32(w+i, *z++);
	    break;
	  }
	case EAF_TYPE_OPAQUE:
	case EAF_TYPE_AS_PATH:
	  memcpy(w, a->u.ptr->data, len);
	  break;
	default:
	  bug("bgp_encode_attrs: unknown attribute type %02x", a->type);
	}
      ADVANCE(w, remains, len);
    }
  return w - start;

 err_no_buffer:
  return -1;
}

/*
static void
bgp_init_prefix(struct fib_node *N)
{
  struct bgp_prefix *p = (struct bgp_prefix *) N;
  p->bucket_node.next = NULL;
}
*/

static int
bgp_compare_u32(const u32 *x, const u32 *y)
{
  return (*x < *y) ? -1 : (*x > *y) ? 1 : 0;
}

static inline void
bgp_normalize_int_set(u32 *dest, u32 *src, unsigned cnt)
{
  memcpy(dest, src, sizeof(u32) * cnt);
  qsort(dest, cnt, sizeof(u32), (int(*)(const void *, const void *)) bgp_compare_u32);
}

static int
bgp_compare_ec(const u32 *xp, const u32 *yp)
{
  u64 x = ec_get(xp, 0);
  u64 y = ec_get(yp, 0);
  return (x < y) ? -1 : (x > y) ? 1 : 0;
}

static inline void
bgp_normalize_ec_set(struct adata *ad, u32 *src, int internal)
{
  u32 *dst = int_set_get_data(ad);

  /* Remove non-transitive communities (EC_TBIT active) on external sessions */
  if (! internal)
    {
      int len = int_set_get_size(ad);
      u32 *t = dst;
      int i;

      for (i=0; i < len; i += 2)
	{
	  if (src[i] & EC_TBIT)
	    continue;
	  
	  *t++ = src[i];
	  *t++ = src[i+1];
	}

      ad->length = (t - dst) * 4;
    }
  else
    memcpy(dst, src, ad->length);

  qsort(dst, ad->length / 8, 8, (int(*)(const void *, const void *)) bgp_compare_ec);
}

static int
bgp_compare_lc(const u32 *x, const u32 *y)
{
  if (x[0] != y[0])
    return (x[0] > y[0]) ? 1 : -1;
  if (x[1] != y[1])
    return (x[1] > y[1]) ? 1 : -1;
  if (x[2] != y[2])
    return (x[2] > y[2]) ? 1 : -1;
  return 0;
}

static inline void
bgp_normalize_lc_set(u32 *dest, u32 *src, unsigned cnt)
{
  memcpy(dest, src, LCOMM_LENGTH * cnt);
  qsort(dest, cnt, LCOMM_LENGTH, (int(*)(const void *, const void *)) bgp_compare_lc);
}

static void
bgp_rehash_buckets(struct bgp_proto *p)
{
  struct bgp_bucket **old = p->bucket_hash;
  struct bgp_bucket **new;
  unsigned oldn = p->hash_size;
  unsigned i, e, mask;
  struct bgp_bucket *b;

  p->hash_size = p->hash_limit;
  DBG("BGP: Rehashing bucket table from %d to %d\n", oldn, p->hash_size);
  p->hash_limit *= 4;
  if (p->hash_limit >= 65536)
    p->hash_limit = ~0;
  new = p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *));
  mask = p->hash_size - 1;
  for (i=0; i<oldn; i++)
    while (b = old[i])
      {
	old[i] = b->hash_next;
	e = b->hash & mask;
	b->hash_next = new[e];
	if (b->hash_next)
	  b->hash_next->hash_prev = b;
	b->hash_prev = NULL;
	new[e] = b;
      }
  mb_free(old);
}

static struct bgp_bucket *
bgp_new_bucket(struct bgp_proto *p, ea_list *new, unsigned hash)
{
  struct bgp_bucket *b;
  unsigned ea_size = sizeof(ea_list) + new->count * sizeof(eattr);
  unsigned ea_size_aligned = BIRD_ALIGN(ea_size, CPU_STRUCT_ALIGN);
  unsigned size = sizeof(struct bgp_bucket) + ea_size_aligned;
  unsigned i;
  byte *dest;
  unsigned index = hash & (p->hash_size - 1);

  /* Gather total size of non-inline attributes */
  for (i=0; i<new->count; i++)
    {
      eattr *a = &new->attrs[i];
      if (!(a->type & EAF_EMBEDDED))
	size += BIRD_ALIGN(sizeof(struct adata) + a->u.ptr->length, CPU_STRUCT_ALIGN);
    }

  /* Create the bucket and hash it */
  b = mb_alloc(p->p.pool, size);
  b->hash_next = p->bucket_hash[index];
  if (b->hash_next)
    b->hash_next->hash_prev = b;
  p->bucket_hash[index] = b;
  b->hash_prev = NULL;
  b->hash = hash;
  add_tail(&p->bucket_queue, &b->send_node);
  init_list(&b->prefixes);
  memcpy(b->eattrs, new, ea_size);
  dest = ((byte *)b->eattrs) + ea_size_aligned;

  /* Copy values of non-inline attributes */
  for (i=0; i<new->count; i++)
    {
      eattr *a = &b->eattrs->attrs[i];
      if (!(a->type & EAF_EMBEDDED))
	{
	  struct adata *oa = a->u.ptr;
	  struct adata *na = (struct adata *) dest;
	  memcpy(na, oa, sizeof(struct adata) + oa->length);
	  a->u.ptr = na;
	  dest += BIRD_ALIGN(sizeof(struct adata) + na->length, CPU_STRUCT_ALIGN);
	}
    }

  /* If needed, rehash */
  p->hash_count++;
  if (p->hash_count > p->hash_limit)
    bgp_rehash_buckets(p);

  return b;
}

static struct bgp_bucket *
bgp_get_bucket(struct bgp_proto *p, net *n, ea_list *attrs, int originate)
{
  ea_list *new;
  unsigned i, cnt, hash, code;
  eattr *a, *d;
  u32 seen = 0;
  struct bgp_bucket *b;

  /* Merge the attribute list */
  new = alloca(ea_scan(attrs));
  ea_merge(attrs, new);
  ea_sort(new);

  /* Normalize attributes */
  d = new->attrs;
  cnt = new->count;
  new->count = 0;
  for(i=0; i<cnt; i++)
    {
      a = &new->attrs[i];
      if (EA_PROTO(a->id) != EAP_BGP)
	continue;
      code = EA_ID(a->id);
      if (ATTR_KNOWN(code))
	{
	  if (!p->is_internal)
	    {
	      if (!bgp_attr_table[code].allow_in_ebgp)
		continue;
	      if ((code == BA_LOCAL_PREF) && !p->cf->allow_local_pref)
		continue;
	    }
	  /* The flags might have been zero if the attr was added by filters */
	  a->flags = (a->flags & BAF_PARTIAL) | bgp_attr_table[code].expected_flags;
	  if (code < 32)
	    seen |= 1 << code;
	}
      else
	{
	  /* Don't re-export unknown non-transitive attributes */
	  if (!(a->flags & BAF_TRANSITIVE))
	    continue;
	}
      *d = *a;
      if ((d->type & EAF_ORIGINATED) && !originate && (d->flags & BAF_TRANSITIVE) && (d->flags & BAF_OPTIONAL))
	d->flags |= BAF_PARTIAL;
      switch (d->type & EAF_TYPE_MASK)
	{
	case EAF_TYPE_INT_SET:
	  {
	    struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length);
	    z->length = d->u.ptr->length;
	    bgp_normalize_int_set((u32 *) z->data, (u32 *) d->u.ptr->data, z->length / 4);
	    d->u.ptr = z;
	    break;
	  }
	case EAF_TYPE_EC_SET:
	  {
	    struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length);
	    z->length = d->u.ptr->length;
	    bgp_normalize_ec_set(z, (u32 *) d->u.ptr->data, p->is_internal);
	    d->u.ptr = z;
	    break;
	  }
	case EAF_TYPE_LC_SET:
	  {
	    struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length);
	    z->length = d->u.ptr->length;
	    bgp_normalize_lc_set((u32 *) z->data, (u32 *) d->u.ptr->data, z->length / LCOMM_LENGTH);
	    d->u.ptr = z;
	    break;
	  }
	default: ;
	}
      d++;
      new->count++;
    }

  /* Hash */
  hash = ea_hash(new);
  for(b=p->bucket_hash[hash & (p->hash_size - 1)]; b; b=b->hash_next)
    if (b->hash == hash && ea_same(b->eattrs, new))
      {
	DBG("Found bucket.\n");
	return b;
      }

  /* Ensure that there are all mandatory attributes */
  for(i=0; i<ARRAY_SIZE(bgp_mandatory_attrs); i++)
    if (!(seen & (1 << bgp_mandatory_attrs[i])))
      {
	log(L_ERR "%s: Mandatory attribute %s missing in route %I/%d", p->p.name, bgp_attr_table[bgp_mandatory_attrs[i]].name, n->n.prefix, n->n.pxlen);
	return NULL;
      }

  /* Check if next hop is valid */
  a = ea_find(new, EA_CODE(EAP_BGP, BA_NEXT_HOP));
  if (!a || ipa_equal(p->cf->remote_ip, *(ip_addr *)a->u.ptr->data))
    {
      log(L_ERR "%s: Invalid NEXT_HOP attribute in route %I/%d", p->p.name, n->n.prefix, n->n.pxlen);
      return NULL;
    }

  /* Create new bucket */
  DBG("Creating bucket.\n");
  return bgp_new_bucket(p, new, hash);
}

void
bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck)
{
  if (buck->hash_next)
    buck->hash_next->hash_prev = buck->hash_prev;
  if (buck->hash_prev)
    buck->hash_prev->hash_next = buck->hash_next;
  else
    p->bucket_hash[buck->hash & (p->hash_size-1)] = buck->hash_next;
  mb_free(buck);
}


/* Prefix hash table */

#define PXH_KEY(n1)		n1->n.prefix, n1->n.pxlen, n1->path_id
#define PXH_NEXT(n)		n->next
#define PXH_EQ(p1,l1,i1,p2,l2,i2) ipa_equal(p1, p2) && l1 == l2 && i1 == i2
#define PXH_FN(p,l,i)		ipa_hash32(p) ^ u32_hash((l << 16) ^ i)

#define PXH_REHASH		bgp_pxh_rehash
#define PXH_PARAMS		/8, *2, 2, 2, 8, 24


HASH_DEFINE_REHASH_FN(PXH, struct bgp_prefix)

void
bgp_init_prefix_table(struct bgp_proto *p, u32 order)
{
  HASH_INIT(p->prefix_hash, p->p.pool, order);

  p->prefix_slab = sl_new(p->p.pool, sizeof(struct bgp_prefix));
}

void
bgp_free_prefix_table(struct bgp_proto *p)
{
  HASH_FREE(p->prefix_hash);

  rfree(p->prefix_slab);
  p->prefix_slab = NULL;
}

static struct bgp_prefix *
bgp_get_prefix(struct bgp_proto *p, ip_addr prefix, int pxlen, u32 path_id)
{
  struct bgp_prefix *bp = HASH_FIND(p->prefix_hash, PXH, prefix, pxlen, path_id);

  if (bp)
    return bp;

  bp = sl_alloc(p->prefix_slab);
  bp->n.prefix = prefix;
  bp->n.pxlen = pxlen;
  bp->path_id = path_id;
  bp->bucket_node.next = NULL;

  HASH_INSERT2(p->prefix_hash, PXH, p->p.pool, bp);

  return bp;
}

void
bgp_free_prefix(struct bgp_proto *p, struct bgp_prefix *bp)
{
  HASH_REMOVE2(p->prefix_hash, PXH, p->p.pool, bp);
  sl_free(p->prefix_slab, bp);
}


void
bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs)
{
  struct bgp_proto *p = (struct bgp_proto *) P;
  struct bgp_bucket *buck;
  struct bgp_prefix *px;
  rte *key;
  u32 path_id;

  DBG("BGP: Got route %I/%d %s\n", n->n.prefix, n->n.pxlen, new ? "up" : "down");

  if (new)
    {
      key = new;
      buck = bgp_get_bucket(p, n, attrs, new->attrs->source != RTS_BGP);
      if (!buck)			/* Inconsistent attribute list */
	return;
    }
  else
    {
      key = old;
      if (!(buck = p->withdraw_bucket))
	{
	  buck = p->withdraw_bucket = mb_alloc(P->pool, sizeof(struct bgp_bucket));
	  init_list(&buck->prefixes);
	}
    }
  path_id = p->add_path_tx ? key->attrs->src->global_id : 0;
  px = bgp_get_prefix(p, n->n.prefix, n->n.pxlen, path_id);
  if (px->bucket_node.next)
    {
      DBG("\tRemoving old entry.\n");
      rem_node(&px->bucket_node);
    }
  add_tail(&buck->prefixes, &px->bucket_node);
  bgp_schedule_packet(p->conn, PKT_UPDATE);
}

static int
bgp_create_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *pool)
{
  ea_list *ea = lp_alloc(pool, sizeof(ea_list) + 4*sizeof(eattr));
  rta *rta = e->attrs;
  byte *z;

  ea->next = *attrs;
  *attrs = ea;
  ea->flags = EALF_SORTED;
  ea->count = 4;

  bgp_set_attr(ea->attrs, BA_ORIGIN,
       ((rta->source == RTS_OSPF_EXT1) || (rta->source == RTS_OSPF_EXT2)) ? ORIGIN_INCOMPLETE : ORIGIN_IGP);

  if (p->is_internal)
    bgp_set_attr_wa(ea->attrs+1, pool, BA_AS_PATH, 0);
  else
    {
      z = bgp_set_attr_wa(ea->attrs+1, pool, BA_AS_PATH, 6);
      z[0] = AS_PATH_SEQUENCE;
      z[1] = 1;				/* 1 AS */
      put_u32(z+2, p->local_as);
    }

  /* iBGP -> use gw, eBGP multi-hop -> use source_addr,
     eBGP single-hop -> use gw if on the same iface */
  z = bgp_set_attr_wa(ea->attrs+2, pool, BA_NEXT_HOP, NEXT_HOP_LENGTH);
  if (p->cf->next_hop_self ||
      rta->dest != RTD_ROUTER ||
      ipa_equal(rta->gw, IPA_NONE) ||
      ipa_is_link_local(rta->gw) ||
      (!p->is_internal && !p->cf->next_hop_keep &&
       (!p->neigh || (rta->iface != p->neigh->iface))))
    set_next_hop(z, p->source_addr);
  else
    set_next_hop(z, rta->gw);

  bgp_set_attr(ea->attrs+3, BA_LOCAL_PREF, p->cf->default_local_pref);

  return 0;				/* Leave decision to the filters */
}


static inline int
bgp_as_path_loopy(struct bgp_proto *p, rta *a)
{
  int num = p->cf->allow_local_as + 1;
  eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
  return (e && (num > 0) && as_path_contains(e->u.ptr, p->local_as, num));
}

static inline int
bgp_originator_id_loopy(struct bgp_proto *p, rta *a)
{
  eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
  return (e && (e->u.data == p->local_id));
}

static inline int
bgp_cluster_list_loopy(struct bgp_proto *p, rta *a)
{
  eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
  return (e && p->rr_client && int_set_contains(e->u.ptr, p->rr_cluster_id));
}


static inline void
bgp_path_prepend(rte *e, ea_list **attrs, struct linpool *pool, u32 as)
{
  eattr *a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
  bgp_attach_attr(attrs, pool, BA_AS_PATH, (uintptr_t) as_path_prepend(pool, a->u.ptr, as));
}

static inline void
bgp_cluster_list_prepend(rte *e, ea_list **attrs, struct linpool *pool, u32 cid)
{
  eattr *a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
  bgp_attach_attr(attrs, pool, BA_CLUSTER_LIST, (uintptr_t) int_set_prepend(pool, a ? a->u.ptr : NULL, cid));
}

static int
bgp_update_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *pool, int rr)
{
  eattr *a;

  if (!p->is_internal && !p->rs_client)
    {
      bgp_path_prepend(e, attrs, pool, p->local_as);

      /* The MULTI_EXIT_DISC attribute received from a neighboring AS MUST NOT be
       * propagated to other neighboring ASes.
       * Perhaps it would be better to undefine it.
       */
      a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
      if (a)
	bgp_attach_attr(attrs, pool, BA_MULTI_EXIT_DISC, 0);
    }

  /* iBGP -> keep next_hop, eBGP multi-hop -> use source_addr,
   * eBGP single-hop -> keep next_hop if on the same iface.
   * If the next_hop is zero (i.e. link-local), keep only if on the same iface.
   *
   * Note that same-iface-check uses iface from route, which is based on gw.
   */
  a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
  if (a && !p->cf->next_hop_self && 
      (p->cf->next_hop_keep ||
       (p->is_internal && ipa_nonzero(*((ip_addr *) a->u.ptr->data))) ||
       (p->neigh && (e->attrs->iface == p->neigh->iface))))
    {
      /* Leave the original next hop attribute, will check later where does it point */
    }
  else
    {
      /* Need to create new one */
      byte *b = bgp_attach_attr_wa(attrs, pool, BA_NEXT_HOP, NEXT_HOP_LENGTH);
      set_next_hop(b, p->source_addr);
    }

  if (rr)
    {
      /* Handling route reflection, RFC 4456 */
      struct bgp_proto *src = (struct bgp_proto *) e->attrs->src->proto;

      a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
      if (!a)
	bgp_attach_attr(attrs, pool, BA_ORIGINATOR_ID, src->remote_id);

      /* We attach proper cluster ID according to whether the route is entering or leaving the cluster */
      bgp_cluster_list_prepend(e, attrs, pool, src->rr_client ? src->rr_cluster_id : p->rr_cluster_id);

      /* Two RR clients with different cluster ID, hmmm */
      if (src->rr_client && p->rr_client && (src->rr_cluster_id != p->rr_cluster_id))
	bgp_cluster_list_prepend(e, attrs, pool, p->rr_cluster_id);
    }

  return 0;				/* Leave decision to the filters */
}

static int
bgp_community_filter(struct bgp_proto *p, rte *e)
{
  eattr *a;
  struct adata *d;

  /* Check if we aren't forbidden to export the route by communities */
  a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_COMMUNITY));
  if (a)
    {
      d = a->u.ptr;
      if (int_set_contains(d, BGP_COMM_NO_ADVERTISE))
	{
	  DBG("\tNO_ADVERTISE\n");
	  return 1;
	}
      if (!p->is_internal &&
	  (int_set_contains(d, BGP_COMM_NO_EXPORT) ||
	   int_set_contains(d, BGP_COMM_NO_EXPORT_SUBCONFED)))
	{
	  DBG("\tNO_EXPORT\n");
	  return 1;
	}

      if (!p->conn->peer_llgr_aware && int_set_contains(d, BGP_COMM_LLGR_STALE))
	return 1;
    }

  return 0;
}

int
bgp_import_control(struct proto *P, rte **new, ea_list **attrs, struct linpool *pool)
{
  rte *e = *new;
  struct bgp_proto *p = (struct bgp_proto *) P;
  struct bgp_proto *new_bgp = (e->attrs->src->proto->proto == &proto_bgp) ?
    (struct bgp_proto *) e->attrs->src->proto : NULL;

  if (p == new_bgp)			/* Poison reverse updates */
    return -1;
  if (new_bgp)
    {
      /* We should check here for cluster list loop, because the receiving BGP instance
	 might have different cluster ID  */
      if (bgp_cluster_list_loopy(p, e->attrs))
	return -1;

      if (p->cf->interpret_communities && bgp_community_filter(p, e))
	return -1;

      if (p->local_as == new_bgp->local_as && p->is_internal && new_bgp->is_internal)
	{
	  /* Redistribution of internal routes with IBGP */
	  if (p->rr_client || new_bgp->rr_client)
	    /* Route reflection, RFC 4456 */
	    return bgp_update_attrs(p, e, attrs, pool, 1);
	  else
	    return -1;
	}
      else
	return bgp_update_attrs(p, e, attrs, pool, 0);
    }
  else
    return bgp_create_attrs(p, e, attrs, pool);
}

static inline u32
bgp_get_neighbor(rte *r)
{
  eattr *e = ea_find(r->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
  u32 as;

  if (e && as_path_get_first(e->u.ptr, &as))
    return as;
  else
    return ((struct bgp_proto *) r->attrs->src->proto)->remote_as;
}

static inline int
rte_resolvable(rte *rt)
{
  int rd = rt->attrs->dest;  
  return (rd == RTD_ROUTER) || (rd == RTD_DEVICE) || (rd == RTD_MULTIPATH);
}

static inline int
rte_stale(rte *r)
{
  if (r->u.bgp.stale < 0)
  {
    /* If staleness is unknown, compute and cache it */
    eattr *a = ea_find(r->attrs->eattrs, EA_CODE(EAP_BGP, BA_COMMUNITY));
    r->u.bgp.stale = a && int_set_contains(a->u.ptr, BGP_COMM_LLGR_STALE);
  }

  return r->u.bgp.stale;
}

int
bgp_rte_better(rte *new, rte *old)
{
  struct bgp_proto *new_bgp = (struct bgp_proto *) new->attrs->src->proto;
  struct bgp_proto *old_bgp = (struct bgp_proto *) old->attrs->src->proto;
  eattr *x, *y;
  u32 n, o;

  /* Skip suppressed routes (see bgp_rte_recalculate()) */
  n = new->u.bgp.suppressed;
  o = old->u.bgp.suppressed;
  if (n > o)
    return 0;
  if (n < o)
    return 1;

  /* RFC 4271 9.1.2.1. Route resolvability test */
  n = rte_resolvable(new);
  o = rte_resolvable(old);
  if (n > o)
    return 1;
  if (n < o)
    return 0;

  /* LLGR draft - depreference stale routes */
  n = rte_stale(new);
  o = rte_stale(old);
  if (n > o)
    return 0;
  if (n < o)
    return 1;

  /* Start with local preferences */
  x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF));
  y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF));
  n = x ? x->u.data : new_bgp->cf->default_local_pref;
  o = y ? y->u.data : old_bgp->cf->default_local_pref;
  if (n > o)
    return 1;
  if (n < o)
    return 0;

  /* RFC 4271 9.1.2.2. a)  Use AS path lengths */
  if (new_bgp->cf->compare_path_lengths || old_bgp->cf->compare_path_lengths)
    {
      x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
      y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
      n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN;
      o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN;
      if (n < o)
	return 1;
      if (n > o)
	return 0;
    }

  /* RFC 4271 9.1.2.2. b) Use origins */
  x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN));
  y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN));
  n = x ? x->u.data : ORIGIN_INCOMPLETE;
  o = y ? y->u.data : ORIGIN_INCOMPLETE;
  if (n < o)
    return 1;
  if (n > o)
    return 0;

  /* RFC 4271 9.1.2.2. c) Compare MED's */
  /* Proper RFC 4271 path selection cannot be interpreted as finding
   * the best path in some ordering. It is implemented partially in
   * bgp_rte_recalculate() when deterministic_med option is
   * active. Without that option, the behavior is just an
   * approximation, which in specific situations may lead to
   * persistent routing loops, because it is nondeterministic - it
   * depends on the order in which routes appeared. But it is also the
   * same behavior as used by default in Cisco routers, so it is
   * probably not a big issue.
   */
  if (new_bgp->cf->med_metric || old_bgp->cf->med_metric ||
      (bgp_get_neighbor(new) == bgp_get_neighbor(old)))
    {
      x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
      y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
      n = x ? x->u.data : new_bgp->cf->default_med;
      o = y ? y->u.data : old_bgp->cf->default_med;
      if (n < o)
	return 1;
      if (n > o)
	return 0;
    }

  /* RFC 4271 9.1.2.2. d) Prefer external peers */
  if (new_bgp->is_internal > old_bgp->is_internal)
    return 0;
  if (new_bgp->is_internal < old_bgp->is_internal)
    return 1;

  /* RFC 4271 9.1.2.2. e) Compare IGP metrics */
  n = new_bgp->cf->igp_metric ? new->attrs->igp_metric : 0;
  o = old_bgp->cf->igp_metric ? old->attrs->igp_metric : 0;
  if (n < o)
    return 1;
  if (n > o)
    return 0;

  /* RFC 4271 9.1.2.2. f) Compare BGP identifiers */
  /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighor ID */
  x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
  y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
  n = x ? x->u.data : new_bgp->remote_id;
  o = y ? y->u.data : old_bgp->remote_id;

  /* RFC 5004 - prefer older routes */
  /* (if both are external and from different peer) */
  if ((new_bgp->cf->prefer_older || old_bgp->cf->prefer_older) &&
      !new_bgp->is_internal && n != o)
    return 0;

  /* rest of RFC 4271 9.1.2.2. f) */
  if (n < o)
    return 1;
  if (n > o)
    return 0;

  /* RFC 4456 9. b) Compare cluster list lengths */
  x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
  y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
  n = x ? int_set_get_size(x->u.ptr) : 0;
  o = y ? int_set_get_size(y->u.ptr) : 0;
  if (n < o)
    return 1;
  if (n > o)
    return 0;

  /* RFC 4271 9.1.2.2. g) Compare peer IP adresses */
  return (ipa_compare(new_bgp->cf->remote_ip, old_bgp->cf->remote_ip) < 0);
}


int
bgp_rte_mergable(rte *pri, rte *sec)
{
  struct bgp_proto *pri_bgp = (struct bgp_proto *) pri->attrs->src->proto;
  struct bgp_proto *sec_bgp = (struct bgp_proto *) sec->attrs->src->proto;
  eattr *x, *y;
  u32 p, s;

  /* Skip suppressed routes (see bgp_rte_recalculate()) */
  if (pri->u.bgp.suppressed != sec->u.bgp.suppressed)
    return 0;

  /* RFC 4271 9.1.2.1. Route resolvability test */
  if (!rte_resolvable(sec))
    return 0;

  /* LLGR draft - depreference stale routes */
  if (rte_stale(pri) != rte_stale(sec))
    return 0;

  /* Start with local preferences */
  x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF));
  y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF));
  p = x ? x->u.data : pri_bgp->cf->default_local_pref;
  s = y ? y->u.data : sec_bgp->cf->default_local_pref;
  if (p != s)
    return 0;

  /* RFC 4271 9.1.2.2. a)  Use AS path lengths */
  if (pri_bgp->cf->compare_path_lengths || sec_bgp->cf->compare_path_lengths)
    {
      x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
      y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
      p = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN;
      s = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN;

      if (p != s)
	return 0;

//      if (DELTA(p, s) > pri_bgp->cf->relax_multipath)
//	return 0;
    }

  /* RFC 4271 9.1.2.2. b) Use origins */
  x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN));
  y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN));
  p = x ? x->u.data : ORIGIN_INCOMPLETE;
  s = y ? y->u.data : ORIGIN_INCOMPLETE;
  if (p != s)
    return 0;

  /* RFC 4271 9.1.2.2. c) Compare MED's */
  if (pri_bgp->cf->med_metric || sec_bgp->cf->med_metric ||
      (bgp_get_neighbor(pri) == bgp_get_neighbor(sec)))
    {
      x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
      y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
      p = x ? x->u.data : pri_bgp->cf->default_med;
      s = y ? y->u.data : sec_bgp->cf->default_med;
      if (p != s)
	return 0;
    }

  /* RFC 4271 9.1.2.2. d) Prefer external peers */
  if (pri_bgp->is_internal != sec_bgp->is_internal)
    return 0;

  /* RFC 4271 9.1.2.2. e) Compare IGP metrics */
  p = pri_bgp->cf->igp_metric ? pri->attrs->igp_metric : 0;
  s = sec_bgp->cf->igp_metric ? sec->attrs->igp_metric : 0;
  if (p != s)
    return 0;

  /* Remaining criteria are ignored */

  return 1;
}



static inline int
same_group(rte *r, u32 lpref, u32 lasn)
{
  return (r->pref == lpref) && (bgp_get_neighbor(r) == lasn);
}

static inline int
use_deterministic_med(rte *r)
{
  struct proto *P = r->attrs->src->proto;
  return (P->proto == &proto_bgp) && ((struct bgp_proto *) P)->cf->deterministic_med;
}

int
bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best)
{
  rte *r, *s;
  rte *key = new ? new : old;
  u32 lpref = key->pref;
  u32 lasn = bgp_get_neighbor(key);
  int old_is_group_best = 0;

  /*
   * Proper RFC 4271 path selection is a bit complicated, it cannot be
   * implemented just by rte_better(), because it is not a linear
   * ordering. But it can be splitted to two levels, where the lower
   * level chooses the best routes in each group of routes from the
   * same neighboring AS and higher level chooses the best route (with
   * a slightly different ordering) between the best-in-group routes.
   *
   * When deterministic_med is disabled, we just ignore this issue and
   * choose the best route by bgp_rte_better() alone. If enabled, the
   * lower level of the route selection is done here (for the group
   * to which the changed route belongs), all routes in group are
   * marked as suppressed, just chosen best-in-group is not.
   *
   * Global best route selection then implements higher level by
   * choosing between non-suppressed routes (as they are always
   * preferred over suppressed routes). Routes from BGP protocols
   * that do not set deterministic_med are just never suppressed. As
   * they do not participate in the lower level selection, it is OK
   * that this fn is not called for them.
   *
   * The idea is simple, the implementation is more problematic,
   * mostly because of optimizations in rte_recalculate() that 
   * avoids full recalculation in most cases.
   *
   * We can assume that at least one of new, old is non-NULL and both
   * are from the same protocol with enabled deterministic_med. We
   * group routes by both neighbor AS (lasn) and preference (lpref),
   * because bgp_rte_better() does not handle preference itself.
   */

  /* If new and old are from different groups, we just process that
     as two independent events */
  if (new && old && !same_group(old, lpref, lasn))
    {
      int i1, i2;
      i1 = bgp_rte_recalculate(table, net, NULL, old, old_best);
      i2 = bgp_rte_recalculate(table, net, new, NULL, old_best);
      return i1 || i2;
    }

  /* 
   * We could find the best-in-group and then make some shortcuts like
   * in rte_recalculate, but as we would have to walk through all
   * net->routes just to find it, it is probably not worth. So we
   * just have two simpler fast cases that use just the old route.
   * We also set suppressed flag to avoid using it in bgp_rte_better().
   */

  if (new)
    new->u.bgp.suppressed = 1;

  if (old)
    {
      old_is_group_best = !old->u.bgp.suppressed;
      old->u.bgp.suppressed = 1;
      int new_is_better = new && bgp_rte_better(new, old);

      /* The first case - replace not best with worse (or remove not best) */
      if (!old_is_group_best && !new_is_better)
	return 0;

      /* The second case - replace the best with better */
      if (old_is_group_best && new_is_better)
	{
	  /* new is best-in-group, the see discussion below - this is
	     a special variant of NBG && OBG. From OBG we can deduce
	     that same_group(old_best) iff (old == old_best)  */
	  new->u.bgp.suppressed = 0;
	  return (old == old_best);
	}
    }

  /* The default case - find a new best-in-group route */
  r = new; /* new may not be in the list */
  for (s=net->routes; rte_is_valid(s); s=s->next)
    if (use_deterministic_med(s) && same_group(s, lpref, lasn))
      {
	s->u.bgp.suppressed = 1;
	if (!r || bgp_rte_better(s, r))
	  r = s;
      }

  /* Simple case - the last route in group disappears */
  if (!r)
    return 0;

  /* Found best-in-group */
  r->u.bgp.suppressed = 0;

  /*
   * There are generally two reasons why we have to force
   * recalculation (return 1): First, the new route may be wrongfully
   * chosen to be the best in the first case check in
   * rte_recalculate(), this may happen only if old_best is from the
   * same group. Second, another (different than new route)
   * best-in-group is chosen and that may be the proper best (although
   * rte_recalculate() without ignore that possibility).
   *
   * There are three possible cases according to whether the old route
   * was the best in group (OBG, stored in old_is_group_best) and
   * whether the new route is the best in group (NBG, tested by r == new).
   * These cases work even if old or new is NULL.
   *
   * NBG -> new is a possible candidate for the best route, so we just
   *        check for the first reason using same_group().
   *
   * !NBG && OBG -> Second reason applies, return 1
   *
   * !NBG && !OBG -> Best in group does not change, old != old_best,
   *                 rte_better(new, old_best) is false and therefore
   *                 the first reason does not apply, return 0
   */

  if (r == new)
    return old_best && same_group(old_best, lpref, lasn);
  else
    return old_is_group_best;
}

struct rte *
bgp_rte_modify_stale(struct rte *r, struct linpool *pool)
{
  eattr *a = ea_find(r->attrs->eattrs, EA_CODE(EAP_BGP, BA_COMMUNITY));
  struct adata *ad = a ? a->u.ptr : NULL;

  if (ad && int_set_contains(ad, BGP_COMM_NO_LLGR))
    return NULL;

  if (ad && int_set_contains(ad, BGP_COMM_LLGR_STALE))
    return r;

  r = rte_cow_rta(r, pool);
  bgp_attach_attr(&(r->attrs->eattrs), pool, BA_COMMUNITY,
		  (uintptr_t) int_set_add(pool, ad, BGP_COMM_LLGR_STALE));
  r->u.bgp.stale = 1;

  return r;
}


static struct adata *
bgp_aggregator_convert_to_new(struct adata *old, struct linpool *pool)
{
  struct adata *newa = lp_alloc(pool, sizeof(struct adata) + 8);
  newa->length = 8;
  aggregator_convert_to_new(old, newa->data);
  return newa;
}

/* Take last req_as ASNs from path old2 (in 2B format), convert to 4B format
 * and append path old4 (in 4B format).
 */
static struct adata *
bgp_merge_as_paths(struct adata *old2, struct adata *old4, int req_as, struct linpool *pool)
{
  byte buf[old2->length * 2];

  int ol = as_path_convert_to_new(old2, buf, req_as);
  int nl = ol + (old4 ? old4->length : 0);

  struct adata *newa = lp_alloc(pool, sizeof(struct adata) + nl);
  newa->length = nl;
  memcpy(newa->data, buf, ol);
  if (old4) memcpy(newa->data + ol, old4->data, old4->length);

  return newa;
}

static int
as4_aggregator_valid(struct adata *aggr)
{
  return aggr->length == 8;
}


/* Reconstruct 4B AS_PATH and AGGREGATOR according to RFC 4893 4.2.3 */
static void
bgp_reconstruct_4b_atts(struct bgp_proto *p, rta *a, struct linpool *pool)
{
  eattr *p2 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
  eattr *p4 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS4_PATH));
  eattr *a2 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AGGREGATOR));
  eattr *a4 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS4_AGGREGATOR));
  int a4_removed = 0;

  if (a4 && !as4_aggregator_valid(a4->u.ptr))
    {
      log(L_WARN "%s: AS4_AGGREGATOR attribute is invalid, skipping attribute", p->p.name);
      a4 = NULL;
      a4_removed = 1;
    }

  if (a2)
    {
      u32 a2_as = get_u16(a2->u.ptr->data);

      if (a4)
	{
	  if (a2_as != AS_TRANS)
	    {
	      /* Routes were aggregated by old router and therefore AS4_PATH
	       * and AS4_AGGREGATOR is invalid
	       *
	       * Convert AS_PATH and AGGREGATOR to 4B format and finish.
	       */

	      a2->u.ptr = bgp_aggregator_convert_to_new(a2->u.ptr, pool);
	      p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, NULL, AS_PATH_MAXLEN, pool);

	      return;
	    }
	  else
	    {
	      /* Common case, use AS4_AGGREGATOR attribute */
	      a2->u.ptr = a4->u.ptr;
	    }
	}
      else
	{
	  /* Common case, use old AGGREGATOR attribute */
	  a2->u.ptr = bgp_aggregator_convert_to_new(a2->u.ptr, pool);

	  if ((a2_as == AS_TRANS) && !a4_removed)
	    log(L_WARN "%s: AGGREGATOR attribute contain AS_TRANS, but AS4_AGGREGATOR is missing", p->p.name);
	}
    }
  else
    if (a4)
      log(L_WARN "%s: AS4_AGGREGATOR attribute received, but AGGREGATOR attribute is missing", p->p.name);

  int p2_len = as_path_getlen_int(p2->u.ptr, 2);
  int p4_len = p4 ? validate_as4_path(p, p4->u.ptr) : -1;

  if (p4 && (p4_len < 0))
    log(L_WARN "%s: AS4_PATH attribute is malformed, skipping attribute", p->p.name);

  if ((p4_len <= 0) || (p2_len < p4_len))
    p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, NULL, AS_PATH_MAXLEN, pool);
  else
    p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, p4->u.ptr, p2_len - p4_len, pool);
}

static void
bgp_remove_as4_attrs(struct bgp_proto *p, rta *a)
{
  unsigned id1 = EA_CODE(EAP_BGP, BA_AS4_PATH);
  unsigned id2 = EA_CODE(EAP_BGP, BA_AS4_AGGREGATOR);
  ea_list **el = &(a->eattrs);

  /* We know that ea_lists constructed in bgp_decode attrs have one attribute per ea_list struct */
  while (*el != NULL)
    {
      unsigned fid = (*el)->attrs[0].id;

      if ((fid == id1) || (fid == id2))
	{
	  *el = (*el)->next;
	  if (p->as4_session)
	    log(L_WARN "%s: Unexpected AS4_* attributes received", p->p.name);
	}
      else
	el = &((*el)->next);
    }
}

/**
 * bgp_decode_attrs - check and decode BGP attributes
 * @conn: connection
 * @attr: start of attribute block
 * @len: length of attribute block
 * @pool: linear pool to make all the allocations in
 * @mandatory: 1 iff presence of mandatory attributes has to be checked
 *
 * This function takes a BGP attribute block (a part of an Update message), checks
 * its consistency and converts it to a list of BIRD route attributes represented
 * by a &rta.
 */
struct rta *
bgp_decode_attrs(struct bgp_conn *conn, byte *attr, uint len, struct linpool *pool, int mandatory)
{
  struct bgp_proto *bgp = conn->bgp;
  rta *a = lp_alloc(pool, sizeof(struct rta));
  uint flags, code, l, i, type;
  int errcode;
  byte *z, *attr_start;
  byte seen[256/8];
  ea_list *ea;
  struct adata *ad;
  int withdraw = 0;

  bzero(a, sizeof(rta));
  a->source = RTS_BGP;
  a->scope = SCOPE_UNIVERSE;
  a->cast = RTC_UNICAST;
  /* a->dest = RTD_ROUTER;  -- set in bgp_set_next_hop() */
  a->from = bgp->cf->remote_ip;

  /* Parse the attributes */
  bzero(seen, sizeof(seen));
  DBG("BGP: Parsing attributes\n");
  while (len)
    {
      if (len < 2)
	goto malformed;
      attr_start = attr;
      flags = *attr++;
      code = *attr++;
      len -= 2;
      if (flags & BAF_EXT_LEN)
	{
	  if (len < 2)
	    goto malformed;
	  l = get_u16(attr);
	  attr += 2;
	  len -= 2;
	}
      else
	{
	  if (len < 1)
	    goto malformed;
	  l = *attr++;
	  len--;
	}
      if (l > len)
	goto malformed;
      len -= l;
      z = attr;
      attr += l;
      DBG("Attr %02x %02x %d\n", code, flags, l);
      if (seen[code/8] & (1 << (code%8)))
	goto malformed;
      if (ATTR_KNOWN(code))
	{
	  struct attr_desc *desc = &bgp_attr_table[code];
	  if (desc->expected_length >= 0 && desc->expected_length != (int) l)
	    { errcode = 5; goto err; }
	  if ((desc->expected_flags ^ flags) & (BAF_OPTIONAL | BAF_TRANSITIVE))
	    { errcode = 4; goto err; }
	  if (!bgp->is_internal)
	    {
	      if (!desc->allow_in_ebgp)
		continue;
	      if ((code == BA_LOCAL_PREF) && !bgp->cf->allow_local_pref)
		continue;
	    }
	  if (desc->validate)
	    {
	      errcode = desc->validate(bgp, z, l);
	      if (errcode > 0)
		goto err;
	      if (errcode == IGNORE)
		continue;
	      if (errcode <= WITHDRAW)
		{
		  log(L_WARN "%s: Attribute %s is malformed, withdrawing update",
		      bgp->p.name, desc->name);
		  withdraw = 1;
		}
	    }
	  else if (code == BA_AS_PATH)
	    {
	      /* Special case as it might also trim the attribute */
	      if (validate_as_path(bgp, z, &l) < 0)
		{ errcode = 11; goto err; }
	    }
	  type = desc->type;
	}
      else				/* Unknown attribute */
	{
	  if (!(flags & BAF_OPTIONAL))
	    { errcode = 2; goto err; }
	  type = EAF_TYPE_OPAQUE;
	}
      
      // Only OPTIONAL and TRANSITIVE attributes may have non-zero PARTIAL flag
      // if (!((flags & BAF_OPTIONAL) && (flags & BAF_TRANSITIVE)) && (flags & BAF_PARTIAL))
      //   { errcode = 4; goto err; }

      seen[code/8] |= (1 << (code%8));
      ea = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr));
      ea->next = a->eattrs;
      a->eattrs = ea;
      ea->flags = 0;
      ea->count = 1;
      ea->attrs[0].id = EA_CODE(EAP_BGP, code);
      ea->attrs[0].flags = flags;
      ea->attrs[0].type = type;
      if (type & EAF_EMBEDDED)
	ad = NULL;
      else
	{
	  ad = lp_alloc(pool, sizeof(struct adata) + l);
	  ea->attrs[0].u.ptr = ad;
	  ad->length = l;
	  memcpy(ad->data, z, l);
	}
      switch (type)
	{
	case EAF_TYPE_ROUTER_ID:
	case EAF_TYPE_INT:
	  if (l == 1)
	    ea->attrs[0].u.data = *z;
	  else
	    ea->attrs[0].u.data = get_u32(z);
	  break;
	case EAF_TYPE_IP_ADDRESS:
	  ipa_ntoh(*(ip_addr *)ad->data);
	  break;
	case EAF_TYPE_INT_SET:
	case EAF_TYPE_LC_SET:
	case EAF_TYPE_EC_SET:
	  {
	    u32 *z = (u32 *) ad->data;
	    for(i=0; i<ad->length/4; i++)
	      z[i] = ntohl(z[i]);
	    break;
	  }
	}
    }

  if (withdraw)
    goto withdraw;

#ifdef IPV6
  /* If we received MP_REACH_NLRI we should check mandatory attributes */
  if (bgp->mp_reach_len != 0)
    mandatory = 1;
#endif

  /* If there is no (reachability) NLRI, we should exit now */
  if (! mandatory)
    return a;

  /* Check if all mandatory attributes are present */
  for(i=0; i < ARRAY_SIZE(bgp_mandatory_attrs); i++)
    {
      code = bgp_mandatory_attrs[i];
      if (!(seen[code/8] & (1 << (code%8))))
	{
	  bgp_error(conn, 3, 3, &bgp_mandatory_attrs[i], 1);
	  return NULL;
	}
    }

  /* When receiving attributes from non-AS4-aware BGP speaker,
   * we have to reconstruct 4B AS_PATH and AGGREGATOR attributes
   */
  if (! bgp->as4_session)
    bgp_reconstruct_4b_atts(bgp, a, pool);

  bgp_remove_as4_attrs(bgp, a);

  /* If the AS path attribute contains our AS, reject the routes */
  if (bgp_as_path_loopy(bgp, a))
    goto withdraw;

  /* Two checks for IBGP loops caused by route reflection, RFC 4456 */ 
  if (bgp_originator_id_loopy(bgp, a) ||
      bgp_cluster_list_loopy(bgp, a))
    goto withdraw;

  /* If there's no local preference, define one */
  if (!(seen[0] & (1 << BA_LOCAL_PREF)))
    bgp_attach_attr(&a->eattrs, pool, BA_LOCAL_PREF, bgp->cf->default_local_pref);

  return a;

withdraw:
  return NULL;

malformed:
  bgp_error(conn, 3, 1, NULL, 0);
  return NULL;

err:
  bgp_error(conn, 3, errcode, attr_start, z+l-attr_start);
  return NULL;
}

int
bgp_get_attr(eattr *a, byte *buf, int buflen)
{
  uint i = EA_ID(a->id);
  struct attr_desc *d;
  int len;

  if (ATTR_KNOWN(i))
    {
      d = &bgp_attr_table[i];
      len = bsprintf(buf, "%s", d->name);
      buf += len;
      if (d->format)
	{
	  *buf++ = ':';
	  *buf++ = ' ';
	  d->format(a, buf, buflen - len - 2);
	  return GA_FULL;
	}
      return GA_NAME;
    }
  bsprintf(buf, "%02x%s", i, (a->flags & BAF_TRANSITIVE) ? " [t]" : "");
  return GA_NAME;
}

void
bgp_init_bucket_table(struct bgp_proto *p)
{
  p->hash_size = 256;
  p->hash_limit = p->hash_size * 4;
  p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *));
  init_list(&p->bucket_queue);
  p->withdraw_bucket = NULL;
  // fib_init(&p->prefix_fib, p->p.pool, sizeof(struct bgp_prefix), 0, bgp_init_prefix);
}

void
bgp_free_bucket_table(struct bgp_proto *p)
{
  mb_free(p->bucket_hash);
  p->bucket_hash = NULL;

  struct bgp_bucket *b;
  WALK_LIST_FIRST(b, p->bucket_queue)
  {
    rem_node(&b->send_node);
    mb_free(b);
  }

  mb_free(p->withdraw_bucket);
  p->withdraw_bucket = NULL;
}

void
bgp_get_route_info(rte *e, byte *buf, ea_list *attrs)
{
  eattr *p = ea_find(attrs, EA_CODE(EAP_BGP, BA_AS_PATH));
  eattr *o = ea_find(attrs, EA_CODE(EAP_BGP, BA_ORIGIN));
  u32 origas;

  buf += bsprintf(buf, " (%d", e->pref);

  if (e->u.bgp.suppressed)
    buf += bsprintf(buf, "-");

  if (rte_stale(e))
    buf += bsprintf(buf, "s");

  if (e->attrs->hostentry)
    {
      if (!rte_resolvable(e))
	buf += bsprintf(buf, "/-");
      else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN)
	buf += bsprintf(buf, "/?");
      else
	buf += bsprintf(buf, "/%d", e->attrs->igp_metric);
    }
  buf += bsprintf(buf, ") [");

  if (p && as_path_get_last(p->u.ptr, &origas))
    buf += bsprintf(buf, "AS%u", origas);
  if (o)
    buf += bsprintf(buf, "%c", "ie?"[o->u.data]);
  strcpy(buf, "]");
}

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>