[ovs-dev] [patch_v4 3/6] Userspace Datapath: Introduce NAT support.

Darrell Ball dball at vmware.com
Wed Feb 8 02:21:09 UTC 2017



On 1/27/17, 5:57 PM, "ovs-dev-bounces at openvswitch.org on behalf of Daniele Di Proietto" <ovs-dev-bounces at openvswitch.org on behalf of diproiettod at ovn.org> wrote:

    2017-01-24 20:40 GMT-08:00 Darrell Ball <dlu998 at gmail.com>:
    > This patch introduces NAT support for the userspace datapath.
    > The conntrack module changes are in this patch.
    >
    > The per packet scope of lookups for NAT and un_NAT is at
    > the bucket level rather than global. One hash table is
    > introduced to support create/delete handling. The create/delete
    > events may be further optimized, if the need becomes clear.
    >
    > Some NAT options with limited utility (persistent, random) are
    > not supported yet, but will be supported in a later patch.
    >
    > Signed-off-by: Darrell Ball <dlu998 at gmail.com>
    
    Sparse reports some problems:
    
    ../lib/conntrack.c:1375:16: warning: constant 0xffffffffffffffff is so
    big it is unsigned long
    ../lib/conntrack.c:1398:9: warning: constant 0xffffffffffffffff is so
    big it is unsigned long
    ../lib/conntrack.c:1400:36: warning: constant 0xffffffffffffffff is so
    big it is unsigned long
    ../lib/conntrack.c:1403:33: warning: constant 0xffffffffffffffff is so
    big it is unsigned long



    ../lib/conntrack.c:214:30: error: no member 's6_addr32' in struct in6_addr
    ../lib/conntrack.c:240:30: error: no member 's6_addr32' in struct in6_addr
    ../lib/conntrack.c:1360:52: error: no member 's6_addr32' in struct in6_addr
    ../lib/conntrack.c:1362:52: error: no member 's6_addr32' in struct in6_addr
    ../lib/conntrack.c:1365:52: error: no member 's6_addr32' in struct in6_addr
    ../lib/conntrack.c:1367:52: error: no member 's6_addr32' in struct in6_addr
    ../lib/conntrack.c:1395:44: error: no member 's6_addr32' in struct in6_addr
    ../lib/conntrack.c:1396:44: error: no member 's6_addr32' in struct in6_addr
    ../lib/conntrack.c:1409:25: error: no member 's6_addr32' in struct in6_addr
    ../lib/conntrack.c:1410:25: error: no member 's6_addr32' in struct in6_addr

This second one is real; it would affect the Windows build. I fixed it.
I have more appreciation for Sparse now.
    
    There are some minor coding style problems, you can find them with
    utilities/checkpatch.py

yeah; I found checkpatch.py helpful in the past.
    
    > ---
    >  lib/conntrack-private.h |  16 +-
    >  lib/conntrack.c         | 740 ++++++++++++++++++++++++++++++++++++++++++------
    >  lib/conntrack.h         |  44 +++
    >  3 files changed, 717 insertions(+), 83 deletions(-)
    >
    > diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
    > index 493865f..b71af37 100644
    > --- a/lib/conntrack-private.h
    > +++ b/lib/conntrack-private.h
    > @@ -51,14 +51,23 @@ struct conn_key {
    >      uint16_t zone;
    >  };
    >
    > +struct nat_conn_key_node {
    > +    struct hmap_node node;
    > +    struct conn_key key;
    > +    struct conn_key value;
    > +};
    > +
    >  struct conn {
    >      struct conn_key key;
    >      struct conn_key rev_key;
    >      long long expiration;
    >      struct ovs_list exp_node;
    >      struct hmap_node node;
    > -    uint32_t mark;
    >      ovs_u128 label;
    > +    /* XXX: consider flattening. */
    > +    struct nat_action_info_t *nat_info;
    > +    uint32_t mark;
    > +    uint8_t conn_type;
    >  };
    >
    >  enum ct_update_res {
    > @@ -67,6 +76,11 @@ enum ct_update_res {
    >      CT_UPDATE_NEW,
    >  };
    >
    > +enum ct_conn_type {
    > +    CT_CONN_TYPE_DEFAULT,
    > +       CT_CONN_TYPE_UN_NAT,
    > +};
    > +
    >  struct ct_l4_proto {
    >      struct conn *(*new_conn)(struct conntrack_bucket *, struct dp_packet *pkt,
    >                               long long now);
    > diff --git a/lib/conntrack.c b/lib/conntrack.c
    > index 0a611a2..34728a6 100644
    > --- a/lib/conntrack.c
    > +++ b/lib/conntrack.c
    > @@ -76,6 +76,20 @@ static void set_label(struct dp_packet *, struct conn *,
    >                        const struct ovs_key_ct_labels *mask);
    >  static void *clean_thread_main(void *f_);
    >
    > +static struct nat_conn_key_node *
    > +nat_conn_keys_lookup(struct hmap *nat_conn_keys,
    > +                     const struct conn_key *key,
    > +                     uint32_t basis);
    > +
    > +static void
    > +nat_conn_keys_remove(struct hmap *nat_conn_keys,
    > +                    const struct conn_key *key,
    > +                    uint32_t basis);
    > +
    > +static bool
    > +nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
    > +                              struct conn *nat_conn);
    > +
    >  static struct ct_l4_proto *l4_protos[] = {
    >      [IPPROTO_TCP] = &ct_proto_tcp,
    >      [IPPROTO_UDP] = &ct_proto_other,
    > @@ -90,9 +104,11 @@ long long ct_timeout_val[] = {
    >  };
    >
    >  /* If the total number of connections goes above this value, no new connections
    > - * are accepted */
    > + * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
    >  #define DEFAULT_N_CONN_LIMIT 3000000
    >
    > +#define DT
    > +
    
    I guess this is left here from debugging
    
    >  /* Initializes the connection tracker 'ct'.  The caller is responsible for
    >   * calling 'conntrack_destroy()', when the instance is not needed anymore */
    >  void
    > @@ -101,6 +117,11 @@ conntrack_init(struct conntrack *ct)
    >      unsigned i, j;
    >      long long now = time_msec();
    >
    > +    ct_rwlock_init(&ct->nat_resources_lock);
    > +    ct_rwlock_wrlock(&ct->nat_resources_lock);
    > +    hmap_init(&ct->nat_conn_keys);
    > +    ct_rwlock_unlock(&ct->nat_resources_lock);
    > +
    >      for (i = 0; i < CONNTRACK_BUCKETS; i++) {
    >          struct conntrack_bucket *ctb = &ct->buckets[i];
    >
    > @@ -139,13 +160,24 @@ conntrack_destroy(struct conntrack *ct)
    >          ovs_mutex_destroy(&ctb->cleanup_mutex);
    >          ct_lock_lock(&ctb->lock);
    >          HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
    > -            atomic_count_dec(&ct->n_conn);
    > +            if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
    > +                atomic_count_dec(&ct->n_conn);
    > +            }
    >              delete_conn(conn);
    >          }
    >          hmap_destroy(&ctb->connections);
    >          ct_lock_unlock(&ctb->lock);
    >          ct_lock_destroy(&ctb->lock);
    >      }
    > +    ct_rwlock_wrlock(&ct->nat_resources_lock);
    > +    struct nat_conn_key_node *nat_conn_key_node;
    > +    HMAP_FOR_EACH_POP(nat_conn_key_node, node, &ct->nat_conn_keys) {
    > +        free(nat_conn_key_node);
    > +    }
    > +    hmap_destroy(&ct->nat_conn_keys);
    > +    ct_rwlock_unlock(&ct->nat_resources_lock);
    > +    ct_rwlock_destroy(&ct->nat_resources_lock);
    > +
    >  }
    >
    >  static unsigned hash_to_bucket(uint32_t hash)
    > @@ -167,10 +199,175 @@ write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,
    >      pkt->md.ct_label = label;
    >  }
    >
    > +static void
    > +nat_packet(struct dp_packet *pkt, const struct conn *conn, uint16_t *state)
    > +{
    > +    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
    > +           *state |= CS_SRC_NAT;
    > +        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
    > +            struct ip_header *nh = dp_packet_l3(pkt);
    > +            packet_set_ipv4_addr(pkt, &nh->ip_src,
    > +                conn->rev_key.dst.addr.ipv4_aligned);
    > +        } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) {
    > +            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
    > +            struct in6_addr ipv6_addr;
    > +            memcpy(&ipv6_addr.s6_addr32, conn->rev_key.dst.addr.ipv6.be32,
    > +                   sizeof ipv6_addr);
    > +            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
    > +                                 nh6->ip6_src.be32,
    > +                                                                &ipv6_addr,
    > +                                 true);
    > +        }
    > +
    > +        if (conn->key.nw_proto == IPPROTO_TCP) {
    > +            struct tcp_header *th = dp_packet_l4(pkt);
    > +            packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
    > +        } else if (conn->key.nw_proto == IPPROTO_UDP) {
    > +            struct udp_header *uh = dp_packet_l4(pkt);
    > +            packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
    > +        }
    > +    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
    > +        *state |= CS_DST_NAT;
    > +
    > +        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
    > +            struct ip_header *nh = dp_packet_l3(pkt);
    > +            packet_set_ipv4_addr(pkt, &nh->ip_dst,
    > +                                 conn->rev_key.src.addr.ipv4_aligned);
    > +        } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) {
    > +            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
    > +
    > +            struct in6_addr ipv6_addr;
    > +            memcpy(&ipv6_addr.s6_addr32, conn->rev_key.dst.addr.ipv6.be32,
    > +                   sizeof ipv6_addr);
    > +            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
    > +                                 nh6->ip6_dst.be32,
    > +                                                                &ipv6_addr,
    > +                                                                true);
    > +        }
    > +        if (conn->key.nw_proto == IPPROTO_TCP) {
    > +            struct tcp_header *th = dp_packet_l4(pkt);
    > +            packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
    > +        } else if (conn->key.nw_proto == IPPROTO_UDP) {
    > +            struct udp_header *uh = dp_packet_l4(pkt);
    > +            packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
    > +        }
    > +    }
    > +}
    > +
    > +static void
    > +un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
    > +              uint16_t *state)
    > +{
    > +    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
    > +        *state |= CS_SRC_NAT;
    > +        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
    > +            struct ip_header *nh = dp_packet_l3(pkt);
    > +            packet_set_ipv4_addr(pkt, &nh->ip_dst,
    > +                conn->key.src.addr.ipv4_aligned);
    > +        } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) {
    > +            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
    > +            struct in6_addr ipv6_addr;
    > +            memcpy(&ipv6_addr, conn->key.src.addr.ipv6.be32,
    > +                   sizeof ipv6_addr);
    > +            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
    > +                                 nh6->ip6_dst.be32,
    > +                                                                &ipv6_addr,
    > +                                                                true);
    > +        }
    > +
    > +        if (conn->key.nw_proto == IPPROTO_TCP) {
    > +            struct tcp_header *th = dp_packet_l4(pkt);
    > +            packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
    > +        } else if (conn->key.nw_proto == IPPROTO_UDP) {
    > +            struct udp_header *uh = dp_packet_l4(pkt);
    > +            packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
    > +        }
    > +    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
    > +        *state |= CS_DST_NAT;
    > +        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
    > +            struct ip_header *nh = dp_packet_l3(pkt);
    > +            packet_set_ipv4_addr(pkt, &nh->ip_src,
    > +                conn->key.dst.addr.ipv4_aligned);
    > +        } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) {
    > +            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
    > +            struct in6_addr ipv6_addr;
    > +            memcpy(&ipv6_addr, conn->key.dst.addr.ipv6.be32,
    > +                   sizeof ipv6_addr);
    > +            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
    > +                                 nh6->ip6_src.be32,
    > +                                                                &ipv6_addr,
    > +                                                                true);
    > +        }
    > +
    > +        if (conn->key.nw_proto == IPPROTO_TCP) {
    > +            struct tcp_header *th = dp_packet_l4(pkt);
    > +            packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
    > +        } else if (conn->key.nw_proto == IPPROTO_UDP) {
    > +            struct udp_header *uh = dp_packet_l4(pkt);
    > +            packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
    > +        }
    > +    }
    > +}
    > +
    > +/* Typical usage of this helper is in non per-packet code;
    > + * this is because the bucket lock needs to be held for lookup
    > + * and a hash would have already been needed. Hence, this function
    > + * is just intended for code clarity. */
    > +static struct conn *
    > +conn_lookup(struct conntrack *ct, struct conn_key *key, long long now)
    > +{
    > +    struct conn_lookup_ctx ctx;
    > +    ctx.conn = NULL;
    > +    ctx.key = *key;
    > +    ctx.hash = conn_key_hash(key, ct->hash_basis);
    > +    unsigned bucket = hash_to_bucket(ctx.hash);
    > +    conn_key_lookup(&ct->buckets[bucket], &ctx, now);
    > +    return ctx.conn;
    > +}
    > +
    > +static void
    > +nat_clean(struct conntrack *ct, struct conn *conn,
    > +          struct conntrack_bucket *ctb)
    > +    OVS_REQUIRES(ctb->lock)
    > +{
    > +    long long now = time_msec();
    > +    ct_rwlock_wrlock(&ct->nat_resources_lock);
    > +    nat_conn_keys_remove(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
    > +    ct_rwlock_unlock(&ct->nat_resources_lock);
    > +    ct_lock_unlock(&ctb->lock);
    > +
    > +    uint32_t hash_rev_conn = conn_key_hash(&conn->rev_key, ct->hash_basis);
    > +    unsigned bucket_rev_conn = hash_to_bucket(hash_rev_conn);
    > +
    > +    ct_lock_lock(&ct->buckets[bucket_rev_conn].lock);
    > +    ct_rwlock_wrlock(&ct->nat_resources_lock);
    > +
    > +    struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now);
    > +
    > +       struct nat_conn_key_node *nat_conn_key_node =
    > +        nat_conn_keys_lookup(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
    > +
    > +    /* In the unlikely event, rev conn was recreated, then skip rev_conn cleanup. */
    > +    if ((rev_conn) && (!nat_conn_key_node ||
    > +         memcmp(&nat_conn_key_node->value, &rev_conn->rev_key,
    > +                sizeof nat_conn_key_node->value))) {
    > +        hmap_remove(&ct->buckets[bucket_rev_conn].connections,
    > +                    &rev_conn->node);
    > +        free(rev_conn);
    > +    }
    > +    delete_conn(conn);
    > +
    > +    ct_rwlock_unlock(&ct->nat_resources_lock);
    > +    ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock);
    > +    ct_lock_lock(&ctb->lock);
    > +
    > +}
    > +
    >  static struct conn *
    >  conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
    >                 struct conn_lookup_ctx *ctx, uint16_t *state, bool commit,
    > -               long long now)
    > +               long long now, const struct nat_action_info_t *nat_action_info,
    > +                          struct conn *conn_for_un_nat_copy)
    >  {
    >      unsigned bucket = hash_to_bucket(ctx->hash);
    >      struct conn *nc = NULL;
    > @@ -179,7 +376,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
    >          *state |= CS_INVALID;
    >          return nc;
    >      }
    > -
    >      *state |= CS_NEW;
    >
    >      if (commit) {
    > @@ -193,71 +389,210 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
    >          }
    >
    >          nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
    > +        ctx->conn = nc;
    > +        memcpy(&nc->rev_key, &nc->key, sizeof nc->rev_key);
    > +        conn_key_reverse(&nc->rev_key);
    >
    > -        memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);
    > +        if (nat_action_info && nat_action_info->nat_action & NAT_ACTION) {
    > +            nc->nat_info = xzalloc(sizeof *nat_action_info);
    > +            memcpy(nc->nat_info, nat_action_info, sizeof *nc->nat_info);
    > +            ct_rwlock_wrlock(&ct->nat_resources_lock);
    >
    > -        conn_key_reverse(&nc->rev_key);
    > +            bool nat_res = nat_select_range_tuple(ct, nc, conn_for_un_nat_copy);
    > +
    > +            if (!nat_res) {
    > +                free(nc->nat_info);
    > +                nc->nat_info = NULL;
    > +                free (nc);
    > +                ct_rwlock_unlock(&ct->nat_resources_lock);
    > +                return NULL;
    > +            }
    > +
    > +            if (conn_for_un_nat_copy && nc->conn_type == CT_CONN_TYPE_DEFAULT) {
    > +                *nc = *conn_for_un_nat_copy;
    > +                conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
    > +            }
    > +            ct_rwlock_unlock(&ct->nat_resources_lock);
    > +
    > +            nat_packet(pkt, nc, state);
    > +        }
    >          hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
    >          atomic_count_inc(&ct->n_conn);
    >      }
    > -
    >      return nc;
    >  }
    >
    > -static struct conn *
    > -process_one(struct conntrack *ct, struct dp_packet *pkt,
    > -            struct conn_lookup_ctx *ctx, uint16_t zone,
    > -            bool commit, long long now)
    > +static bool
    > +conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
    > +                  struct conn_lookup_ctx *ctx, uint16_t *state,
    > +                  struct conn **conn, long long now,
    > +                  unsigned bucket)
    > +    OVS_REQUIRES(ct->buckets[bucket].lock)
    >  {
    > -    unsigned bucket = hash_to_bucket(ctx->hash);
    > -    struct conn *conn = ctx->conn;
    > -    uint16_t state = 0;
    > +       bool create_new_conn = false;
    >
    > -    if (conn) {
    > -        if (ctx->related) {
    > -            state |= CS_RELATED;
    > +    if (ctx->related) {
    > +        *state |= CS_RELATED;
    > +        if (ctx->reply) {
    > +            *state |= CS_REPLY_DIR;
    > +        }
    > +    } else {
    > +        enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket],
    > +                                             pkt, ctx->reply, now);
    > +
    > +        switch (res) {
    > +        case CT_UPDATE_VALID:
    > +            *state |= CS_ESTABLISHED;
    >              if (ctx->reply) {
    > -                state |= CS_REPLY_DIR;
    > +                *state |= CS_REPLY_DIR;
    > +            }
    > +            break;
    > +        case CT_UPDATE_INVALID:
    > +            *state |= CS_INVALID;
    > +            break;
    > +        case CT_UPDATE_NEW:
    > +            ovs_list_remove(&(*conn)->exp_node);
    > +            hmap_remove(&ct->buckets[bucket].connections, &(*conn)->node);
    > +            atomic_count_dec(&ct->n_conn);
    > +            if ((*conn)->nat_info) {
    > +                nat_clean(ct, *conn, &ct->buckets[bucket]);
    > +            } else {
    > +                delete_conn(*conn);
    >              }
    > +            create_new_conn = true;
    > +            break;
    > +        default:
    > +            OVS_NOT_REACHED();
    > +        }
    > +    }
    > +    return create_new_conn;
    > +}
    > +
    > +static void
    > +create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy,
    > +                   long long now)
    > +{
    > +        struct conn *nc = xzalloc(sizeof *nc);
    > +        memcpy(nc, conn_for_un_nat_copy, sizeof *nc);
    > +        nc->key = conn_for_un_nat_copy->rev_key;
    > +        nc->rev_key = conn_for_un_nat_copy->key;
    > +        uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis);
    > +        unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash);
    > +        ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock);
    > +        ct_rwlock_rdlock(&ct->nat_resources_lock);
    > +
    > +        struct conn *rev_conn = conn_lookup(ct, &nc->key, now);
    > +
    > +        struct nat_conn_key_node *nat_conn_key_node =
    > +            nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis);
    > +        if (nat_conn_key_node && !memcmp(&nat_conn_key_node->value,
    > +            &nc->rev_key, sizeof nat_conn_key_node->value) && !rev_conn) {
    > +
    > +            hmap_insert(&ct->buckets[un_nat_conn_bucket].connections, &nc->node,
    > +                        un_nat_hash);
    >          } else {
    > -            enum ct_update_res res;
    > +            free(nc);
    > +        }
    > +        ct_rwlock_unlock(&ct->nat_resources_lock);
    > +        ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock);
    > +}
    > +
    > +static void
    > +handle_nat(struct dp_packet *pkt, struct conn *conn,
    > +           uint16_t zone, uint16_t *state, bool reply)
    > +{
    > +    if ((conn->nat_info && conn->nat_info->nat_action & NAT_ACTION) &&
    > +        (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
    > +          (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
    > +           zone != pkt->md.ct_zone))){
    >
    > -            res = conn_update(conn, &ct->buckets[bucket], pkt,
    > -                              ctx->reply, now);
    > +        if(reply) {
    > +            un_nat_packet(pkt, conn, state);
    > +        } else {
    > +            nat_packet(pkt, conn, state);
    > +        }
    > +    }
    > +}
    >
    > -            switch (res) {
    > -            case CT_UPDATE_VALID:
    > -                state |= CS_ESTABLISHED;
    > -                if (ctx->reply) {
    > -                    state |= CS_REPLY_DIR;
    > -                }
    > -                break;
    > -            case CT_UPDATE_INVALID:
    > -                state |= CS_INVALID;
    > -                break;
    > -            case CT_UPDATE_NEW:
    > -                ovs_list_remove(&conn->exp_node);
    > -                hmap_remove(&ct->buckets[bucket].connections, &conn->node);
    > -                atomic_count_dec(&ct->n_conn);
    > -                delete_conn(conn);
    > -                conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
    > -                break;
    > -            default:
    > -                OVS_NOT_REACHED();
    > +static void
    > +process_one(struct conntrack *ct, struct dp_packet *pkt,
    > +            struct conn_lookup_ctx *ctx, uint16_t zone,
    > +            bool commit, long long now, const uint32_t *setmark,
    > +            const struct ovs_key_ct_labels *setlabel,
    > +            const struct nat_action_info_t *nat_action_info)
    > +{
    > +    struct conn *conn;
    > +    uint16_t state = 0;
    > +    unsigned bucket = hash_to_bucket(ctx->hash);
    > +    ct_lock_lock(&ct->buckets[bucket].lock);
    > +    conn_key_lookup(&ct->buckets[bucket], ctx, now);
    > +    conn = ctx->conn;
    > +    struct conn conn_for_un_nat_copy;
    > +    memset(&conn_for_un_nat_copy, 0, sizeof conn_for_un_nat_copy);
    > +
    > +    if (OVS_LIKELY(conn)) {
    > +        if (conn->conn_type == CT_CONN_TYPE_UN_NAT){
    > +            ctx->reply = 1;
    > +
    > +            struct conn_lookup_ctx ctx2;
    > +            ctx2.conn = NULL;
    > +            ctx2.key = conn->rev_key;
    > +            ctx2.hash = conn_key_hash(&conn->rev_key, ct->hash_basis);
    > +
    > +                   ct_lock_unlock(&ct->buckets[bucket].lock);
    > +            bucket = hash_to_bucket(ctx2.hash);
    > +
    > +            ct_lock_lock(&ct->buckets[bucket].lock);
    > +            conn_key_lookup(&ct->buckets[bucket], &ctx2, now);
    > +
    > +            if (ctx2.conn) {
    > +                conn = ctx2.conn;
    > +            } else {
    > +                /* It is a race condition where conn has timed out and removed
    > +                 * between unlock of the rev_conn and lock of the forward conn;
    > +                 * nothing to do. */
    > +                ct_lock_unlock(&ct->buckets[bucket].lock);
    > +                return;
    >              }
    >          }
    > +    }
    > +
    > +    bool create_new_conn = false;
    > +    if (OVS_LIKELY(conn)) {
    > +       create_new_conn = conn_update_state(ct, pkt, ctx, &state, &conn,
    > +                                            now, bucket);
    > +       if (nat_action_info->nat_action && !create_new_conn) {
    > +            handle_nat(pkt, conn, zone, &state, ctx->reply);
    > +       }
    >      } else {
    >          if (ctx->related) {
    >              state |= CS_INVALID;
    >          } else {
    > -            conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
    > +               create_new_conn = true;
    >          }
    >      }
    >
    > +    if (OVS_UNLIKELY(create_new_conn)) {
    > +        conn = conn_not_found(ct, pkt, ctx, &state, commit,
    > +                              now, nat_action_info, &conn_for_un_nat_copy);
    > +    }
    > +
    >      write_ct_md(pkt, state, zone, conn ? conn->mark : 0,
    >                  conn ? conn->label : OVS_U128_ZERO);
    >
    > -    return conn;
    > +    if (conn && setmark) {
    > +        set_mark(pkt, conn, setmark[0], setmark[1]);
    > +    }
    > +
    > +    if (conn && setlabel) {
    > +        set_label(pkt, conn, &setlabel[0], &setlabel[1]);
    > +    }
    > +
    > +    ct_lock_unlock(&ct->buckets[bucket].lock);
    > +
    > +    if (conn_for_un_nat_copy.conn_type == CT_CONN_TYPE_UN_NAT) {
    > +        create_un_nat_conn(ct, &conn_for_un_nat_copy, now);
    > +    }
    >  }
    >
    >  /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
    > @@ -274,7 +609,7 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
    >                    const uint32_t *setmark,
    >                    const struct ovs_key_ct_labels *setlabel,
    >                    const char *helper,
    > -                  const struct nat_action_info_t *nat_action_info OVS_UNUSED)
    > +                  const struct nat_action_info_t *nat_action_info)
    >  {
    >      struct dp_packet **pkts = pkt_batch->packets;
    >      size_t cnt = pkt_batch->count;
    > @@ -324,27 +659,12 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
    >      }
    >
    >      for (i = 0; i < arrcnt; i++) {
    > -        struct conntrack_bucket *ctb = &ct->buckets[arr[i].bucket];
    >          size_t j;
    >
    > -        ct_lock_lock(&ctb->lock);
    > -
    >          ULLONG_FOR_EACH_1(j, arr[i].maps) {
    > -            struct conn *conn;
    > -
    > -            conn_key_lookup(ctb, &ctxs[j], now);
    > -
    > -            conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);
    > -
    > -            if (conn && setmark) {
    > -                set_mark(pkts[j], conn, setmark[0], setmark[1]);
    > -            }
    > -
    > -            if (conn && setlabel) {
    > -                set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);
    > -            }
    > +            process_one(ct, pkts[j], &ctxs[j], zone, commit,
    > +                               now, setmark, setlabel, nat_action_info);
    
    I think it's fine to remove the lock batching.  If we do that, we can
    simplify a lot of the code above that groups based on the bucket.


For those not privy to our offline conversation, the main reason
I wanted to remove the batching is the performance benefit for real
flow-set handling rather than artificial benchmarking.
Code simplification is also nice.
I’ll include the changes with this patchset, then.


    >          }
    > -        ct_lock_unlock(&ctb->lock);
    >      }
    >
    >      return 0;
    > @@ -373,6 +693,7 @@ set_label(struct dp_packet *pkt, struct conn *conn,
    >                                | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
    >      conn->label = pkt->md.ct_label;
    >  }
    > +
    >
    >  /* Delete the expired connections from 'ctb', up to 'limit'. Returns the
    >   * earliest expiration time among the remaining connections in 'ctb'.  Returns
    > @@ -390,20 +711,27 @@ sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
    >
    >      for (i = 0; i < N_CT_TM; i++) {
    >          LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
    > -            if (!conn_expired(conn, now) || count >= limit) {
    > -                min_expiration = MIN(min_expiration, conn->expiration);
    > -                if (count >= limit) {
    > -                    /* Do not check other lists. */
    > -                    COVERAGE_INC(conntrack_long_cleanup);
    > -                    return min_expiration;
    > +            if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
    > +                if (!conn_expired(conn, now) || count >= limit) {
    > +                    min_expiration = MIN(min_expiration, conn->expiration);
    > +                    if (count >= limit) {
    > +                        /* Do not check other lists. */
    > +                        COVERAGE_INC(conntrack_long_cleanup);
    > +                        return min_expiration;
    > +                    }
    > +                    break;
    >                  }
    > -                break;
    > +                ovs_list_remove(&conn->exp_node);
    > +                hmap_remove(&ctb->connections, &conn->node);
    > +                if (conn->nat_info) {
    > +                    nat_clean(ct, conn, ctb);
    > +                } else {
    > +                    delete_conn(conn);
    > +                }
    > +
    > +                atomic_count_dec(&ct->n_conn);
    > +                count++;
    >              }
    > -            ovs_list_remove(&conn->exp_node);
    > -            hmap_remove(&ctb->connections, &conn->node);
    > -            atomic_count_dec(&ct->n_conn);
    > -            delete_conn(conn);
    > -            count++;
    >          }
    >      }
    >
    > @@ -774,7 +1102,6 @@ extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
    >              return false;
    >          }
    >
    > -        /* pf doesn't do this, but it seems a good idea */
    >          if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
    >              || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
    >              return false;
    > @@ -998,7 +1325,6 @@ conn_key_hash(const struct conn_key *key, uint32_t basis)
    >
    >      hsrc = hdst = basis;
    >
    > -    /* Hash the source and destination tuple */
    >      for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
    >          hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
    >          hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
    > @@ -1025,6 +1351,247 @@ conn_key_reverse(struct conn_key *key)
    >      key->dst = tmp;
    >  }
    >
    > +static uint32_t
    > +nat_ipv6_addrs_delta(struct in6_addr *ipv6_aligned_min, struct in6_addr *ipv6_aligned_max)
    > +{
    > +    uint64_t diff = 0;
    > +    ovs_u128 addr6_128_min;
    > +    memcpy(&addr6_128_min.u64.hi, &ipv6_aligned_min->s6_addr[0],
    > +           sizeof addr6_128_min.u64.hi);
    > +    memcpy(&addr6_128_min.u64.lo, &ipv6_aligned_min->s6_addr[8],
    > +           sizeof addr6_128_min.u64.lo);
    > +    ovs_u128 addr6_128_max;
    > +    memcpy(&addr6_128_max.u64.hi, &ipv6_aligned_max->s6_addr[0],
    > +           sizeof addr6_128_max.u64.hi);
    > +    memcpy(&addr6_128_max.u64.lo, &ipv6_aligned_max->s6_addr[8],
    > +           sizeof addr6_128_max.u64.lo);
    > +
    > +    if ((ntohll(addr6_128_min.u64.hi) == ntohll(addr6_128_max.u64.hi)) &&
    > +        (ntohll(addr6_128_min.u64.lo) <= ntohll(addr6_128_max.u64.lo))){
    > +        diff = ntohll(addr6_128_max.u64.lo) - ntohll(addr6_128_min.u64.lo);
    > +       } else if ((ntohll(addr6_128_min.u64.hi) + 1 == ntohll(addr6_128_max.u64.hi)) &&
    > +               (ntohll(addr6_128_min.u64.lo) > ntohll(addr6_128_max.u64.lo))) {
    > +        diff = UINT64_MAX - (ntohll(addr6_128_min.u64.lo) -
    > +                   ntohll(addr6_128_max.u64.lo) - 1);
    > +       } else {
    > +        /* Limit address delta supported to 32 bits or 4 billion approximately.
    > +         * Possibly, this should be visible to the user through a datapath
    > +         * support check, however the practical impact is probably nil. */
    > +        diff = 0xfffffffe;
    > +       }
    > +    if (diff > 0xfffffffe) {
    > +        diff = 0xfffffffe;
    > +    }
    > +    return (uint32_t)diff;
    > +}
    > +
    > +/* This function must be used in tandem with nat_ipv6_addrs_delta(), which
    > + * restricts the input parameters. */
    > +static void
    > +nat_ipv6_addr_increment(struct in6_addr *ipv6_aligned, uint32_t increment)
    > +{
    > +    ovs_u128 addr6_128;
    > +    memcpy(&addr6_128.u64.hi, &ipv6_aligned->s6_addr[0], sizeof addr6_128.u64.hi);
    > +    memcpy(&addr6_128.u64.lo, &ipv6_aligned->s6_addr[8], sizeof addr6_128.u64.lo);
    > +
    > +    if (UINT64_MAX - increment >= ntohll(addr6_128.u64.lo)) {
    > +        addr6_128.u64.lo = htonll(increment + ntohll(addr6_128.u64.lo));
    > +    } else if (addr6_128.u64.hi != UINT64_MAX) {
    > +        addr6_128.u64.hi = htonll(1 + ntohll(addr6_128.u64.hi));
    > +        addr6_128.u64.lo =
    > +            htonll(increment - (UINT64_MAX -
    > +                   ntohll(addr6_128.u64.lo) + 1));
    > +    } else {
    > +        OVS_NOT_REACHED();
    > +    }
    > +
    > +    memcpy(&ipv6_aligned->s6_addr[0], &addr6_128.u64.hi, sizeof addr6_128.u64.hi);
    > +    memcpy(&ipv6_aligned->s6_addr[8], &addr6_128.u64.lo, sizeof addr6_128.u64.lo);
    > +
    > +       return;
    > +}
    > +
    > +static uint32_t
    > +nat_range_hash(const struct conn *conn, uint32_t basis)
    > +{
    > +    uint32_t hash = basis;
    > +    int i;
    > +    uint32_t port;
    > +
    > +    for (i = 0;
    > +         i < sizeof(conn->nat_info->min_addr) / sizeof(uint32_t);
    > +         i++) {
    > +        hash = hash_add(hash, ((uint32_t *) &conn->nat_info->min_addr)[i]);
    > +        hash = hash_add(hash, ((uint32_t *) &conn->nat_info->max_addr)[i]);
    > +    }
    > +
    > +    memcpy(&port, &conn->nat_info->min_port, sizeof port);
    > +    hash = hash_add(hash, port);
    > +
    > +    uint32_t dl_type_for_hash = (uint32_t) conn->key.dl_type;
    > +    hash = hash_add(hash,  dl_type_for_hash);
    > +    uint32_t nw_proto_for_hash = (uint32_t) conn->key.nw_proto;
    > +    hash = hash_add(hash,  nw_proto_for_hash);
    > +    uint32_t zone_for_hash = (uint32_t) conn->key.zone;
    > +    hash = hash_add(hash,  zone_for_hash);
    > +    return hash;
    > +}
    > +
    > +static bool
    > +nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
    > +                              struct conn *nat_conn)
    > +{
    > +#define MIN_NAT_EPHEMERAL_PORT 1024
    > +#define MAX_NAT_EPHEMERAL_PORT 65535
    > +
    > +    uint16_t min_port;
    > +    uint16_t max_port;
    > +    uint16_t first_port;
    > +
    > +    uint32_t hash = nat_range_hash(conn, ct->hash_basis);
    > +
    > +    if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
    > +        (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
    > +        min_port = ntohs(conn->key.src.port);
    > +        max_port = ntohs(conn->key.src.port);
    > +        first_port = min_port;
    > +    } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
    > +               (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
    > +        min_port = ntohs(conn->key.dst.port);
    > +        max_port = ntohs(conn->key.dst.port);
    > +        first_port = min_port;
    > +    } else {
    > +        uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
    > +        uint32_t port_index = hash % (deltap + 1);
    > +        first_port = conn->nat_info->min_port + port_index;
    > +        min_port = conn->nat_info->min_port;
    > +        max_port = conn->nat_info->max_port;
    > +    }
    > +
    > +    uint32_t deltaa = 0;
    > +    uint32_t address_index;
    > +    struct ct_addr ct_addr;
    > +    memset(&ct_addr, 0, sizeof ct_addr);
    > +    struct ct_addr max_ct_addr;
    > +    memset(&max_ct_addr, 0, sizeof max_ct_addr);
    > +    max_ct_addr = conn->nat_info->max_addr;
    > +
    > +    if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
    > +        deltaa = ntohl(conn->nat_info->max_addr.ipv4_aligned) -
    > +                 ntohl(conn->nat_info->min_addr.ipv4_aligned);
    > +        address_index = hash % (deltaa + 1);
    > +        ct_addr.ipv4_aligned = htonl(ntohl(conn->nat_info->min_addr.ipv4_aligned) +
    > +                                           address_index);
    > +    } else {
    > +        deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6_aligned,
    > +                                      &conn->nat_info->max_addr.ipv6_aligned);
    > +        /* deltaa must be within 32 bits for full hash coverage. A 64 or 128 bit hash is
    > +         * unnecessary and hence not used here. Most code is kept common with V4;
    > +         * nat_ipv6_addrs_delta() will do the enforcement via max_ct_addr.  */
    > +        max_ct_addr = conn->nat_info->min_addr;
    > +        nat_ipv6_addr_increment(&max_ct_addr.ipv6_aligned, deltaa);
    > +
    > +        address_index = hash % (deltaa + 1);
    > +        ct_addr.ipv6_aligned = conn->nat_info->min_addr.ipv6_aligned;
    > +        nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, address_index);
    > +    }
    > +
    > +    uint16_t port = first_port;
    > +    bool all_ports_tried = false;
    > +    bool original_ports_tried = false;
    > +    struct ct_addr first_addr = ct_addr;
    > +    *nat_conn = *conn;
    > +    while (true) {
    > +        if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
    > +            nat_conn->rev_key.dst.addr = ct_addr;
    > +            nat_conn->rev_key.dst.port = htons(port);
    > +        } else {
    > +            nat_conn->rev_key.src.addr = ct_addr;
    > +            nat_conn->rev_key.src.port = htons(port);
    > +        }
    > +
    > +        struct nat_conn_key_node *nat_conn_key_node =
    > +            nat_conn_keys_lookup(&ct->nat_conn_keys, &nat_conn->rev_key,
    > +                                 ct->hash_basis);
    > +
    > +        if (!nat_conn_key_node) {
    > +            struct nat_conn_key_node *nat_conn_key = xzalloc(sizeof *nat_conn_key);
    > +            memcpy(&nat_conn_key->key, &nat_conn->rev_key, sizeof nat_conn_key->key);
    > +            memcpy(&nat_conn_key->value, &nat_conn->key, sizeof nat_conn_key->value);
    > +            uint32_t nat_conn_key_hash = conn_key_hash(&nat_conn_key->key, ct->hash_basis);
    > +            hmap_insert(&ct->nat_conn_keys, &nat_conn_key->node, nat_conn_key_hash);
    > +            return true;
    > +        } else if (!all_ports_tried) {
    > +            if (min_port == max_port) {
    > +                all_ports_tried = true;
    > +            } else if(port == max_port) {
    > +                port = min_port;
    > +            } else {
    > +                port++;
    > +            }
    > +            if (port == first_port) {
    > +                all_ports_tried = true;
    > +            }
    > +           } else {
    > +            if(memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
    > +                if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
    > +                    ct_addr.ipv4_aligned = htonl(ntohl(ct_addr.ipv4_aligned) + 1);
    > +                } else {
    > +                    nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, 1);
    > +                }
    > +            } else {
    > +                ct_addr = conn->nat_info->min_addr;
    > +            }
    > +            if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
    > +                if (!original_ports_tried) {
    > +                    original_ports_tried = true;
    > +                    ct_addr = conn->nat_info->min_addr;
    > +                    min_port = MIN_NAT_EPHEMERAL_PORT;
    > +                    max_port = MAX_NAT_EPHEMERAL_PORT;
    > +                } else {
    > +                    break;
    > +                }
    > +            }
    > +            first_port = min_port;
    > +            port = first_port;
    > +            all_ports_tried = false;
    > +           }
    > +       }
    > +    return false;
    > +}
    > +
    > +static struct nat_conn_key_node *
    > +nat_conn_keys_lookup(struct hmap *nat_conn_keys,
    > +                     const struct conn_key *key,
    > +                     uint32_t basis)
    > +{
    > +    struct nat_conn_key_node *nat_conn_key_node;
    > +    uint32_t nat_conn_key_hash = conn_key_hash(key, basis);
    > +
    > +    HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash, nat_conn_keys) {
    > +        if (!memcmp(&nat_conn_key_node->key, key, sizeof nat_conn_key_node->key)) {
    > +            return nat_conn_key_node;
    > +        }
    > +    }
    > +    return NULL;
    > +}
    > +
    > +static void
    > +nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,
    > +                     uint32_t basis)
    > +{
    > +    struct nat_conn_key_node *nat_conn_key_node;
    > +    uint32_t nat_conn_key_hash = conn_key_hash(key, basis);
    > +
    > +    HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash, nat_conn_keys) {
    > +        if (!memcmp(&nat_conn_key_node->key, key, sizeof nat_conn_key_node->key)) {
    > +            hmap_remove(nat_conn_keys, &nat_conn_key_node->node);
    > +            free(nat_conn_key_node);
    > +            return;
    > +        }
    > +    }
    > +}
    > +
    >  static void
    >  conn_key_lookup(struct conntrack_bucket *ctb,
    >                  struct conn_lookup_ctx *ctx,
    > @@ -1036,13 +1603,13 @@ conn_key_lookup(struct conntrack_bucket *ctb,
    >      ctx->conn = NULL;
    >
    >      HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
    > -        if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))
    > +        if (!memcmp(&conn->key, &ctx->key, sizeof conn->key)
    >                  && !conn_expired(conn, now)) {
    >              ctx->conn = conn;
    >              ctx->reply = false;
    >              break;
    >          }
    > -        if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))
    > +        if (!memcmp(&conn->rev_key, &ctx->key, sizeof conn->rev_key)
    >                  && !conn_expired(conn, now)) {
    >              ctx->conn = conn;
    >              ctx->reply = true;
    > @@ -1062,7 +1629,10 @@ conn_update(struct conn *conn, struct conntrack_bucket *ctb,
    >  static bool
    >  conn_expired(struct conn *conn, long long now)
    >  {
    > -    return now >= conn->expiration;
    > +       if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
    > +        return now >= conn->expiration;
    > +       }
    > +       return false;
    >  }
    >
    >  static bool
    > @@ -1089,6 +1659,7 @@ new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
    >  static void
    >  delete_conn(struct conn *conn)
    >  {
    > +       free(conn->nat_info);
    >      free(conn);
    >  }
    >
    > @@ -1141,7 +1712,7 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
    >      entry->zone = conn->key.zone;
    >      entry->mark = conn->mark;
    >
    > -    memcpy(&entry->labels, &conn->label, sizeof(entry->labels));
    > +    memcpy(&entry->labels, &conn->label, sizeof entry->labels);
    >      /* Not implemented yet */
    >      entry->timestamp.start = 0;
    >      entry->timestamp.stop = 0;
    > @@ -1188,7 +1759,8 @@ conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
    >                  break;
    >              }
    >              INIT_CONTAINER(conn, node, node);
    > -            if (!dump->filter_zone || conn->key.zone == dump->zone) {
    > +            if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
    > +                 (conn->conn_type != CT_CONN_TYPE_UN_NAT)){
    >                  conn_to_ct_dpif_entry(conn, entry, now);
    >                  break;
    >              }
    > @@ -1223,15 +1795,19 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
    >
    >          ct_lock_lock(&ct->buckets[i].lock);
    >          HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) {
    > -            if (!zone || *zone == conn->key.zone) {
    > +            if ((!zone || *zone == conn->key.zone) &&
    > +                (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
    >                  ovs_list_remove(&conn->exp_node);
    >                  hmap_remove(&ct->buckets[i].connections, &conn->node);
    >                  atomic_count_dec(&ct->n_conn);
    > -                delete_conn(conn);
    > +                if (conn->nat_info) {
    > +                    nat_clean(ct, conn, &ct->buckets[i]);
    > +                } else {
    > +                    delete_conn(conn);
    > +                }
    >              }
    >          }
    >          ct_lock_unlock(&ct->buckets[i].lock);
    >      }
    > -
    >      return 0;
    >  }
    > diff --git a/lib/conntrack.h b/lib/conntrack.h
    > index 471e529..5174123 100644
    > --- a/lib/conntrack.h
    > +++ b/lib/conntrack.h
    > @@ -26,6 +26,7 @@
    >  #include "openvswitch/thread.h"
    >  #include "openvswitch/types.h"
    >  #include "ovs-atomic.h"
    > +#include "ovs-thread.h"
    >  #include "packets.h"
    >
    >  /* Userspace connection tracker
    > @@ -71,6 +72,7 @@ struct ct_addr {
    >      };
    >  };
    >
    > +/* Both NAT_ACTION_* and NAT_ACTION_*_PORT can be set. */
    >  enum nat_action_e {
    >         NAT_ACTION = 1 << 0,
    >         NAT_ACTION_SRC = 1 << 1,
    > @@ -121,6 +123,10 @@ struct OVS_LOCKABLE ct_lock {
    >      struct ovs_mutex lock;
    >  };
    >
    > +struct OVS_LOCKABLE ct_rwlock {
    > +    struct ovs_rwlock lock;
    > +};
    > +
    >  static inline void ct_lock_init(struct ct_lock *lock)
    >  {
    >      ovs_mutex_init_adaptive(&lock->lock);
    > @@ -144,6 +150,38 @@ static inline void ct_lock_destroy(struct ct_lock *lock)
    >  {
    >      ovs_mutex_destroy(&lock->lock);
    >  }
    > +
    > +static inline void ct_rwlock_init(struct ct_rwlock *lock)
    > +{
    > +    ovs_rwlock_init(&lock->lock);
    > +}
    > +
    > +static inline void ct_rwlock_wrlock(struct ct_rwlock *lock)
    > +    OVS_ACQUIRES(lock)
    > +    OVS_NO_THREAD_SAFETY_ANALYSIS
    > +{
    > +       ovs_rwlock_wrlock(&lock->lock);
    > +}
    > +
    > +static inline void ct_rwlock_rdlock(struct ct_rwlock *lock)
    > +    OVS_ACQUIRES(lock)
    > +    OVS_NO_THREAD_SAFETY_ANALYSIS
    > +{
    > +       ovs_rwlock_rdlock(&lock->lock);
    > +}
    > +
    > +static inline void ct_rwlock_unlock(struct ct_rwlock *lock)
    > +    OVS_RELEASES(lock)
    > +    OVS_NO_THREAD_SAFETY_ANALYSIS
    > +{
    > +       ovs_rwlock_unlock(&lock->lock);
    > +}
    > +
    > +static inline void ct_rwlock_destroy(struct ct_rwlock *lock)
    > +{
    > +       ovs_rwlock_destroy(&lock->lock);
    > +}
    > +
    >
    >  /* Timeouts: all the possible timeout states passed to update_expiration()
    >   * are listed here. The name will be prefix by CT_TM_ and the value is in
    > @@ -226,6 +264,12 @@ struct conntrack {
    >      /* Connections limit. When this limit is reached, no new connection
    >       * will be accepted. */
    >      atomic_uint n_conn_limit;
    > +
    > +    /* The following resources are referenced during nat connection
    > +     * creation and deletion. */
    > +    struct hmap nat_conn_keys OVS_GUARDED;
    > +    struct ct_rwlock nat_resources_lock;
    > +
    
    Could you add a comment about the nesting of nat_resources_lock?

done
    
    >  };
    >
    >  #endif /* conntrack.h */
    > --
    > 1.9.1
    >
    > _______________________________________________
    > dev mailing list
    > dev at openvswitch.org
    > https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev&d=DwICAg&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=SzKAcsNrQCz7dgTv0a20z7sYOpBbGVyBNClNgUM3Cfg&s=_oJLYjfH0_ivCkT3IbFmPNJMMW8C9SN7-mF4wWFv-tI&e= 
    
    _______________________________________________
    dev mailing list
    dev at openvswitch.org
    https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev&d=DwICAg&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=SzKAcsNrQCz7dgTv0a20z7sYOpBbGVyBNClNgUM3Cfg&s=_oJLYjfH0_ivCkT3IbFmPNJMMW8C9SN7-mF4wWFv-tI&e= 
    



More information about the dev mailing list