[ovs-dev] [patch_v4 3/6] Userspace Datapath: Introduce NAT support.
Daniele Di Proietto
diproiettod at ovn.org
Sat Jan 28 01:57:29 UTC 2017
2017-01-24 20:40 GMT-08:00 Darrell Ball <dlu998 at gmail.com>:
> This patch introduces NAT support for the userspace datapath.
> The conntrack module changes are in this patch.
>
> The per packet scope of lookups for NAT and un_NAT is at
> the bucket level rather than global. One hash table is
> introduced to support create/delete handling. The create/delete
> events may be further optimized, if the need becomes clear.
>
> Some NAT options with limited utility (persistent, random) are
> not supported yet, but will be supported in a later patch.
>
> Signed-off-by: Darrell Ball <dlu998 at gmail.com>
Sparse reports some problems:
../lib/conntrack.c:1375:16: warning: constant 0xffffffffffffffff is so
big it is unsigned long
../lib/conntrack.c:1398:9: warning: constant 0xffffffffffffffff is so
big it is unsigned long
../lib/conntrack.c:1400:36: warning: constant 0xffffffffffffffff is so
big it is unsigned long
../lib/conntrack.c:1403:33: warning: constant 0xffffffffffffffff is so
big it is unsigned long
../lib/conntrack.c:214:30: error: no member 's6_addr32' in struct in6_addr
../lib/conntrack.c:240:30: error: no member 's6_addr32' in struct in6_addr
../lib/conntrack.c:1360:52: error: no member 's6_addr32' in struct in6_addr
../lib/conntrack.c:1362:52: error: no member 's6_addr32' in struct in6_addr
../lib/conntrack.c:1365:52: error: no member 's6_addr32' in struct in6_addr
../lib/conntrack.c:1367:52: error: no member 's6_addr32' in struct in6_addr
../lib/conntrack.c:1395:44: error: no member 's6_addr32' in struct in6_addr
../lib/conntrack.c:1396:44: error: no member 's6_addr32' in struct in6_addr
../lib/conntrack.c:1409:25: error: no member 's6_addr32' in struct in6_addr
../lib/conntrack.c:1410:25: error: no member 's6_addr32' in struct in6_addr
There are some minor coding style problems; you can find them with
utilities/checkpatch.py.
> ---
> lib/conntrack-private.h | 16 +-
> lib/conntrack.c | 740 ++++++++++++++++++++++++++++++++++++++++++------
> lib/conntrack.h | 44 +++
> 3 files changed, 717 insertions(+), 83 deletions(-)
>
> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
> index 493865f..b71af37 100644
> --- a/lib/conntrack-private.h
> +++ b/lib/conntrack-private.h
> @@ -51,14 +51,23 @@ struct conn_key {
> uint16_t zone;
> };
>
> +struct nat_conn_key_node {
> + struct hmap_node node;
> + struct conn_key key;
> + struct conn_key value;
> +};
> +
> struct conn {
> struct conn_key key;
> struct conn_key rev_key;
> long long expiration;
> struct ovs_list exp_node;
> struct hmap_node node;
> - uint32_t mark;
> ovs_u128 label;
> + /* XXX: consider flattening. */
> + struct nat_action_info_t *nat_info;
> + uint32_t mark;
> + uint8_t conn_type;
> };
>
> enum ct_update_res {
> @@ -67,6 +76,11 @@ enum ct_update_res {
> CT_UPDATE_NEW,
> };
>
> +enum ct_conn_type {
> + CT_CONN_TYPE_DEFAULT,
> + CT_CONN_TYPE_UN_NAT,
> +};
> +
> struct ct_l4_proto {
> struct conn *(*new_conn)(struct conntrack_bucket *, struct dp_packet *pkt,
> long long now);
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 0a611a2..34728a6 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -76,6 +76,20 @@ static void set_label(struct dp_packet *, struct conn *,
> const struct ovs_key_ct_labels *mask);
> static void *clean_thread_main(void *f_);
>
> +static struct nat_conn_key_node *
> +nat_conn_keys_lookup(struct hmap *nat_conn_keys,
> + const struct conn_key *key,
> + uint32_t basis);
> +
> +static void
> +nat_conn_keys_remove(struct hmap *nat_conn_keys,
> + const struct conn_key *key,
> + uint32_t basis);
> +
> +static bool
> +nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
> + struct conn *nat_conn);
> +
> static struct ct_l4_proto *l4_protos[] = {
> [IPPROTO_TCP] = &ct_proto_tcp,
> [IPPROTO_UDP] = &ct_proto_other,
> @@ -90,9 +104,11 @@ long long ct_timeout_val[] = {
> };
>
> /* If the total number of connections goes above this value, no new connections
> - * are accepted */
> + * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
> #define DEFAULT_N_CONN_LIMIT 3000000
>
> +#define DT
> +
I guess this was accidentally left here from debugging and should be removed.
> /* Initializes the connection tracker 'ct'. The caller is responsible for
> * calling 'conntrack_destroy()', when the instance is not needed anymore */
> void
> @@ -101,6 +117,11 @@ conntrack_init(struct conntrack *ct)
> unsigned i, j;
> long long now = time_msec();
>
> + ct_rwlock_init(&ct->nat_resources_lock);
> + ct_rwlock_wrlock(&ct->nat_resources_lock);
> + hmap_init(&ct->nat_conn_keys);
> + ct_rwlock_unlock(&ct->nat_resources_lock);
> +
> for (i = 0; i < CONNTRACK_BUCKETS; i++) {
> struct conntrack_bucket *ctb = &ct->buckets[i];
>
> @@ -139,13 +160,24 @@ conntrack_destroy(struct conntrack *ct)
> ovs_mutex_destroy(&ctb->cleanup_mutex);
> ct_lock_lock(&ctb->lock);
> HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
> - atomic_count_dec(&ct->n_conn);
> + if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
> + atomic_count_dec(&ct->n_conn);
> + }
> delete_conn(conn);
> }
> hmap_destroy(&ctb->connections);
> ct_lock_unlock(&ctb->lock);
> ct_lock_destroy(&ctb->lock);
> }
> + ct_rwlock_wrlock(&ct->nat_resources_lock);
> + struct nat_conn_key_node *nat_conn_key_node;
> + HMAP_FOR_EACH_POP(nat_conn_key_node, node, &ct->nat_conn_keys) {
> + free(nat_conn_key_node);
> + }
> + hmap_destroy(&ct->nat_conn_keys);
> + ct_rwlock_unlock(&ct->nat_resources_lock);
> + ct_rwlock_destroy(&ct->nat_resources_lock);
> +
> }
>
> static unsigned hash_to_bucket(uint32_t hash)
> @@ -167,10 +199,175 @@ write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,
> pkt->md.ct_label = label;
> }
>
> +static void
> +nat_packet(struct dp_packet *pkt, const struct conn *conn, uint16_t *state)
> +{
> + if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
> + *state |= CS_SRC_NAT;
> + if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
> + struct ip_header *nh = dp_packet_l3(pkt);
> + packet_set_ipv4_addr(pkt, &nh->ip_src,
> + conn->rev_key.dst.addr.ipv4_aligned);
> + } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) {
> + struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
> + struct in6_addr ipv6_addr;
> + memcpy(&ipv6_addr.s6_addr32, conn->rev_key.dst.addr.ipv6.be32,
> + sizeof ipv6_addr);
> + packet_set_ipv6_addr(pkt, conn->key.nw_proto,
> + nh6->ip6_src.be32,
> + &ipv6_addr,
> + true);
> + }
> +
> + if (conn->key.nw_proto == IPPROTO_TCP) {
> + struct tcp_header *th = dp_packet_l4(pkt);
> + packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
> + } else if (conn->key.nw_proto == IPPROTO_UDP) {
> + struct udp_header *uh = dp_packet_l4(pkt);
> + packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
> + }
> + } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
> + *state |= CS_DST_NAT;
> +
> + if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
> + struct ip_header *nh = dp_packet_l3(pkt);
> + packet_set_ipv4_addr(pkt, &nh->ip_dst,
> + conn->rev_key.src.addr.ipv4_aligned);
> + } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) {
> + struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
> +
> + struct in6_addr ipv6_addr;
> + memcpy(&ipv6_addr.s6_addr32, conn->rev_key.dst.addr.ipv6.be32,
> + sizeof ipv6_addr);
> + packet_set_ipv6_addr(pkt, conn->key.nw_proto,
> + nh6->ip6_dst.be32,
> + &ipv6_addr,
> + true);
> + }
> + if (conn->key.nw_proto == IPPROTO_TCP) {
> + struct tcp_header *th = dp_packet_l4(pkt);
> + packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
> + } else if (conn->key.nw_proto == IPPROTO_UDP) {
> + struct udp_header *uh = dp_packet_l4(pkt);
> + packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
> + }
> + }
> +}
> +
> +static void
> +un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
> + uint16_t *state)
> +{
> + if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
> + *state |= CS_SRC_NAT;
> + if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
> + struct ip_header *nh = dp_packet_l3(pkt);
> + packet_set_ipv4_addr(pkt, &nh->ip_dst,
> + conn->key.src.addr.ipv4_aligned);
> + } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) {
> + struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
> + struct in6_addr ipv6_addr;
> + memcpy(&ipv6_addr, conn->key.src.addr.ipv6.be32,
> + sizeof ipv6_addr);
> + packet_set_ipv6_addr(pkt, conn->key.nw_proto,
> + nh6->ip6_dst.be32,
> + &ipv6_addr,
> + true);
> + }
> +
> + if (conn->key.nw_proto == IPPROTO_TCP) {
> + struct tcp_header *th = dp_packet_l4(pkt);
> + packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
> + } else if (conn->key.nw_proto == IPPROTO_UDP) {
> + struct udp_header *uh = dp_packet_l4(pkt);
> + packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
> + }
> + } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
> + *state |= CS_DST_NAT;
> + if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
> + struct ip_header *nh = dp_packet_l3(pkt);
> + packet_set_ipv4_addr(pkt, &nh->ip_src,
> + conn->key.dst.addr.ipv4_aligned);
> + } else if (conn->key.dl_type == htons(ETH_TYPE_IPV6)) {
> + struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
> + struct in6_addr ipv6_addr;
> + memcpy(&ipv6_addr, conn->key.dst.addr.ipv6.be32,
> + sizeof ipv6_addr);
> + packet_set_ipv6_addr(pkt, conn->key.nw_proto,
> + nh6->ip6_src.be32,
> + &ipv6_addr,
> + true);
> + }
> +
> + if (conn->key.nw_proto == IPPROTO_TCP) {
> + struct tcp_header *th = dp_packet_l4(pkt);
> + packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
> + } else if (conn->key.nw_proto == IPPROTO_UDP) {
> + struct udp_header *uh = dp_packet_l4(pkt);
> + packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
> + }
> + }
> +}
> +
> +/* Typical usage of this helper is in non per-packet code;
> + * this is because the bucket lock needs to be held for lookup
> + * and a hash would have already been needed. Hence, this function
> + * is just intended for code clarity. */
> +static struct conn *
> +conn_lookup(struct conntrack *ct, struct conn_key *key, long long now)
> +{
> + struct conn_lookup_ctx ctx;
> + ctx.conn = NULL;
> + ctx.key = *key;
> + ctx.hash = conn_key_hash(key, ct->hash_basis);
> + unsigned bucket = hash_to_bucket(ctx.hash);
> + conn_key_lookup(&ct->buckets[bucket], &ctx, now);
> + return ctx.conn;
> +}
> +
> +static void
> +nat_clean(struct conntrack *ct, struct conn *conn,
> + struct conntrack_bucket *ctb)
> + OVS_REQUIRES(ctb->lock)
> +{
> + long long now = time_msec();
> + ct_rwlock_wrlock(&ct->nat_resources_lock);
> + nat_conn_keys_remove(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
> + ct_rwlock_unlock(&ct->nat_resources_lock);
> + ct_lock_unlock(&ctb->lock);
> +
> + uint32_t hash_rev_conn = conn_key_hash(&conn->rev_key, ct->hash_basis);
> + unsigned bucket_rev_conn = hash_to_bucket(hash_rev_conn);
> +
> + ct_lock_lock(&ct->buckets[bucket_rev_conn].lock);
> + ct_rwlock_wrlock(&ct->nat_resources_lock);
> +
> + struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now);
> +
> + struct nat_conn_key_node *nat_conn_key_node =
> + nat_conn_keys_lookup(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
> +
> + /* In the unlikely event, rev conn was recreated, then skip rev_conn cleanup. */
> + if ((rev_conn) && (!nat_conn_key_node ||
> + memcmp(&nat_conn_key_node->value, &rev_conn->rev_key,
> + sizeof nat_conn_key_node->value))) {
> + hmap_remove(&ct->buckets[bucket_rev_conn].connections,
> + &rev_conn->node);
> + free(rev_conn);
> + }
> + delete_conn(conn);
> +
> + ct_rwlock_unlock(&ct->nat_resources_lock);
> + ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock);
> + ct_lock_lock(&ctb->lock);
> +
> +}
> +
> static struct conn *
> conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
> struct conn_lookup_ctx *ctx, uint16_t *state, bool commit,
> - long long now)
> + long long now, const struct nat_action_info_t *nat_action_info,
> + struct conn *conn_for_un_nat_copy)
> {
> unsigned bucket = hash_to_bucket(ctx->hash);
> struct conn *nc = NULL;
> @@ -179,7 +376,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
> *state |= CS_INVALID;
> return nc;
> }
> -
> *state |= CS_NEW;
>
> if (commit) {
> @@ -193,71 +389,210 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
> }
>
> nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
> + ctx->conn = nc;
> + memcpy(&nc->rev_key, &nc->key, sizeof nc->rev_key);
> + conn_key_reverse(&nc->rev_key);
>
> - memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);
> + if (nat_action_info && nat_action_info->nat_action & NAT_ACTION) {
> + nc->nat_info = xzalloc(sizeof *nat_action_info);
> + memcpy(nc->nat_info, nat_action_info, sizeof *nc->nat_info);
> + ct_rwlock_wrlock(&ct->nat_resources_lock);
>
> - conn_key_reverse(&nc->rev_key);
> + bool nat_res = nat_select_range_tuple(ct, nc, conn_for_un_nat_copy);
> +
> + if (!nat_res) {
> + free(nc->nat_info);
> + nc->nat_info = NULL;
> + free (nc);
> + ct_rwlock_unlock(&ct->nat_resources_lock);
> + return NULL;
> + }
> +
> + if (conn_for_un_nat_copy && nc->conn_type == CT_CONN_TYPE_DEFAULT) {
> + *nc = *conn_for_un_nat_copy;
> + conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
> + }
> + ct_rwlock_unlock(&ct->nat_resources_lock);
> +
> + nat_packet(pkt, nc, state);
> + }
> hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
> atomic_count_inc(&ct->n_conn);
> }
> -
> return nc;
> }
>
> -static struct conn *
> -process_one(struct conntrack *ct, struct dp_packet *pkt,
> - struct conn_lookup_ctx *ctx, uint16_t zone,
> - bool commit, long long now)
> +static bool
> +conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
> + struct conn_lookup_ctx *ctx, uint16_t *state,
> + struct conn **conn, long long now,
> + unsigned bucket)
> + OVS_REQUIRES(ct->buckets[bucket].lock)
> {
> - unsigned bucket = hash_to_bucket(ctx->hash);
> - struct conn *conn = ctx->conn;
> - uint16_t state = 0;
> + bool create_new_conn = false;
>
> - if (conn) {
> - if (ctx->related) {
> - state |= CS_RELATED;
> + if (ctx->related) {
> + *state |= CS_RELATED;
> + if (ctx->reply) {
> + *state |= CS_REPLY_DIR;
> + }
> + } else {
> + enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket],
> + pkt, ctx->reply, now);
> +
> + switch (res) {
> + case CT_UPDATE_VALID:
> + *state |= CS_ESTABLISHED;
> if (ctx->reply) {
> - state |= CS_REPLY_DIR;
> + *state |= CS_REPLY_DIR;
> + }
> + break;
> + case CT_UPDATE_INVALID:
> + *state |= CS_INVALID;
> + break;
> + case CT_UPDATE_NEW:
> + ovs_list_remove(&(*conn)->exp_node);
> + hmap_remove(&ct->buckets[bucket].connections, &(*conn)->node);
> + atomic_count_dec(&ct->n_conn);
> + if ((*conn)->nat_info) {
> + nat_clean(ct, *conn, &ct->buckets[bucket]);
> + } else {
> + delete_conn(*conn);
> }
> + create_new_conn = true;
> + break;
> + default:
> + OVS_NOT_REACHED();
> + }
> + }
> + return create_new_conn;
> +}
> +
> +static void
> +create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy,
> + long long now)
> +{
> + struct conn *nc = xzalloc(sizeof *nc);
> + memcpy(nc, conn_for_un_nat_copy, sizeof *nc);
> + nc->key = conn_for_un_nat_copy->rev_key;
> + nc->rev_key = conn_for_un_nat_copy->key;
> + uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis);
> + unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash);
> + ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock);
> + ct_rwlock_rdlock(&ct->nat_resources_lock);
> +
> + struct conn *rev_conn = conn_lookup(ct, &nc->key, now);
> +
> + struct nat_conn_key_node *nat_conn_key_node =
> + nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis);
> + if (nat_conn_key_node && !memcmp(&nat_conn_key_node->value,
> + &nc->rev_key, sizeof nat_conn_key_node->value) && !rev_conn) {
> +
> + hmap_insert(&ct->buckets[un_nat_conn_bucket].connections, &nc->node,
> + un_nat_hash);
> } else {
> - enum ct_update_res res;
> + free(nc);
> + }
> + ct_rwlock_unlock(&ct->nat_resources_lock);
> + ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock);
> +}
> +
> +static void
> +handle_nat(struct dp_packet *pkt, struct conn *conn,
> + uint16_t zone, uint16_t *state, bool reply)
> +{
> + if ((conn->nat_info && conn->nat_info->nat_action & NAT_ACTION) &&
> + (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
> + (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
> + zone != pkt->md.ct_zone))){
>
> - res = conn_update(conn, &ct->buckets[bucket], pkt,
> - ctx->reply, now);
> + if(reply) {
> + un_nat_packet(pkt, conn, state);
> + } else {
> + nat_packet(pkt, conn, state);
> + }
> + }
> +}
>
> - switch (res) {
> - case CT_UPDATE_VALID:
> - state |= CS_ESTABLISHED;
> - if (ctx->reply) {
> - state |= CS_REPLY_DIR;
> - }
> - break;
> - case CT_UPDATE_INVALID:
> - state |= CS_INVALID;
> - break;
> - case CT_UPDATE_NEW:
> - ovs_list_remove(&conn->exp_node);
> - hmap_remove(&ct->buckets[bucket].connections, &conn->node);
> - atomic_count_dec(&ct->n_conn);
> - delete_conn(conn);
> - conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
> - break;
> - default:
> - OVS_NOT_REACHED();
> +static void
> +process_one(struct conntrack *ct, struct dp_packet *pkt,
> + struct conn_lookup_ctx *ctx, uint16_t zone,
> + bool commit, long long now, const uint32_t *setmark,
> + const struct ovs_key_ct_labels *setlabel,
> + const struct nat_action_info_t *nat_action_info)
> +{
> + struct conn *conn;
> + uint16_t state = 0;
> + unsigned bucket = hash_to_bucket(ctx->hash);
> + ct_lock_lock(&ct->buckets[bucket].lock);
> + conn_key_lookup(&ct->buckets[bucket], ctx, now);
> + conn = ctx->conn;
> + struct conn conn_for_un_nat_copy;
> + memset(&conn_for_un_nat_copy, 0, sizeof conn_for_un_nat_copy);
> +
> + if (OVS_LIKELY(conn)) {
> + if (conn->conn_type == CT_CONN_TYPE_UN_NAT){
> + ctx->reply = 1;
> +
> + struct conn_lookup_ctx ctx2;
> + ctx2.conn = NULL;
> + ctx2.key = conn->rev_key;
> + ctx2.hash = conn_key_hash(&conn->rev_key, ct->hash_basis);
> +
> + ct_lock_unlock(&ct->buckets[bucket].lock);
> + bucket = hash_to_bucket(ctx2.hash);
> +
> + ct_lock_lock(&ct->buckets[bucket].lock);
> + conn_key_lookup(&ct->buckets[bucket], &ctx2, now);
> +
> + if (ctx2.conn) {
> + conn = ctx2.conn;
> + } else {
> + /* It is a race condition where conn has timed out and removed
> + * between unlock of the rev_conn and lock of the forward conn;
> + * nothing to do. */
> + ct_lock_unlock(&ct->buckets[bucket].lock);
> + return;
> }
> }
> + }
> +
> + bool create_new_conn = false;
> + if (OVS_LIKELY(conn)) {
> + create_new_conn = conn_update_state(ct, pkt, ctx, &state, &conn,
> + now, bucket);
> + if (nat_action_info->nat_action && !create_new_conn) {
> + handle_nat(pkt, conn, zone, &state, ctx->reply);
> + }
> } else {
> if (ctx->related) {
> state |= CS_INVALID;
> } else {
> - conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
> + create_new_conn = true;
> }
> }
>
> + if (OVS_UNLIKELY(create_new_conn)) {
> + conn = conn_not_found(ct, pkt, ctx, &state, commit,
> + now, nat_action_info, &conn_for_un_nat_copy);
> + }
> +
> write_ct_md(pkt, state, zone, conn ? conn->mark : 0,
> conn ? conn->label : OVS_U128_ZERO);
>
> - return conn;
> + if (conn && setmark) {
> + set_mark(pkt, conn, setmark[0], setmark[1]);
> + }
> +
> + if (conn && setlabel) {
> + set_label(pkt, conn, &setlabel[0], &setlabel[1]);
> + }
> +
> + ct_lock_unlock(&ct->buckets[bucket].lock);
> +
> + if (conn_for_un_nat_copy.conn_type == CT_CONN_TYPE_UN_NAT) {
> + create_un_nat_conn(ct, &conn_for_un_nat_copy, now);
> + }
> }
>
> /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All
> @@ -274,7 +609,7 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
> const uint32_t *setmark,
> const struct ovs_key_ct_labels *setlabel,
> const char *helper,
> - const struct nat_action_info_t *nat_action_info OVS_UNUSED)
> + const struct nat_action_info_t *nat_action_info)
> {
> struct dp_packet **pkts = pkt_batch->packets;
> size_t cnt = pkt_batch->count;
> @@ -324,27 +659,12 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
> }
>
> for (i = 0; i < arrcnt; i++) {
> - struct conntrack_bucket *ctb = &ct->buckets[arr[i].bucket];
> size_t j;
>
> - ct_lock_lock(&ctb->lock);
> -
> ULLONG_FOR_EACH_1(j, arr[i].maps) {
> - struct conn *conn;
> -
> - conn_key_lookup(ctb, &ctxs[j], now);
> -
> - conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);
> -
> - if (conn && setmark) {
> - set_mark(pkts[j], conn, setmark[0], setmark[1]);
> - }
> -
> - if (conn && setlabel) {
> - set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);
> - }
> + process_one(ct, pkts[j], &ctxs[j], zone, commit,
> + now, setmark, setlabel, nat_action_info);
I think it's fine to remove the lock batching. If we do that, we can
simplify a lot of the code above that groups packets based on the bucket.
> }
> - ct_lock_unlock(&ctb->lock);
> }
>
> return 0;
> @@ -373,6 +693,7 @@ set_label(struct dp_packet *pkt, struct conn *conn,
> | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
> conn->label = pkt->md.ct_label;
> }
> +
>
> /* Delete the expired connections from 'ctb', up to 'limit'. Returns the
> * earliest expiration time among the remaining connections in 'ctb'. Returns
> @@ -390,20 +711,27 @@ sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
>
> for (i = 0; i < N_CT_TM; i++) {
> LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
> - if (!conn_expired(conn, now) || count >= limit) {
> - min_expiration = MIN(min_expiration, conn->expiration);
> - if (count >= limit) {
> - /* Do not check other lists. */
> - COVERAGE_INC(conntrack_long_cleanup);
> - return min_expiration;
> + if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
> + if (!conn_expired(conn, now) || count >= limit) {
> + min_expiration = MIN(min_expiration, conn->expiration);
> + if (count >= limit) {
> + /* Do not check other lists. */
> + COVERAGE_INC(conntrack_long_cleanup);
> + return min_expiration;
> + }
> + break;
> }
> - break;
> + ovs_list_remove(&conn->exp_node);
> + hmap_remove(&ctb->connections, &conn->node);
> + if (conn->nat_info) {
> + nat_clean(ct, conn, ctb);
> + } else {
> + delete_conn(conn);
> + }
> +
> + atomic_count_dec(&ct->n_conn);
> + count++;
> }
> - ovs_list_remove(&conn->exp_node);
> - hmap_remove(&ctb->connections, &conn->node);
> - atomic_count_dec(&ct->n_conn);
> - delete_conn(conn);
> - count++;
> }
> }
>
> @@ -774,7 +1102,6 @@ extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
> return false;
> }
>
> - /* pf doesn't do this, but it seems a good idea */
> if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
> || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
> return false;
> @@ -998,7 +1325,6 @@ conn_key_hash(const struct conn_key *key, uint32_t basis)
>
> hsrc = hdst = basis;
>
> - /* Hash the source and destination tuple */
> for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
> hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
> hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
> @@ -1025,6 +1351,247 @@ conn_key_reverse(struct conn_key *key)
> key->dst = tmp;
> }
>
> +static uint32_t
> +nat_ipv6_addrs_delta(struct in6_addr *ipv6_aligned_min, struct in6_addr *ipv6_aligned_max)
> +{
> + uint64_t diff = 0;
> + ovs_u128 addr6_128_min;
> + memcpy(&addr6_128_min.u64.hi, &ipv6_aligned_min->s6_addr32[0],
> + sizeof addr6_128_min.u64.hi);
> + memcpy(&addr6_128_min.u64.lo, &ipv6_aligned_min->s6_addr32[2],
> + sizeof addr6_128_min.u64.lo);
> + ovs_u128 addr6_128_max;
> + memcpy(&addr6_128_max.u64.hi, &ipv6_aligned_max->s6_addr32[0],
> + sizeof addr6_128_max.u64.hi);
> + memcpy(&addr6_128_max.u64.lo, &ipv6_aligned_max->s6_addr32[2],
> + sizeof addr6_128_max.u64.lo);
> +
> + if ((ntohll(addr6_128_min.u64.hi) == ntohll(addr6_128_max.u64.hi)) &&
> + (ntohll(addr6_128_min.u64.lo) <= ntohll(addr6_128_max.u64.lo))){
> + diff = ntohll(addr6_128_max.u64.lo) - ntohll(addr6_128_min.u64.lo);
> + } else if ((ntohll(addr6_128_min.u64.hi) + 1 == ntohll(addr6_128_max.u64.hi)) &&
> + (ntohll(addr6_128_min.u64.lo) > ntohll(addr6_128_max.u64.lo))) {
> + diff = 0xffffffffffffffff - (ntohll(addr6_128_min.u64.lo) -
> + ntohll(addr6_128_max.u64.lo) - 1);
> + } else {
> + /* Limit address delta supported to 32 bits or 4 billion approximately.
> + * Possibly, this should be visible to the user through a datapath
> + * support check, however the practical impact is probably nil. */
> + diff = 0xfffffffe;
> + }
> + if (diff > 0xfffffffe) {
> + diff = 0xfffffffe;
> + }
> + return (uint32_t)diff;
> +}
> +
> +/* This function must be used in tandem with nat_ipv6_addrs_delta(), which
> + * restricts the input parameters. */
> +static void
> +nat_ipv6_addr_increment(struct in6_addr *ipv6_aligned, uint32_t increment)
> +{
> + ovs_u128 addr6_128;
> + memcpy(&addr6_128.u64.hi, &ipv6_aligned->s6_addr32[0], sizeof addr6_128.u64.hi);
> + memcpy(&addr6_128.u64.lo, &ipv6_aligned->s6_addr32[2], sizeof addr6_128.u64.lo);
> +
> + if (0xffffffffffffffff - increment >= ntohll(addr6_128.u64.lo)) {
> + addr6_128.u64.lo = htonll(increment + ntohll(addr6_128.u64.lo));
> + } else if (addr6_128.u64.hi != 0xffffffffffffffff) {
> + addr6_128.u64.hi = htonll(1 + ntohll(addr6_128.u64.hi));
> + addr6_128.u64.lo =
> + htonll(increment - (0xffffffffffffffff -
> + ntohll(addr6_128.u64.lo) + 1));
> + } else {
> + OVS_NOT_REACHED();
> + }
> +
> + memcpy(&ipv6_aligned->s6_addr32[0], &addr6_128.u64.hi, sizeof addr6_128.u64.hi);
> + memcpy(&ipv6_aligned->s6_addr32[2], &addr6_128.u64.lo, sizeof addr6_128.u64.lo);
> +
> + return;
> +}
> +
> +static uint32_t
> +nat_range_hash(const struct conn *conn, uint32_t basis)
> +{
> + uint32_t hash = basis;
> + int i;
> + uint32_t port;
> +
> + for (i = 0;
> + i < sizeof(conn->nat_info->min_addr) / sizeof(uint32_t);
> + i++) {
> + hash = hash_add(hash, ((uint32_t *) &conn->nat_info->min_addr)[i]);
> + hash = hash_add(hash, ((uint32_t *) &conn->nat_info->max_addr)[i]);
> + }
> +
> + memcpy(&port, &conn->nat_info->min_port, sizeof port);
> + hash = hash_add(hash, port);
> +
> + uint32_t dl_type_for_hash = (uint32_t) conn->key.dl_type;
> + hash = hash_add(hash, dl_type_for_hash);
> + uint32_t nw_proto_for_hash = (uint32_t) conn->key.nw_proto;
> + hash = hash_add(hash, nw_proto_for_hash);
> + uint32_t zone_for_hash = (uint32_t) conn->key.zone;
> + hash = hash_add(hash, zone_for_hash);
> + return hash;
> +}
> +
> +static bool
> +nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
> + struct conn *nat_conn)
> +{
> +#define MIN_NAT_EPHEMERAL_PORT 1024
> +#define MAX_NAT_EPHEMERAL_PORT 65535
> +
> + uint16_t min_port;
> + uint16_t max_port;
> + uint16_t first_port;
> +
> + uint32_t hash = nat_range_hash(conn, ct->hash_basis);
> +
> + if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
> + (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
> + min_port = ntohs(conn->key.src.port);
> + max_port = ntohs(conn->key.src.port);
> + first_port = min_port;
> + } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
> + (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
> + min_port = ntohs(conn->key.dst.port);
> + max_port = ntohs(conn->key.dst.port);
> + first_port = min_port;
> + } else {
> + uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
> + uint32_t port_index = hash % (deltap + 1);
> + first_port = conn->nat_info->min_port + port_index;
> + min_port = conn->nat_info->min_port;
> + max_port = conn->nat_info->max_port;
> + }
> +
> + uint32_t deltaa = 0;
> + uint32_t address_index;
> + struct ct_addr ct_addr;
> + memset(&ct_addr, 0, sizeof ct_addr);
> + struct ct_addr max_ct_addr;
> + memset(&max_ct_addr, 0, sizeof max_ct_addr);
> + max_ct_addr = conn->nat_info->max_addr;
> +
> + if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
> + deltaa = ntohl(conn->nat_info->max_addr.ipv4_aligned) -
> + ntohl(conn->nat_info->min_addr.ipv4_aligned);
> + address_index = hash % (deltaa + 1);
> + ct_addr.ipv4_aligned = htonl(ntohl(conn->nat_info->min_addr.ipv4_aligned) +
> + address_index);
> + } else {
> + deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6_aligned,
> + &conn->nat_info->max_addr.ipv6_aligned);
> + /* deltaa must be within 32 bits for full hash coverage. A 64 or 128 bit hash is
> + * unnecessary and hence not used here. Most code is kept common with V4;
> + * nat_ipv6_addrs_delta() will do the enforcement via max_ct_addr. */
> + max_ct_addr = conn->nat_info->min_addr;
> + nat_ipv6_addr_increment(&max_ct_addr.ipv6_aligned, deltaa);
> +
> + address_index = hash % (deltaa + 1);
> + ct_addr.ipv6_aligned = conn->nat_info->min_addr.ipv6_aligned;
> + nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, address_index);
> + }
> +
> + uint16_t port = first_port;
> + bool all_ports_tried = false;
> + bool original_ports_tried = false;
> + struct ct_addr first_addr = ct_addr;
> + *nat_conn = *conn;
> + while (true) {
> + if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
> + nat_conn->rev_key.dst.addr = ct_addr;
> + nat_conn->rev_key.dst.port = htons(port);
> + } else {
> + nat_conn->rev_key.src.addr = ct_addr;
> + nat_conn->rev_key.src.port = htons(port);
> + }
> +
> + struct nat_conn_key_node *nat_conn_key_node =
> + nat_conn_keys_lookup(&ct->nat_conn_keys, &nat_conn->rev_key,
> + ct->hash_basis);
> +
> + if (!nat_conn_key_node) {
> + struct nat_conn_key_node *nat_conn_key = xzalloc(sizeof *nat_conn_key);
> + memcpy(&nat_conn_key->key, &nat_conn->rev_key, sizeof nat_conn_key->key);
> + memcpy(&nat_conn_key->value, &nat_conn->key, sizeof nat_conn_key->value);
> + uint32_t nat_conn_key_hash = conn_key_hash(&nat_conn_key->key, ct->hash_basis);
> + hmap_insert(&ct->nat_conn_keys, &nat_conn_key->node, nat_conn_key_hash);
> + return true;
> + } else if (!all_ports_tried) {
> + if (min_port == max_port) {
> + all_ports_tried = true;
> + } else if(port == max_port) {
> + port = min_port;
> + } else {
> + port++;
> + }
> + if (port == first_port) {
> + all_ports_tried = true;
> + }
> + } else {
> + if(memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
> + if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
> + ct_addr.ipv4_aligned = htonl(ntohl(ct_addr.ipv4_aligned) + 1);
> + } else {
> + nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, 1);
> + }
> + } else {
> + ct_addr = conn->nat_info->min_addr;
> + }
> + if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
> + if (!original_ports_tried) {
> + original_ports_tried = true;
> + ct_addr = conn->nat_info->min_addr;
> + min_port = MIN_NAT_EPHEMERAL_PORT;
> + max_port = MAX_NAT_EPHEMERAL_PORT;
> + } else {
> + break;
> + }
> + }
> + first_port = min_port;
> + port = first_port;
> + all_ports_tried = false;
> + }
> + }
> + return false;
> +}
> +
> +static struct nat_conn_key_node *
> +nat_conn_keys_lookup(struct hmap *nat_conn_keys,
> + const struct conn_key *key,
> + uint32_t basis)
> +{
> + struct nat_conn_key_node *nat_conn_key_node;
> + uint32_t nat_conn_key_hash = conn_key_hash(key, basis);
> +
> + HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash, nat_conn_keys) {
> + if (!memcmp(&nat_conn_key_node->key, key, sizeof nat_conn_key_node->key)) {
> + return nat_conn_key_node;
> + }
> + }
> + return NULL;
> +}
> +
> +static void
> +nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,
> + uint32_t basis)
> +{
> + struct nat_conn_key_node *nat_conn_key_node;
> + uint32_t nat_conn_key_hash = conn_key_hash(key, basis);
> +
> + HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash, nat_conn_keys) {
> + if (!memcmp(&nat_conn_key_node->key, key, sizeof nat_conn_key_node->key)) {
> + hmap_remove(nat_conn_keys, &nat_conn_key_node->node);
> + free(nat_conn_key_node);
> + return;
> + }
> + }
> +}
> +
> static void
> conn_key_lookup(struct conntrack_bucket *ctb,
> struct conn_lookup_ctx *ctx,
> @@ -1036,13 +1603,13 @@ conn_key_lookup(struct conntrack_bucket *ctb,
> ctx->conn = NULL;
>
> HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
> - if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))
> + if (!memcmp(&conn->key, &ctx->key, sizeof conn->key)
> && !conn_expired(conn, now)) {
> ctx->conn = conn;
> ctx->reply = false;
> break;
> }
> - if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))
> + if (!memcmp(&conn->rev_key, &ctx->key, sizeof conn->rev_key)
> && !conn_expired(conn, now)) {
> ctx->conn = conn;
> ctx->reply = true;
> @@ -1062,7 +1629,10 @@ conn_update(struct conn *conn, struct conntrack_bucket *ctb,
> static bool
> conn_expired(struct conn *conn, long long now)
> {
> - return now >= conn->expiration;
> + if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
> + return now >= conn->expiration;
> + }
> + return false;
> }
>
> static bool
> @@ -1089,6 +1659,7 @@ new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
> static void
> delete_conn(struct conn *conn)
> {
> + free(conn->nat_info);
> free(conn);
> }
>
> @@ -1141,7 +1712,7 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
> entry->zone = conn->key.zone;
> entry->mark = conn->mark;
>
> - memcpy(&entry->labels, &conn->label, sizeof(entry->labels));
> + memcpy(&entry->labels, &conn->label, sizeof entry->labels);
> /* Not implemented yet */
> entry->timestamp.start = 0;
> entry->timestamp.stop = 0;
> @@ -1188,7 +1759,8 @@ conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
> break;
> }
> INIT_CONTAINER(conn, node, node);
> - if (!dump->filter_zone || conn->key.zone == dump->zone) {
> + if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
> + (conn->conn_type != CT_CONN_TYPE_UN_NAT)){
> conn_to_ct_dpif_entry(conn, entry, now);
> break;
> }
> @@ -1223,15 +1795,19 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
>
> ct_lock_lock(&ct->buckets[i].lock);
> HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) {
> - if (!zone || *zone == conn->key.zone) {
> + if ((!zone || *zone == conn->key.zone) &&
> + (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
> ovs_list_remove(&conn->exp_node);
> hmap_remove(&ct->buckets[i].connections, &conn->node);
> atomic_count_dec(&ct->n_conn);
> - delete_conn(conn);
> + if (conn->nat_info) {
> + nat_clean(ct, conn, &ct->buckets[i]);
> + } else {
> + delete_conn(conn);
> + }
> }
> }
> ct_lock_unlock(&ct->buckets[i].lock);
> }
> -
> return 0;
> }
> diff --git a/lib/conntrack.h b/lib/conntrack.h
> index 471e529..5174123 100644
> --- a/lib/conntrack.h
> +++ b/lib/conntrack.h
> @@ -26,6 +26,7 @@
> #include "openvswitch/thread.h"
> #include "openvswitch/types.h"
> #include "ovs-atomic.h"
> +#include "ovs-thread.h"
> #include "packets.h"
>
> /* Userspace connection tracker
> @@ -71,6 +72,7 @@ struct ct_addr {
> };
> };
>
> +// Both NAT_ACTION_* and NAT_ACTION_*_PORT can be set
> enum nat_action_e {
> NAT_ACTION = 1 << 0,
> NAT_ACTION_SRC = 1 << 1,
> @@ -121,6 +123,10 @@ struct OVS_LOCKABLE ct_lock {
> struct ovs_mutex lock;
> };
>
> +struct OVS_LOCKABLE ct_rwlock {
> + struct ovs_rwlock lock;
> +};
> +
> static inline void ct_lock_init(struct ct_lock *lock)
> {
> ovs_mutex_init_adaptive(&lock->lock);
> @@ -144,6 +150,38 @@ static inline void ct_lock_destroy(struct ct_lock *lock)
> {
> ovs_mutex_destroy(&lock->lock);
> }
> +
> +static inline void ct_rwlock_init(struct ct_rwlock *lock)
> +{
> + ovs_rwlock_init(&lock->lock);
> +}
> +
> +static inline void ct_rwlock_wrlock(struct ct_rwlock *lock)
> + OVS_ACQUIRES(lock)
> + OVS_NO_THREAD_SAFETY_ANALYSIS
> +{
> + ovs_rwlock_wrlock(&lock->lock);
> +}
> +
> +static inline void ct_rwlock_rdlock(struct ct_rwlock *lock)
> + OVS_ACQUIRES(lock)
> + OVS_NO_THREAD_SAFETY_ANALYSIS
> +{
> + ovs_rwlock_rdlock(&lock->lock);
> +}
> +
> +static inline void ct_rwlock_unlock(struct ct_rwlock *lock)
> + OVS_RELEASES(lock)
> + OVS_NO_THREAD_SAFETY_ANALYSIS
> +{
> + ovs_rwlock_unlock(&lock->lock);
> +}
> +
> +static inline void ct_rwlock_destroy(struct ct_rwlock *lock)
> +{
> + ovs_rwlock_destroy(&lock->lock);
> +}
> +
>
> /* Timeouts: all the possible timeout states passed to update_expiration()
> * are listed here. The name will be prefix by CT_TM_ and the value is in
> @@ -226,6 +264,12 @@ struct conntrack {
> /* Connections limit. When this limit is reached, no new connection
> * will be accepted. */
> atomic_uint n_conn_limit;
> +
> + /* The following resources are referenced during nat connection
> + * creation and deletion. */
> + struct hmap nat_conn_keys OVS_GUARDED;
> + struct ct_rwlock nat_resources_lock;
> +
Could you add a comment documenting the nesting of nat_resources_lock, i.e. the order in which it must be acquired relative to the per-bucket ct_lock, so future changes don't introduce a lock-ordering deadlock?
> };
>
> #endif /* conntrack.h */
> --
> 1.9.1
>
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
More information about the dev mailing list