[ovs-dev] [patch_v2 1/4] Userspace Datapath: Add ALG infra and FTP.

Joe Stringer joe at ovn.org
Fri Jun 23 23:08:32 UTC 2017


On 17 June 2017 at 15:53, Darrell Ball <dlu998 at gmail.com> wrote:
> ALG infra and FTP (both V4 and V6) support is added to the userspace
> datapath.  Also, NAT support is included.
>
> Signed-off-by: Darrell Ball <dlu998 at gmail.com>
> ---

Hi Darrell, thanks for the patch.

I wasn't able to test this out, because my build and test environments
depend on sparse correctly running and it wouldn't compile. Please
consider setting it up, or using Travis-CI to validate builds before
v3.

I didn't do a thorough review, but I scanned through and pointed out a
couple of parts I have questions about.

>  lib/conntrack-private.h |   17 +
>  lib/conntrack.c         | 1016 +++++++++++++++++++++++++++++++++++++++++++----
>  lib/conntrack.h         |    4 +-
>  3 files changed, 957 insertions(+), 80 deletions(-)
>
> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
> index 55084d3..993af33 100644
> --- a/lib/conntrack-private.h
> +++ b/lib/conntrack-private.h
> @@ -62,17 +62,34 @@ struct nat_conn_key_node {
>      struct conn_key value;
>  };
>
> +struct alg_exp_node {
> +    struct hmap_node node;
> +    struct ovs_list exp_node;
> +    long long expiration;
> +    struct conn_key key;
> +    struct conn_key master_key;
> +    struct ct_addr alg_nat_repl_addr;
> +    ovs_u128 master_label;
> +    uint32_t master_mark;
> +};
> +
>  struct conn {
>      struct conn_key key;
>      struct conn_key rev_key;
> +    /* Only used for orig_tuple support. */
> +    struct conn_key master_key;

I guess no-one's looked at this structure from a cacheline
perspective, but if we think it's worth optimizing then perhaps a
followup patch should rearrange these. You might consider starting
here by just placing all of the ALG-related fields together at the
end.

>      long long expiration;
>      struct ovs_list exp_node;
>      struct hmap_node node;
>      ovs_u128 label;
>      /* XXX: consider flattening. */
>      struct nat_action_info_t *nat_info;
> +    char *alg;
> +    int seq_skew;
>      uint32_t mark;
>      uint8_t conn_type;
> +    uint8_t seq_skew_dir;
> +    uint8_t alg_related;
>  };
>
>  enum ct_update_res {
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 90b154a..1f54fe3 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -1,5 +1,5 @@
>  /*
> - * Copyright (c) 2015, 2016 Nicira, Inc.
> + * Copyright (c) 2015, 2016, 2017 Nicira, Inc.

All three files could get an update like this.

>   *
>   * Licensed under the Apache License, Version 2.0 (the "License");
>   * you may not use this file except in compliance with the License.
> @@ -21,6 +21,7 @@
>  #include <sys/types.h>
>  #include <netinet/in.h>
>  #include <netinet/icmp6.h>
> +#include <ctype.h>

Alphabetic order in each section of #includes?

>
>  #include "bitmap.h"
>  #include "conntrack-private.h"
> @@ -39,7 +40,6 @@
>  #include "random.h"
>  #include "timeval.h"
>
> -
>  VLOG_DEFINE_THIS_MODULE(conntrack);
>
>  COVERAGE_DEFINE(conntrack_full);
> @@ -50,9 +50,21 @@ struct conn_lookup_ctx {
>      struct conn *conn;
>      uint32_t hash;
>      bool reply;
> +    /* XXX: Only used by ICMP; change name. */

Please consider sending a separate patch to change the name.

<snip>

> @@ -179,14 +241,20 @@ conntrack_destroy(struct conntrack *ct)
>          ct_lock_unlock(&ctb->lock);
>          ct_lock_destroy(&ctb->lock);
>      }
> -    ct_rwlock_wrlock(&ct->nat_resources_lock);
> +    ct_rwlock_wrlock(&ct->resources_lock);
>      struct nat_conn_key_node *nat_conn_key_node;
>      HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {
>          free(nat_conn_key_node);
>      }
>      hmap_destroy(&ct->nat_conn_keys);
> -    ct_rwlock_unlock(&ct->nat_resources_lock);
> -    ct_rwlock_destroy(&ct->nat_resources_lock);
> +
> +    struct alg_exp_node *alg_exp_node;
> +    HMAP_FOR_EACH_POP (alg_exp_node, node, &ct->alg_expectations) {
> +        free(alg_exp_node);
> +    }
> +    hmap_destroy(&ct->alg_expectations);

You might also consider poisoning ct->alg_exp_list.

<snip>

> @@ -363,8 +469,8 @@ reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
>          struct ip_header *nh = dp_packet_l3(pkt);
>          struct icmp_header *icmp = dp_packet_l4(pkt);
>          struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
> -        extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3)
> -                        -pad, &inner_l4, false);
> +        extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3) -pad,
> +                        &inner_l4, false);

I guess this is just some automatic argument formatting that you
performed incidentally, but if it's getting changed, then usually we
put a space around operators such as '-'.

<snip>

> @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
>          nc->rev_key = nc->key;
>          conn_key_reverse(&nc->rev_key);
>
> +        if (helper) {
> +            nc->alg = xstrdup(helper);
> +        }
> +
> +        if (alg_exp) {
> +            nc->alg_related = true;
> +            nc->mark = alg_exp->master_mark;
> +            nc->label = alg_exp->master_label;
> +            nc->master_key = alg_exp->master_key;
> +        }
> +
>          if (nat_action_info) {
>              nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
> -            ct_rwlock_wrlock(&ct->nat_resources_lock);
> -
> -            bool nat_res = nat_select_range_tuple(ct, nc,
> -                                                  conn_for_un_nat_copy);
>
> -            if (!nat_res) {
> -                free(nc->nat_info);
> -                nc->nat_info = NULL;
> -                free (nc);
> -                ct_rwlock_unlock(&ct->nat_resources_lock);
> -                return NULL;
> -            }
> +            if (alg_exp) {
> +                nc->rev_key.src.addr = alg_nat_repl_addr;
> +                nc->nat_info->nat_action = NAT_ACTION_DST;
> +                *conn_for_un_nat_copy = *nc;
> +            } else {
> +                ct_rwlock_wrlock(&ct->resources_lock);
> +                bool nat_res = nat_select_range_tuple(
> +                                   ct, nc, conn_for_un_nat_copy);
> +
> +                if (!nat_res) {
> +                    free(nc->nat_info);
> +                    nc->nat_info = NULL;
> +                    free (nc);

I think that nc->alg may be leaked here? any reason it doesn't use
delete_conn()?

> +                    ct_rwlock_unlock(&ct->resources_lock);
> +                    return NULL;
> +                }
>
> -            if (conn_for_un_nat_copy &&
> -                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
>                  *nc = *conn_for_un_nat_copy;

Perhaps nc->alg and/or nc->nat_info may be leaked here?

<snip>

> @@ -923,6 +1126,28 @@ sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
>          }
>      }
>
> +#define MAX_ALG_EXP_TO_EXPIRE 1000
> +    size_t alg_exp_count = hmap_count(&ct->alg_expectations);
> +    /* XXX: revisit this. */
> +    size_t max_to_expire =
> +        MAX(alg_exp_count/10, MAX_ALG_EXP_TO_EXPIRE);

I would have thought MAX_ALG_EXP_TO_EXPIRE would be the upper bound,
not the lower bound.

<snip>

> @@ -1819,6 +2045,88 @@ nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,
>      }
>  }
>
> +static struct alg_exp_node *
> +expectation_lookup(struct hmap *alg_expectations,
> +                   const struct conn_key *key,
> +                   uint32_t basis)

OVS_REQ_RDLOCK(...) ?

> +{
> +    struct conn_key check_key = *key;
> +    check_key.src.port = ALG_WC_SRC_PORT;
> +    struct alg_exp_node *alg_exp_node;
> +    uint32_t alg_exp_conn_key_hash = conn_key_hash(&check_key, basis);
> +    HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,
> +                             alg_exp_conn_key_hash,
> +                             alg_expectations) {
> +        if (!memcmp(&alg_exp_node->key, &check_key,
> +                    sizeof alg_exp_node->key)) {
> +            return alg_exp_node;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +static void
> +expectation_create(struct conntrack *ct,
> +                   ovs_be16 dst_port,
> +                   const long long now,
> +                   enum ct_alg_mode mode,
> +                   const struct conn *master_conn)
> +{
> +    struct ct_addr src_addr;
> +    struct ct_addr dst_addr;
> +    struct ct_addr alg_nat_repl_addr;
> +
> +    switch (mode) {
> +    case CT_FTP_MODE_ACTIVE:
> +        src_addr = master_conn->rev_key.src.addr;
> +        dst_addr = master_conn->rev_key.dst.addr;
> +        alg_nat_repl_addr = master_conn->key.src.addr;
> +        break;
> +    case CT_FTP_MODE_PASSIVE:
> +        src_addr = master_conn->key.src.addr;
> +        dst_addr = master_conn->key.dst.addr;
> +        alg_nat_repl_addr = master_conn->rev_key.src.addr;
> +        break;
> +    default:
> +        OVS_NOT_REACHED();
> +    }
> +
> +    struct alg_exp_node *alg_exp_node =
> +        xzalloc(sizeof *alg_exp_node);
> +    alg_exp_node->key.dl_type = master_conn->key.dl_type;
> +    alg_exp_node->key.nw_proto = master_conn->key.nw_proto;
> +    alg_exp_node->key.zone = master_conn->key.zone;
> +    alg_exp_node->key.src.addr = src_addr;
> +    alg_exp_node->key.dst.addr = dst_addr;
> +    alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
> +    alg_exp_node->key.dst.port = dst_port;
> +    alg_exp_node->master_mark = master_conn->mark;
> +    alg_exp_node->master_label = master_conn->label;
> +    alg_exp_node->master_key = master_conn->key;
> +    ct_rwlock_rdlock(&ct->resources_lock);
> +    struct alg_exp_node *alg_exp = expectation_lookup(
> +        &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis);
> +    ct_rwlock_unlock(&ct->resources_lock);
> +    if (alg_exp) {
> +        free(alg_exp_node);
> +        return;
> +    }
> +
> +    alg_exp_node->expiration = now + CT_ALG_EXP_TIMEOUT;
> +    alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;
> +    uint32_t alg_exp_conn_key_hash =
> +        conn_key_hash(&alg_exp_node->key,
> +                      ct->hash_basis);
> +    ct_rwlock_wrlock(&ct->resources_lock);
> +    hmap_insert(&ct->alg_expectations,
> +                &alg_exp_node->node,
> +                alg_exp_conn_key_hash);
> +
> +    alg_exp_init_expiration(ct, alg_exp_node, now);
> +    ct_rwlock_unlock(&ct->resources_lock);

Is there a race where another thread inserts an equivalent expectation
between grabbing the readlock above and the writelock here?

> +}
> +
>  static void
>  conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,
>                  long long now)
> @@ -1886,6 +2194,7 @@ static void
>  delete_conn(struct conn *conn)
>  {
>      free(conn->nat_info);
> +    free(conn->alg);
>      free(conn);
>  }
>
> @@ -1950,6 +2259,10 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
>      if (class->conn_get_protoinfo) {
>          class->conn_get_protoinfo(conn, &entry->protoinfo);
>      }
> +
> +    if (conn->alg) {
> +        entry->helper.name = xstrdup(conn->alg);
> +    }
>  }
>
>  int
> @@ -2020,7 +2333,7 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
>          struct conn *conn, *next;
>
>          ct_lock_lock(&ct->buckets[i].lock);
> -        HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) {
> +        HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
>              if ((!zone || *zone == conn->key.zone) &&
>                  (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
>                  conn_clean(ct, conn, &ct->buckets[i]);
> @@ -2028,5 +2341,550 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
>          }
>          ct_lock_unlock(&ct->buckets[i].lock);
>      }
> +
> +    ct_rwlock_wrlock(&ct->resources_lock);

Should this be nested within the bucket lock?

> +    struct alg_exp_node *alg_exp_node, *alg_exp_node_next;
> +    HMAP_FOR_EACH_SAFE (alg_exp_node, alg_exp_node_next,
> +                       node, &ct->alg_expectations) {
> +        if (!zone || *zone == alg_exp_node->key.zone) {
> +            ovs_list_remove(&alg_exp_node->exp_node);
> +            hmap_remove(&ct->alg_expectations, &alg_exp_node->node);
> +            free(alg_exp_node);
> +        }
> +    }
> +    ct_rwlock_unlock(&ct->resources_lock);
>      return 0;
>  }
> +
> +static uint8_t
> +get_v4_byte_be(ovs_be32 v4_addr, uint8_t index)
> +{
> +    return v4_addr >> (index * 8) & 0xff;

I think this is where the sparse warning is generated:

lib/conntrack.c:2390:12: error: restricted ovs_be32 degrades to integer

<snip>

> +static enum ftp_ctl_pkt
> +process_ftp_ctl_v4(struct conntrack *ct,
> +                   struct conn_lookup_ctx *ctx,
> +                   struct dp_packet *pkt,
> +                   const struct conn *conn_for_expectation,
> +                   long long now, ovs_be32 *v4_addr_rep,
> +                   char **ftp_data_v4_start,
> +                   size_t *addr_offset_from_ftp_data_start)
> +{
> +
> +    struct tcp_header *th = dp_packet_l4(pkt);
> +    size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
> +    char *tcp_hdr = (char *) th;
> +    char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
> +    char *ftp = ftp_msg;
> +    enum ct_alg_mode mode;
> +
> +     detect_ftp_ctl_mode(ctx, pkt, ftp_msg);

Whitespace. Same for the other call to this function.

> +static enum ftp_ctl_pkt
> +process_ftp_ctl_v6(struct conntrack *ct,
> +                   struct conn_lookup_ctx *ctx,
> +                   struct dp_packet *pkt,
> +                   const struct conn *conn_for_expectation,
> +                   long long now,
> +                   struct ct_addr *v6_addr_rep,
> +                   char **ftp_data_start,
> +                   size_t *addr_offset_from_ftp_data_start,
> +                   size_t *addr_size, enum ct_alg_mode *mode)

I didn't work through the logic of these functions with an actual
message; it'd be easier to do so if there were example messages in the
code, but then I think we generally just rely on the tests passing to
show the correctness. I see that everything is copied into a local
buffer for iteration first, so I assume you've done due diligence with
respect to validating seeks into the message.

<snip>

> +    const char *tail = dp_packet_tail(pkt);
> +    uint8_t pad = dp_packet_l2_pad_size(pkt);
> +    th->tcp_csum = 0;
> +    uint32_t tcp_csum;
> +    if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
> +        tcp_csum = packet_csum_pseudoheader6(nh6);
> +    } else {
> +        tcp_csum = packet_csum_pseudoheader(l3_hdr);
> +    }
> +    th->tcp_csum = csum_finish(
> +        csum_continue(tcp_csum, th, tail - (char *) th - pad));
> +    return;
> +}
> +

Extraneous newline at EOF?

> diff --git a/lib/conntrack.h b/lib/conntrack.h
> index 243aebb..0b110c7 100644
> --- a/lib/conntrack.h
> +++ b/lib/conntrack.h
> @@ -267,11 +267,13 @@ struct conntrack {

...
>      /* This lock is used during NAT connection creation and deletion;
>       * it is taken after a bucket lock and given back before that
>       * bucket unlock.
>       */

Please update the comment, even if to generalize saying that it
protects access to 'nat_conn_keys', 'alg_expectations' and
'alg_exp_list', and should be nested within locking of the ct bucket
lock.

> -    struct ct_rwlock nat_resources_lock;
> +    struct ct_rwlock resources_lock;

This change could probably be refactored out as a separate patch to
begin, reduce churn in this patch, and be trivially applied to master
independently of the rest of the series.


More information about the dev mailing list