[ovs-dev] [patch_v2 1/4] Userspace Datapath: Add ALG infra and FTP.
Joe Stringer
joe at ovn.org
Fri Jun 23 23:08:32 UTC 2017
On 17 June 2017 at 15:53, Darrell Ball <dlu998 at gmail.com> wrote:
> ALG infra and FTP (both V4 and V6) support is added to the userspace
> datapath. Also, NAT support is included.
>
> Signed-off-by: Darrell Ball <dlu998 at gmail.com>
> ---
Hi Darrell, thanks for the patch.
I wasn't able to test this out, because my build and test environments
depend on sparse correctly running and it wouldn't compile. Please
consider setting it up, or using Travis-CI to validate builds before
v3.
I didn't do a thorough review, but I scanned through and pointed out a
couple of parts I have questions about.
> lib/conntrack-private.h | 17 +
> lib/conntrack.c | 1016 +++++++++++++++++++++++++++++++++++++++++++----
> lib/conntrack.h | 4 +-
> 3 files changed, 957 insertions(+), 80 deletions(-)
>
> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
> index 55084d3..993af33 100644
> --- a/lib/conntrack-private.h
> +++ b/lib/conntrack-private.h
> @@ -62,17 +62,34 @@ struct nat_conn_key_node {
> struct conn_key value;
> };
>
> +struct alg_exp_node {
> + struct hmap_node node;
> + struct ovs_list exp_node;
> + long long expiration;
> + struct conn_key key;
> + struct conn_key master_key;
> + struct ct_addr alg_nat_repl_addr;
> + ovs_u128 master_label;
> + uint32_t master_mark;
> +};
> +
> struct conn {
> struct conn_key key;
> struct conn_key rev_key;
> + /* Only used for orig_tuple support. */
> + struct conn_key master_key;
I guess no-one's looked at this structure from a cacheline
perspective, but if we think it's worth optimizing then perhaps a
followup patch should rearrange these. You might consider starting
here by just placing all of the ALG-related fields together at the
end.
> long long expiration;
> struct ovs_list exp_node;
> struct hmap_node node;
> ovs_u128 label;
> /* XXX: consider flattening. */
> struct nat_action_info_t *nat_info;
> + char *alg;
> + int seq_skew;
> uint32_t mark;
> uint8_t conn_type;
> + uint8_t seq_skew_dir;
> + uint8_t alg_related;
> };
>
> enum ct_update_res {
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 90b154a..1f54fe3 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -1,5 +1,5 @@
> /*
> - * Copyright (c) 2015, 2016 Nicira, Inc.
> + * Copyright (c) 2015, 2016, 2017 Nicira, Inc.
All three files could get an update like this.
> *
> * Licensed under the Apache License, Version 2.0 (the "License");
> * you may not use this file except in compliance with the License.
> @@ -21,6 +21,7 @@
> #include <sys/types.h>
> #include <netinet/in.h>
> #include <netinet/icmp6.h>
> +#include <ctype.h>
Alphabetic order in each section of #includes?
>
> #include "bitmap.h"
> #include "conntrack-private.h"
> @@ -39,7 +40,6 @@
> #include "random.h"
> #include "timeval.h"
>
> -
> VLOG_DEFINE_THIS_MODULE(conntrack);
>
> COVERAGE_DEFINE(conntrack_full);
> @@ -50,9 +50,21 @@ struct conn_lookup_ctx {
> struct conn *conn;
> uint32_t hash;
> bool reply;
> + /* XXX: Only used by ICMP; change name. */
Please consider sending a separate patch to change the name.
<snip>
> @@ -179,14 +241,20 @@ conntrack_destroy(struct conntrack *ct)
> ct_lock_unlock(&ctb->lock);
> ct_lock_destroy(&ctb->lock);
> }
> - ct_rwlock_wrlock(&ct->nat_resources_lock);
> + ct_rwlock_wrlock(&ct->resources_lock);
> struct nat_conn_key_node *nat_conn_key_node;
> HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {
> free(nat_conn_key_node);
> }
> hmap_destroy(&ct->nat_conn_keys);
> - ct_rwlock_unlock(&ct->nat_resources_lock);
> - ct_rwlock_destroy(&ct->nat_resources_lock);
> +
> + struct alg_exp_node *alg_exp_node;
> + HMAP_FOR_EACH_POP (alg_exp_node, node, &ct->alg_expectations) {
> + free(alg_exp_node);
> + }
> + hmap_destroy(&ct->alg_expectations);
You might also consider poisoning ct->alg_exp_list.
<snip>
> @@ -363,8 +469,8 @@ reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
> struct ip_header *nh = dp_packet_l3(pkt);
> struct icmp_header *icmp = dp_packet_l4(pkt);
> struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
> - extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3)
> - -pad, &inner_l4, false);
> + extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3) -pad,
> + &inner_l4, false);
I guess this is just some automatic argument formatting that you
performed incidentally, but if it's getting changed, then usually we
put a space around operators such as '-'.
<snip>
> @@ -554,34 +681,50 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
> nc->rev_key = nc->key;
> conn_key_reverse(&nc->rev_key);
>
> + if (helper) {
> + nc->alg = xstrdup(helper);
> + }
> +
> + if (alg_exp) {
> + nc->alg_related = true;
> + nc->mark = alg_exp->master_mark;
> + nc->label = alg_exp->master_label;
> + nc->master_key = alg_exp->master_key;
> + }
> +
> if (nat_action_info) {
> nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
> - ct_rwlock_wrlock(&ct->nat_resources_lock);
> -
> - bool nat_res = nat_select_range_tuple(ct, nc,
> - conn_for_un_nat_copy);
>
> - if (!nat_res) {
> - free(nc->nat_info);
> - nc->nat_info = NULL;
> - free (nc);
> - ct_rwlock_unlock(&ct->nat_resources_lock);
> - return NULL;
> - }
> + if (alg_exp) {
> + nc->rev_key.src.addr = alg_nat_repl_addr;
> + nc->nat_info->nat_action = NAT_ACTION_DST;
> + *conn_for_un_nat_copy = *nc;
> + } else {
> + ct_rwlock_wrlock(&ct->resources_lock);
> + bool nat_res = nat_select_range_tuple(
> + ct, nc, conn_for_un_nat_copy);
> +
> + if (!nat_res) {
> + free(nc->nat_info);
> + nc->nat_info = NULL;
> + free (nc);
I think that nc->alg may be leaked here? any reason it doesn't use
delete_conn()?
> + ct_rwlock_unlock(&ct->resources_lock);
> + return NULL;
> + }
>
> - if (conn_for_un_nat_copy &&
> - nc->conn_type == CT_CONN_TYPE_DEFAULT) {
> *nc = *conn_for_un_nat_copy;
Perhaps nc->alg and/or nc->nat_info may be leaked here?
<snip>
> @@ -923,6 +1126,28 @@ sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
> }
> }
>
> +#define MAX_ALG_EXP_TO_EXPIRE 1000
> + size_t alg_exp_count = hmap_count(&ct->alg_expectations);
> + /* XXX: revisit this. */
> + size_t max_to_expire =
> + MAX(alg_exp_count/10, MAX_ALG_EXP_TO_EXPIRE);
I would have thought MAX_ALG_EXP_TO_EXPIRE would be the upper bound,
not the lower bound.
<snip>
> @@ -1819,6 +2045,88 @@ nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,
> }
> }
>
> +static struct alg_exp_node *
> +expectation_lookup(struct hmap *alg_expectations,
> + const struct conn_key *key,
> + uint32_t basis)
OVS_REQ_RDLOCK(...) ?
> +{
> + struct conn_key check_key = *key;
> + check_key.src.port = ALG_WC_SRC_PORT;
> + struct alg_exp_node *alg_exp_node;
> + uint32_t alg_exp_conn_key_hash = conn_key_hash(&check_key, basis);
> + HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,
> + alg_exp_conn_key_hash,
> + alg_expectations) {
> + if (!memcmp(&alg_exp_node->key, &check_key,
> + sizeof alg_exp_node->key)) {
> + return alg_exp_node;
> + }
> + }
> +
> + return NULL;
> +}
> +
> +static void
> +expectation_create(struct conntrack *ct,
> + ovs_be16 dst_port,
> + const long long now,
> + enum ct_alg_mode mode,
> + const struct conn *master_conn)
> +{
> + struct ct_addr src_addr;
> + struct ct_addr dst_addr;
> + struct ct_addr alg_nat_repl_addr;
> +
> + switch (mode) {
> + case CT_FTP_MODE_ACTIVE:
> + src_addr = master_conn->rev_key.src.addr;
> + dst_addr = master_conn->rev_key.dst.addr;
> + alg_nat_repl_addr = master_conn->key.src.addr;
> + break;
> + case CT_FTP_MODE_PASSIVE:
> + src_addr = master_conn->key.src.addr;
> + dst_addr = master_conn->key.dst.addr;
> + alg_nat_repl_addr = master_conn->rev_key.src.addr;
> + break;
> + default:
> + OVS_NOT_REACHED();
> + }
> +
> + struct alg_exp_node *alg_exp_node =
> + xzalloc(sizeof *alg_exp_node);
> + alg_exp_node->key.dl_type = master_conn->key.dl_type;
> + alg_exp_node->key.nw_proto = master_conn->key.nw_proto;
> + alg_exp_node->key.zone = master_conn->key.zone;
> + alg_exp_node->key.src.addr = src_addr;
> + alg_exp_node->key.dst.addr = dst_addr;
> + alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
> + alg_exp_node->key.dst.port = dst_port;
> + alg_exp_node->master_mark = master_conn->mark;
> + alg_exp_node->master_label = master_conn->label;
> + alg_exp_node->master_key = master_conn->key;
> + ct_rwlock_rdlock(&ct->resources_lock);
> + struct alg_exp_node *alg_exp = expectation_lookup(
> + &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis);
> + ct_rwlock_unlock(&ct->resources_lock);
> + if (alg_exp) {
> + free(alg_exp_node);
> + return;
> + }
> +
> + alg_exp_node->expiration = now + CT_ALG_EXP_TIMEOUT;
> + alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;
> + uint32_t alg_exp_conn_key_hash =
> + conn_key_hash(&alg_exp_node->key,
> + ct->hash_basis);
> + ct_rwlock_wrlock(&ct->resources_lock);
> + hmap_insert(&ct->alg_expectations,
> + &alg_exp_node->node,
> + alg_exp_conn_key_hash);
> +
> + alg_exp_init_expiration(ct, alg_exp_node, now);
> + ct_rwlock_unlock(&ct->resources_lock);
Is there a race where another thread inserts an equivalent expectation
between grabbing the readlock above and the writelock here?
> +}
> +
> static void
> conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,
> long long now)
> @@ -1886,6 +2194,7 @@ static void
> delete_conn(struct conn *conn)
> {
> free(conn->nat_info);
> + free(conn->alg);
> free(conn);
> }
>
> @@ -1950,6 +2259,10 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
> if (class->conn_get_protoinfo) {
> class->conn_get_protoinfo(conn, &entry->protoinfo);
> }
> +
> + if (conn->alg) {
> + entry->helper.name = xstrdup(conn->alg);
> + }
> }
>
> int
> @@ -2020,7 +2333,7 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
> struct conn *conn, *next;
>
> ct_lock_lock(&ct->buckets[i].lock);
> - HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) {
> + HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
> if ((!zone || *zone == conn->key.zone) &&
> (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
> conn_clean(ct, conn, &ct->buckets[i]);
> @@ -2028,5 +2341,550 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
> }
> ct_lock_unlock(&ct->buckets[i].lock);
> }
> +
> + ct_rwlock_wrlock(&ct->resources_lock);
Should this be nested within the bucket lock?
> + struct alg_exp_node *alg_exp_node, *alg_exp_node_next;
> + HMAP_FOR_EACH_SAFE (alg_exp_node, alg_exp_node_next,
> + node, &ct->alg_expectations) {
> + if (!zone || *zone == alg_exp_node->key.zone) {
> + ovs_list_remove(&alg_exp_node->exp_node);
> + hmap_remove(&ct->alg_expectations, &alg_exp_node->node);
> + free(alg_exp_node);
> + }
> + }
> + ct_rwlock_unlock(&ct->resources_lock);
> return 0;
> }
> +
> +static uint8_t
> +get_v4_byte_be(ovs_be32 v4_addr, uint8_t index)
> +{
> + return v4_addr >> (index * 8) & 0xff;
I think this is where the sparse warning is generated:
lib/conntrack.c:2390:12: error: restricted ovs_be32 degrades to integer
<snip>
> +static enum ftp_ctl_pkt
> +process_ftp_ctl_v4(struct conntrack *ct,
> + struct conn_lookup_ctx *ctx,
> + struct dp_packet *pkt,
> + const struct conn *conn_for_expectation,
> + long long now, ovs_be32 *v4_addr_rep,
> + char **ftp_data_v4_start,
> + size_t *addr_offset_from_ftp_data_start)
> +{
> +
> + struct tcp_header *th = dp_packet_l4(pkt);
> + size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
> + char *tcp_hdr = (char *) th;
> + char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
> + char *ftp = ftp_msg;
> + enum ct_alg_mode mode;
> +
> + detect_ftp_ctl_mode(ctx, pkt, ftp_msg);
Whitespace. Same for the other call to this function.
> +static enum ftp_ctl_pkt
> +process_ftp_ctl_v6(struct conntrack *ct,
> + struct conn_lookup_ctx *ctx,
> + struct dp_packet *pkt,
> + const struct conn *conn_for_expectation,
> + long long now,
> + struct ct_addr *v6_addr_rep,
> + char **ftp_data_start,
> + size_t *addr_offset_from_ftp_data_start,
> + size_t *addr_size, enum ct_alg_mode *mode)
I didn't work through the logic of these functions with an actual
message; it'd be easier to do so if there were example messages in the
code, but then I think we generally just rely on the tests passing to
show the correctness. I see that everything is copied into a local
buffer for iteration first, so I assume you've done due diligence with
respect to validating seeks into the message.
<snip>
> + const char *tail = dp_packet_tail(pkt);
> + uint8_t pad = dp_packet_l2_pad_size(pkt);
> + th->tcp_csum = 0;
> + uint32_t tcp_csum;
> + if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
> + tcp_csum = packet_csum_pseudoheader6(nh6);
> + } else {
> + tcp_csum = packet_csum_pseudoheader(l3_hdr);
> + }
> + th->tcp_csum = csum_finish(
> + csum_continue(tcp_csum, th, tail - (char *) th - pad));
> + return;
> +}
> +
Extraneous newline at EOF?
> diff --git a/lib/conntrack.h b/lib/conntrack.h
> index 243aebb..0b110c7 100644
> --- a/lib/conntrack.h
> +++ b/lib/conntrack.h
> @@ -267,11 +267,13 @@ struct conntrack {
...
> /* This lock is used during NAT connection creation and deletion;
> * it is taken after a bucket lock and given back before that
> * bucket unlock.
> */
Please update the comment, even if to generalize saying that it
protects access to 'nat_conn_keys', 'alg_expectations' and
'alg_exp_list', and should be nested within locking of the ct bucket
lock.
> - struct ct_rwlock nat_resources_lock;
> + struct ct_rwlock resources_lock;
This change could probably be refactored out as a separate patch to
begin, reduce churn in this patch, and be trivially applied to master
independently of the rest of the series.
More information about the dev
mailing list