[ovs-dev] [PATCH v1 1/9] conntrack: Use rcu-lists to store conn expirations

William Tu u9012063 at gmail.com
Tue Feb 23 22:55:25 UTC 2021


Hi Gaetan,

Thanks for the patch, looks very useful.
I haven't tested it yet.
Minor question/comments inline.

On Wed, Feb 17, 2021 at 8:34 AM Gaetan Rivet <grive at u256.net> wrote:
>
> Change the connection expiration lists from ovs_list to rculist.
> This is a pre-step towards reducing the granularity of 'ct_lock'.
>
> Signed-off-by: Gaetan Rivet <grive at u256.net>
> Reviewed-by: Eli Britstein <elibr at nvidia.com>
> ---
>  lib/conntrack-private.h | 76 +++++++++++++++++++++++++++--------------
>  lib/conntrack-tp.c      | 60 +++++++++++++++++++++++++-------
>  lib/conntrack.c         | 14 ++++----
>  3 files changed, 105 insertions(+), 45 deletions(-)
>
> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
> index e8332bdba..4b6f9eae3 100644
> --- a/lib/conntrack-private.h
> +++ b/lib/conntrack-private.h
> @@ -29,6 +29,7 @@
>  #include "openvswitch/list.h"
>  #include "openvswitch/types.h"
>  #include "packets.h"
> +#include "rculist.h"
>  #include "unaligned.h"
>  #include "dp-packet.h"
>
> @@ -86,17 +87,55 @@ struct alg_exp_node {
>      bool nat_rpl_dst;
>  };
>
> +/* Timeouts: all the possible timeout states passed to update_expiration()
> + * are listed here. The name will be prefix by CT_TM_ and the value is in
> + * milliseconds */
> +#define CT_TIMEOUTS \
> +    CT_TIMEOUT(TCP_FIRST_PACKET) \
> +    CT_TIMEOUT(TCP_OPENING) \
> +    CT_TIMEOUT(TCP_ESTABLISHED) \
> +    CT_TIMEOUT(TCP_CLOSING) \
> +    CT_TIMEOUT(TCP_FIN_WAIT) \
> +    CT_TIMEOUT(TCP_CLOSED) \
> +    CT_TIMEOUT(OTHER_FIRST) \
> +    CT_TIMEOUT(OTHER_MULTIPLE) \
> +    CT_TIMEOUT(OTHER_BIDIR) \
> +    CT_TIMEOUT(ICMP_FIRST) \
> +    CT_TIMEOUT(ICMP_REPLY)
> +
> +enum ct_timeout {
> +#define CT_TIMEOUT(NAME) CT_TM_##NAME,
> +    CT_TIMEOUTS
> +#undef CT_TIMEOUT
> +    N_CT_TM
> +};
> +
any reason for moving this macro to here?

>  enum OVS_PACKED_ENUM ct_conn_type {
>      CT_CONN_TYPE_DEFAULT,
>      CT_CONN_TYPE_UN_NAT,
>  };
>
> +struct conn_expire {
> +    /* Set once when initializing the expiration node. */
> +    struct conntrack *ct;
> +    /* Timeout state of the connection.
> +     * It follows the connection state updates.
> +     */
> +    enum ct_timeout tm;
> +    /* Insert and remove the expiration node only once per RCU syncs.
> +     * If multiple threads update the connection, its expiration should
> +     * be removed only once and added only once to timeout lists.
> +     */
> +    atomic_flag insert_once;
> +    atomic_flag remove_once;
> +    struct rculist node;
> +};
> +
>  struct conn {
>      /* Immutable data. */
>      struct conn_key key;
>      struct conn_key rev_key;
>      struct conn_key parent_key; /* Only used for orig_tuple support. */
> -    struct ovs_list exp_node;
>      struct cmap_node cm_node;
>      struct nat_action_info_t *nat_info;
>      char *alg;
> @@ -104,6 +143,7 @@ struct conn {
>
>      /* Mutable data. */
>      struct ovs_mutex lock; /* Guards all mutable fields. */
> +    struct conn_expire exp;
>      ovs_u128 label;
>      long long expiration;
>      uint32_t mark;
> @@ -132,33 +172,10 @@ enum ct_update_res {
>      CT_UPDATE_VALID_NEW,
>  };
>
> -/* Timeouts: all the possible timeout states passed to update_expiration()
> - * are listed here. The name will be prefix by CT_TM_ and the value is in
> - * milliseconds */
> -#define CT_TIMEOUTS \
> -    CT_TIMEOUT(TCP_FIRST_PACKET) \
> -    CT_TIMEOUT(TCP_OPENING) \
> -    CT_TIMEOUT(TCP_ESTABLISHED) \
> -    CT_TIMEOUT(TCP_CLOSING) \
> -    CT_TIMEOUT(TCP_FIN_WAIT) \
> -    CT_TIMEOUT(TCP_CLOSED) \
> -    CT_TIMEOUT(OTHER_FIRST) \
> -    CT_TIMEOUT(OTHER_MULTIPLE) \
> -    CT_TIMEOUT(OTHER_BIDIR) \
> -    CT_TIMEOUT(ICMP_FIRST) \
> -    CT_TIMEOUT(ICMP_REPLY)
> -
> -enum ct_timeout {
> -#define CT_TIMEOUT(NAME) CT_TM_##NAME,
> -    CT_TIMEOUTS
> -#undef CT_TIMEOUT
> -    N_CT_TM
> -};
> -
>  struct conntrack {
>      struct ovs_mutex ct_lock; /* Protects 2 following fields. */
>      struct cmap conns OVS_GUARDED;
> -    struct ovs_list exp_lists[N_CT_TM] OVS_GUARDED;
> +    struct rculist exp_lists[N_CT_TM] OVS_GUARDED;
>      struct hmap zone_limits OVS_GUARDED;
>      struct hmap timeout_policies OVS_GUARDED;
>      uint32_t hash_basis; /* Salt for hashing a connection key. */
> @@ -204,4 +221,13 @@ struct ct_l4_proto {
>                                 struct ct_dpif_protoinfo *);
>  };
>
> +static inline void
> +conn_expire_remove(struct conn_expire *exp)
> +{
> +    if (!atomic_flag_test_and_set(&exp->remove_once)
> +        && rculist_next(&exp->node)) {
> +        rculist_remove(&exp->node);
> +    }
> +}
> +
>  #endif /* conntrack-private.h */
> diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
> index a586d3a8d..30ba4bda8 100644
> --- a/lib/conntrack-tp.c
> +++ b/lib/conntrack-tp.c
> @@ -230,6 +230,50 @@ tm_to_ct_dpif_tp(enum ct_timeout tm)
>      return CT_DPIF_TP_ATTR_MAX;
>  }
>
> +static void
> +conn_expire_init(struct conn *conn, struct conntrack *ct)
> +{
> +    struct conn_expire *exp = &conn->exp;
> +
> +    if (exp->ct != NULL) {
> +        return;
> +    }
> +
> +    exp->ct = ct;
> +    atomic_flag_clear(&exp->insert_once);
> +    atomic_flag_clear(&exp->remove_once);
> +    /* The expiration is initially unscheduled, flag it as 'removed'. */
> +    atomic_flag_test_and_set(&exp->remove_once);
> +}
> +
> +static void
> +conn_expire_insert(struct conn *conn)
> +{
> +    struct conn_expire *exp = &conn->exp;
> +
> +    ovs_mutex_lock(&exp->ct->ct_lock);
> +    ovs_mutex_lock(&conn->lock);
> +
> +    rculist_push_back(&exp->ct->exp_lists[exp->tm], &exp->node);
> +    atomic_flag_clear(&exp->insert_once);
> +    atomic_flag_clear(&exp->remove_once);
> +
> +    ovs_mutex_unlock(&conn->lock);
> +    ovs_mutex_unlock(&exp->ct->ct_lock);
> +}
> +
> +static void
> +conn_schedule_expiration(struct conntrack *ct, struct conn *conn,
> +                         enum ct_timeout tm, long long now, uint32_t tp_value)
> +{
> +    conn_expire_init(conn, ct);
> +    conn->expiration = now + tp_value * 1000;
> +    conn->exp.tm = tm;
> +    if (!atomic_flag_test_and_set(&conn->exp.insert_once)) {

Does this mean we only insert the oldest one, not the latest one?
So pkt1 arrives at t1, pkt2 arrives later at t2, then we update using t1

> +        ovsrcu_postpone(conn_expire_insert, conn);
> +    }
> +}
> +

rest looks good to me


William


More information about the dev mailing list