[ovs-dev] [PATCH RFC v2] Conntrack: Avoid recirculation for established connections.

antonio.fischetti at intel.com antonio.fischetti at intel.com
Thu May 25 16:02:35 UTC 2017


From: Antonio Fischetti <antonio.fischetti at intel.com>

With conntrack enabled, packets get recirculated and this impacts
the performance with thousands of active concurrent connections.

This patch is aimed at avoiding recirculation for packets belonging to
established connections in steady state. This is achieved by
manipulating the megaflows and actions. In this case the actions of the
megaflow of recirculated packet is initially updated with new 'CT_SKIP'
action and later updated with the actions of the megaflow of the
original packet(to handle zones). There after the EMC entry
belonging to the original packet is updated to point to the 'megaflow' of
the recirculated packet there by avoiding recirculation.

Signed-off-by: Antonio Fischetti <antonio.fischetti at intel.com>
Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com>
Co-authored-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com>
---
- ~10% performance improvement is observed in our testing with UDP streams.
- Limited testing is done with RFC patch and the tests include hundreds of
  concurrent active TCP connections.
- This is very early implementation and we are posting here to get initial
  feedback.
- This RFC patch is implemented leveraging EMC, but optimization could be
  extended to dpcls as well to handle higher flows.
- No VXLAN testing is done with this patch.

 datapath/linux/compat/include/linux/openvswitch.h |   1 +
 lib/dpif-netdev.c                                 | 148 ++++++++++++++++++++--
 lib/packets.h                                     |   2 +
 3 files changed, 143 insertions(+), 8 deletions(-)

diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h
index d22102e..6dc5674 100644
--- a/datapath/linux/compat/include/linux/openvswitch.h
+++ b/datapath/linux/compat/include/linux/openvswitch.h
@@ -762,6 +762,7 @@ enum ovs_ct_attr {
 	OVS_CT_ATTR_NAT,        /* Nested OVS_NAT_ATTR_* */
 	OVS_CT_ATTR_FORCE_COMMIT,  /* No argument */
 	OVS_CT_ATTR_EVENTMASK,  /* u32 mask of IPCT_* events. */
+	OVS_CT_ATTR_SKIP,       /* No argument, does not invoke CT. */
 	__OVS_CT_ATTR_MAX
 };
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index b50164b..fbbb42e 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -2101,7 +2101,8 @@ emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
 }
 
 static inline struct dp_netdev_flow *
-emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
+emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key,
+        void **pfound_entry)
 {
     struct emc_entry *current_entry;
 
@@ -2110,11 +2111,13 @@ emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
             && emc_entry_alive(current_entry)
             && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
 
+            *pfound_entry = current_entry;
             /* We found the entry with the 'key->mf' miniflow */
             return current_entry->flow;
         }
     }
 
+    *pfound_entry = NULL;
     return NULL;
 }
 
@@ -4583,10 +4586,12 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
         key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
 
         /* If EMC is disabled skip emc_lookup */
-        flow = (cur_min == 0) ? NULL: emc_lookup(flow_cache, key);
+        flow = (cur_min == 0) ? NULL:
+                emc_lookup(flow_cache, key, &packet->md.prev_emc_entry);
         if (OVS_LIKELY(flow)) {
             dp_netdev_queue_batches(packet, flow, &key->mf, batches,
                                     n_batches);
+            packet->md.prev_flow = flow;
         } else {
             /* Exact match cache missed. Group missed packets together at
              * the beginning of the 'packets' array. */
@@ -4595,6 +4600,7 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
              * must be returned to the caller. The next key should be extracted
              * to 'keys[n_missed + 1]'. */
             key = &keys[++n_missed];
+            packet->md.prev_flow = NULL;
         }
     }
 
@@ -4604,6 +4610,9 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
     return dp_packet_batch_size(packets_);
 }
 
+#define CT_PREPEND_ACTION_LEN 0x0C
+#define CT_PREPEND_ACTION_SPARE_LEN 32
+#define CT_TOT_ACTIONS_LEN (CT_PREPEND_ACTION_LEN + CT_PREPEND_ACTION_SPARE_LEN)
 static inline void
 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
                      struct dp_packet *packet,
@@ -4663,12 +4672,44 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
         ovs_mutex_lock(&pmd->flow_mutex);
         netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
         if (OVS_LIKELY(!netdev_flow)) {
-            netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
-                                             add_actions->data,
-                                             add_actions->size);
+            /* Recirculated packets that belong to established connections
+             * are treated differently.  A place-holder is prepended to the
+             * list of actions. */
+            if (!(packet->md.ct_state & CS_INVALID) &&
+                 (packet->md.ct_state & CS_TRACKED) &&
+                 (packet->md.ct_state & CS_ESTABLISHED) &&
+                 (packet->md.recirc_id)) {
+
+                /* Prepend an OVS_CT_ATTR_SKIP to the list of actions inside
+                 * add_actions.  It does not really invoke the ConnTrack module,
+                 * it's a Ct action that works as a place-holder.  It will be
+                 * overwritten inside emc_patch_established_conn() with
+                 * the proper ct(zone=X) action. */
+                struct dp_netdev_actions *new_actions;
+                uint8_t tmp_action[CT_TOT_ACTIONS_LEN] = {
+                        0x0C, 0x00, 0x0C, 0x00,
+                        0x06, 0x00, 0x09, 0x00,
+                        0x00, 0x00, 0x00, 0x00,
+                        /* will append here add_actions. */
+                };
+                memcpy(&tmp_action[CT_PREPEND_ACTION_LEN],
+                        add_actions->data,
+                        add_actions->size);
+                new_actions = dp_netdev_actions_create(
+                        (const struct nlattr *)tmp_action,
+                        CT_PREPEND_ACTION_LEN + add_actions->size);
+                netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
+                                                 new_actions->actions,
+                                                 new_actions->size);
+            } else {
+                netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
+                                                 add_actions->data,
+                                                 add_actions->size);
+            }
         }
         ovs_mutex_unlock(&pmd->flow_mutex);
-        emc_probabilistic_insert(pmd, key, netdev_flow);
+        /* For this RFC, probabilistic emc insertion is disabled. */
+        emc_insert(&pmd->flow_cache, key, netdev_flow);
     }
 }
 
@@ -4761,7 +4802,8 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
 
         flow = dp_netdev_flow_cast(rules[i]);
 
-        emc_probabilistic_insert(pmd, &keys[i], flow);
+        /* For testing purposes the emc_insert is restored here. */
+        emc_insert(&pmd->flow_cache, &keys[i], flow);
         dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
     }
 
@@ -4998,6 +5040,78 @@ dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
     }
 }
 
+/* Search into EMC to retrieve the entry related to the original packet
+ * and the entry related to the recirculated packet.
+ * If both EMC entries are alive, then:
+ *  - The flow actions of the recirculated packet are updated with
+ *    'ct(zone=N)' as retrieved from the actions of the original flow.
+ *  - The EMC orig entry flow is updated with the flow pointer of recirc entry.
+ */
+static inline void
+emc_patch_established_conn(struct dp_netdev_pmd_thread *pmd,
+        struct dp_packet *packet, long long now OVS_UNUSED)
+{
+    struct emc_cache *cache = &pmd->flow_cache;
+    struct netdev_flow_key key;
+    struct emc_entry *orig_entry; /* EMC entry hit by the original packet. */
+    struct emc_entry *recirc_entry; /* EMC entry for recirculated packet. */
+    uint32_t old_hash;
+
+    if (!packet->md.prev_emc_entry) {
+        return;
+    }
+
+    orig_entry = packet->md.prev_emc_entry;
+    if (!emc_entry_alive(orig_entry)) {
+        return;
+    }
+
+    /* Check that the original EMC entry was not overwritten. */
+    struct dp_netdev_flow *orig_flow = orig_entry->flow;
+    if (packet->md.prev_flow && (packet->md.prev_flow != orig_flow)) {
+       return;
+    }
+
+    /* TODO as we are calling miniflow_extract now, can we avoid to invoke
+     * it again when we'll classify this recirculated packet? */
+    miniflow_extract(packet, &key.mf);
+    key.len = 0; /* Not computed yet. */
+    old_hash = dp_packet_get_rss_hash(packet);
+    key.hash = dpif_netdev_packet_get_rss_hash(packet, NULL);
+    dp_packet_set_rss_hash(packet, old_hash);
+
+    EMC_FOR_EACH_POS_WITH_HASH(cache, recirc_entry, key.hash) {
+        if (recirc_entry->key.hash == key.hash
+            && emc_entry_alive(recirc_entry)
+            && netdev_flow_key_equal_mf(&recirc_entry->key, &key.mf)) {
+
+            if (orig_entry->flow == recirc_entry->flow) {
+                /* Nothing to do, already patched by a packet of this
+                 * same batch. */
+                return;
+            }
+            /* We found the entry related to the recirculated packet.
+             * Retrieve the actions for orig and recirc entries. * */
+            struct dp_netdev_actions * orig_actions =
+                    dp_netdev_flow_get_actions(orig_entry->flow);
+            struct dp_netdev_actions * recirc_actions =
+                    dp_netdev_flow_get_actions(recirc_entry->flow);
+
+            /* The orig entry action contains a CT action with the zone info. */
+            struct nlattr *p = &orig_actions->actions[0];
+
+            /* Overwrite the CT Skip action of recirc entry with ct(zone=N). */
+            memcpy(recirc_actions->actions, p, p->nla_len);
+
+            /* Update orig EMC entry. */
+            orig_entry->flow = recirc_entry->flow;
+            dp_netdev_flow_ref(orig_entry->flow);
+
+            return;
+        }
+    }
+}
+
 static void
 dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
               const struct nlattr *a, bool may_steal)
@@ -5024,7 +5138,6 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
             } else {
                 tx_qid = pmd->static_tx_qid;
             }
-
             netdev_send(p->port->netdev, tx_qid, packets_, may_steal,
                         dynamic_txqs);
             return;
@@ -5145,10 +5258,21 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
             struct dp_packet *packet;
             DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
                 packet->md.recirc_id = nl_attr_get_u32(a);
+
+                if (!(packet->md.ct_state & CS_INVALID) &&
+                     (packet->md.ct_state & CS_TRACKED) &&
+                     (packet->md.ct_state & CS_ESTABLISHED)) {
+                    (*depth)++;
+                    emc_patch_established_conn(pmd, packet, now);
+                    (*depth)--;
+                }
             }
 
             (*depth)++;
             dp_netdev_recirculate(pmd, packets_);
+            DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
+                packet->md.recirc_id = 0;
+            }
             (*depth)--;
 
             return;
@@ -5166,6 +5290,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
         const char *helper = NULL;
         const uint32_t *setmark = NULL;
         const struct ovs_key_ct_labels *setlabel = NULL;
+        bool skip_ct = false;
 
         NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
                                  nl_attr_get_size(a)) {
@@ -5194,6 +5319,9 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
                 /* Silently ignored, as userspace datapath does not generate
                  * netlink events. */
                 break;
+            case OVS_CT_ATTR_SKIP:
+                skip_ct = true;
+                break;
             case OVS_CT_ATTR_NAT:
             case OVS_CT_ATTR_UNSPEC:
             case __OVS_CT_ATTR_MAX:
@@ -5201,6 +5329,10 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
             }
         }
 
+        if (OVS_UNLIKELY(skip_ct)) {
+            break;
+        }
+
         conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, force,
                           commit, zone, setmark, setlabel, helper);
         break;
diff --git a/lib/packets.h b/lib/packets.h
index 7dbf6dd..2c46a15 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -112,6 +112,8 @@ struct pkt_metadata {
     struct flow_tnl tunnel;     /* Encapsulating tunnel parameters. Note that
                                  * if 'ip_dst' == 0, the rest of the fields may
                                  * be uninitialized. */
+    void *prev_emc_entry;       /* EMC entry that this packet matched. */
+    void *prev_flow;            /* Flow pointed by the matching EMC entry. */
 };
 
 static inline void
-- 
2.4.11



More information about the dev mailing list