[ovs-dev] [PATCH RFC] Conntrack: Avoid recirculation for established connections.
Greg Rose
gvrose8192 at gmail.com
Thu May 25 14:08:01 UTC 2017
On Thu, 2017-05-25 at 13:36 +0000, Fischetti, Antonio wrote:
> This patch doesn't seem to be in Patchwork.
>
Perhaps because it was sent as RFC?
- Greg
> > -----Original Message-----
> > From: ovs-dev-bounces at openvswitch.org [mailto:ovs-dev-
> > bounces at openvswitch.org] On Behalf Of antonio.fischetti at intel.com
> > Sent: Thursday, May 25, 2017 12:01 PM
> > To: dev at openvswitch.org
> > Subject: [ovs-dev] [PATCH RFC] Conntrack: Avoid recirculation for
> > established connections.
> >
> > From: Antonio Fischetti <antonio.fischetti at intel.com>
> >
> > With conntrack enabled, packets get recirculated and this impacts
> > the performance with thousands of active concurrent connections.
> >
> > This patch is aimed at avoiding recirculation for packets belonging to
> > established connections in steady state. This is achieved by
> > manipulating the megaflows and actions. In this case the actions of the
> > megaflow of recirculated packet is initially updated with new 'CT_SKIP'
> > action and later updated with the actions of the megaflow of the
> > original packet(to handle zones). There after the EMC entry
> > belonging to the original packet is updated to point to the 'megaflow' of
> > the recirculated packet there by avoiding recirculation.
> >
> > Signed-off-by: Antonio Fischetti <antonio.fischetti at intel.com>
> > Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com>
> > Co-authored-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com>
> > ---
> > - ~10% performance improvement is observed in our testing with UDP
> > streams.
> > - Limited testing is done with RFC patch and the tests include hundreds of
> > concurrent active TCP connections.
> > - This is very early implementation and we are posting here to get initial
> > feedback.
> > - This RFC patch is implemented leveraging EMC, but optimization could be
> > extended to dpcls as well to handle higher flows.
> > - No VXLAN testing is done with this patch.
> >
> > datapath/linux/compat/include/linux/openvswitch.h | 1 +
> > lib/dpif-netdev.c | 148
> > ++++++++++++++++++++--
> > lib/packets.h | 2 +
> > 3 files changed, 143 insertions(+), 8 deletions(-)
> >
> > diff --git a/datapath/linux/compat/include/linux/openvswitch.h
> > b/datapath/linux/compat/include/linux/openvswitch.h
> > index d22102e..6dc5674 100644
> > --- a/datapath/linux/compat/include/linux/openvswitch.h
> > +++ b/datapath/linux/compat/include/linux/openvswitch.h
> > @@ -762,6 +762,7 @@ enum ovs_ct_attr {
> > OVS_CT_ATTR_NAT, /* Nested OVS_NAT_ATTR_* */
> > OVS_CT_ATTR_FORCE_COMMIT, /* No argument */
> > OVS_CT_ATTR_EVENTMASK, /* u32 mask of IPCT_* events. */
> > + OVS_CT_ATTR_SKIP, /* No argument, does not invoke CT. */
> > __OVS_CT_ATTR_MAX
> > };
> >
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index b50164b..fbbb42e 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -2101,7 +2101,8 @@ emc_probabilistic_insert(struct dp_netdev_pmd_thread
> > *pmd,
> > }
> >
> > static inline struct dp_netdev_flow *
> > -emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
> > +emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key,
> > + void **pfound_entry)
> > {
> > struct emc_entry *current_entry;
> >
> > @@ -2110,11 +2111,13 @@ emc_lookup(struct emc_cache *cache, const struct
> > netdev_flow_key *key)
> > && emc_entry_alive(current_entry)
> > && netdev_flow_key_equal_mf(¤t_entry->key, &key->mf)) {
> >
> > + *pfound_entry = current_entry;
> > /* We found the entry with the 'key->mf' miniflow */
> > return current_entry->flow;
> > }
> > }
> >
> > + *pfound_entry = NULL;
> > return NULL;
> > }
> >
> > @@ -4583,10 +4586,12 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
> > key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
> >
> > /* If EMC is disabled skip emc_lookup */
> > - flow = (cur_min == 0) ? NULL: emc_lookup(flow_cache, key);
> > + flow = (cur_min == 0) ? NULL:
> > + emc_lookup(flow_cache, key, &packet->md.prev_emc_entry);
> > if (OVS_LIKELY(flow)) {
> > dp_netdev_queue_batches(packet, flow, &key->mf, batches,
> > n_batches);
> > + packet->md.prev_flow = flow;
> > } else {
> > /* Exact match cache missed. Group missed packets together at
> > * the beginning of the 'packets' array. */
> > @@ -4595,6 +4600,7 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
> > * must be returned to the caller. The next key should be
> > extracted
> > * to 'keys[n_missed + 1]'. */
> > key = &keys[++n_missed];
> > + packet->md.prev_flow = NULL;
> > }
> > }
> >
> > @@ -4604,6 +4610,9 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
> > return dp_packet_batch_size(packets_);
> > }
> >
> > +#define CT_PREPEND_ACTION_LEN 0x0C
> > +#define CT_PREPEND_ACTION_SPARE_LEN 32
> > +#define CT_TOT_ACTIONS_LEN (CT_PREPEND_ACTION_LEN +
> > CT_PREPEND_ACTION_SPARE_LEN)
> > static inline void
> > handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
> > struct dp_packet *packet,
> > @@ -4663,12 +4672,44 @@ handle_packet_upcall(struct dp_netdev_pmd_thread
> > *pmd,
> > ovs_mutex_lock(&pmd->flow_mutex);
> > netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
> > if (OVS_LIKELY(!netdev_flow)) {
> > - netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
> > - add_actions->data,
> > - add_actions->size);
> > + /* Recirculated packets that belong to established
> > connections
> > + * are treated differently. A place-holder is prepended to
> > the
> > + * list of actions. */
> > + if (!(packet->md.ct_state & CS_INVALID) &&
> > + (packet->md.ct_state & CS_TRACKED) &&
> > + (packet->md.ct_state & CS_ESTABLISHED) &&
> > + (packet->md.recirc_id)) {
> > +
> > + /* Prepend an OVS_CT_ATTR_SKIP to the list of actions
> > inside
> > + * add_actions. It does not really invoke the ConnTrack
> > module,
> > + * it's a Ct action that works as a place-holder. It
> > will be
> > + * overwritten inside emc_patch_established_conn() with
> > + * the proper ct(zone=X) action. */
> > + struct dp_netdev_actions *new_actions;
> > + uint8_t tmp_action[CT_TOT_ACTIONS_LEN] = {
> > + 0x0C, 0x00, 0x0C, 0x00,
> > + 0x06, 0x00, 0x09, 0x00,
> > + 0x00, 0x00, 0x00, 0x00,
> > + /* will append here add_actions. */
> > + };
> > + memcpy(&tmp_action[CT_PREPEND_ACTION_LEN],
> > + add_actions->data,
> > + add_actions->size);
> > + new_actions = dp_netdev_actions_create(
> > + (const struct nlattr *)tmp_action,
> > + CT_PREPEND_ACTION_LEN + add_actions->size);
> > + netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
> > + new_actions->actions,
> > + new_actions->size);
> > + } else {
> > + netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
> > + add_actions->data,
> > + add_actions->size);
> > + }
> > }
> > ovs_mutex_unlock(&pmd->flow_mutex);
> > - emc_probabilistic_insert(pmd, key, netdev_flow);
> > + /* For this RFC, probabilistic emc insertion is disabled. */
> > + emc_insert(&pmd->flow_cache, key, netdev_flow);
> > }
> > }
> >
> > @@ -4761,7 +4802,8 @@ fast_path_processing(struct dp_netdev_pmd_thread
> > *pmd,
> >
> > flow = dp_netdev_flow_cast(rules[i]);
> >
> > - emc_probabilistic_insert(pmd, &keys[i], flow);
> > + /* For testing purposes the emc_insert is restored here. */
> > + emc_insert(&pmd->flow_cache, &keys[i], flow);
> > dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches,
> > n_batches);
> > }
> >
> > @@ -4998,6 +5040,78 @@ dp_execute_userspace_action(struct
> > dp_netdev_pmd_thread *pmd,
> > }
> > }
> >
> > +/* Search into EMC to retrieve the entry related to the original packet
> > + * and the entry related to the recirculated packet.
> > + * If both EMC entries are alive, then:
> > + * - The flow actions of the recirculated packet are updated with
> > + * 'ct(zone=N)' as retrieved from the actions of the original flow.
> > + * - The EMC orig entry flow is updated with the flow pointer of recirc
> > entry.
> > + */
> > +static inline void
> > +emc_patch_established_conn(struct dp_netdev_pmd_thread *pmd,
> > + struct dp_packet *packet, long long now OVS_UNUSED)
> > +{
> > + struct emc_cache *cache = &pmd->flow_cache;
> > + struct netdev_flow_key key;
> > + struct emc_entry *orig_entry; /* EMC entry hit by the original
> > packet. */
> > + struct emc_entry *recirc_entry; /* EMC entry for recirculated packet.
> > */
> > + uint32_t old_hash;
> > +
> > + if (!packet->md.prev_emc_entry) {
> > + return;
> > + }
> > +
> > + orig_entry = packet->md.prev_emc_entry;
> > + if (!emc_entry_alive(orig_entry)) {
> > + return;
> > + }
> > +
> > + /* Check that the original EMC entry was not overwritten. */
> > + struct dp_netdev_flow *orig_flow = orig_entry->flow;
> > + if (packet->md.prev_flow && (packet->md.prev_flow != orig_flow)) {
> > + return;
> > + }
> > +
> > + /* TODO as we are calling miniflow_extract now, can we avoid to
> > invoke
> > + * it again when we'll classify this recirculated packet? */
> > + miniflow_extract(packet, &key.mf);
> > + key.len = 0; /* Not computed yet. */
> > + old_hash = dp_packet_get_rss_hash(packet);
> > + key.hash = dpif_netdev_packet_get_rss_hash(packet, NULL);
> > + dp_packet_set_rss_hash(packet, old_hash);
> > +
> > + EMC_FOR_EACH_POS_WITH_HASH(cache, recirc_entry, key.hash) {
> > + if (recirc_entry->key.hash == key.hash
> > + && emc_entry_alive(recirc_entry)
> > + && netdev_flow_key_equal_mf(&recirc_entry->key, &key.mf)) {
> > +
> > + if (orig_entry->flow == recirc_entry->flow) {
> > + /* Nothing to do, already patched by a packet of this
> > + * same batch. */
> > + return;
> > + }
> > + /* We found the entry related to the recirculated packet.
> > + * Retrieve the actions for orig and recirc entries. * */
> > + struct dp_netdev_actions * orig_actions =
> > + dp_netdev_flow_get_actions(orig_entry->flow);
> > + struct dp_netdev_actions * recirc_actions =
> > + dp_netdev_flow_get_actions(recirc_entry->flow);
> > +
> > + /* The orig entry action contains a CT action with the zone
> > info. */
> > + struct nlattr *p = &orig_actions->actions[0];
> > +
> > + /* Overwrite the CT Skip action of recirc entry with
> > ct(zone=N). */
> > + memcpy(recirc_actions->actions, p, p->nla_len);
> > +
> > + /* Update orig EMC entry. */
> > + orig_entry->flow = recirc_entry->flow;
> > + dp_netdev_flow_ref(orig_entry->flow);
> > +
> > + return;
> > + }
> > + }
> > +}
> > +
> > static void
> > dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
> > const struct nlattr *a, bool may_steal)
> > @@ -5024,7 +5138,6 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
> > *packets_,
> > } else {
> > tx_qid = pmd->static_tx_qid;
> > }
> > -
> > netdev_send(p->port->netdev, tx_qid, packets_, may_steal,
> > dynamic_txqs);
> > return;
> > @@ -5145,10 +5258,21 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
> > *packets_,
> > struct dp_packet *packet;
> > DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
> > packet->md.recirc_id = nl_attr_get_u32(a);
> > +
> > + if (!(packet->md.ct_state & CS_INVALID) &&
> > + (packet->md.ct_state & CS_TRACKED) &&
> > + (packet->md.ct_state & CS_ESTABLISHED)) {
> > + (*depth)++;
> > + emc_patch_established_conn(pmd, packet, now);
> > + (*depth)--;
> > + }
> > }
> >
> > (*depth)++;
> > dp_netdev_recirculate(pmd, packets_);
> > + DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
> > + packet->md.recirc_id = 0;
> > + }
> > (*depth)--;
> >
> > return;
> > @@ -5166,6 +5290,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
> > *packets_,
> > const char *helper = NULL;
> > const uint32_t *setmark = NULL;
> > const struct ovs_key_ct_labels *setlabel = NULL;
> > + bool skip_ct = false;
> >
> > NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
> > nl_attr_get_size(a)) {
> > @@ -5194,6 +5319,9 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
> > *packets_,
> > /* Silently ignored, as userspace datapath does not
> > generate
> > * netlink events. */
> > break;
> > + case OVS_CT_ATTR_SKIP:
> > + skip_ct = true;
> > + break;
> > case OVS_CT_ATTR_NAT:
> > case OVS_CT_ATTR_UNSPEC:
> > case __OVS_CT_ATTR_MAX:
> > @@ -5201,6 +5329,10 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
> > *packets_,
> > }
> > }
> >
> > + if (OVS_UNLIKELY(skip_ct)) {
> > + break;
> > + }
> > +
> > conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type,
> > force,
> > commit, zone, setmark, setlabel, helper);
> > break;
> > diff --git a/lib/packets.h b/lib/packets.h
> > index 7dbf6dd..2c46a15 100644
> > --- a/lib/packets.h
> > +++ b/lib/packets.h
> > @@ -112,6 +112,8 @@ struct pkt_metadata {
> > struct flow_tnl tunnel; /* Encapsulating tunnel parameters. Note
> > that
> > * if 'ip_dst' == 0, the rest of the
> > fields may
> > * be uninitialized. */
> > + void *prev_emc_entry; /* EMC entry that this packet matched. */
> > + void *prev_flow; /* Flow pointed by the matching EMC
> > entry. */
> > };
> >
> > static inline void
> > --
> > 2.4.11
> >
> > _______________________________________________
> > dev mailing list
> > dev at openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
More information about the dev
mailing list