[ovs-dev] [PATCH RFC] Conntrack: Avoid recirculation for established connections.

Greg Rose gvrose8192 at gmail.com
Thu May 25 14:08:01 UTC 2017


On Thu, 2017-05-25 at 13:36 +0000, Fischetti, Antonio wrote:
> This patch doesn't seem to be in Patchwork.
> 

Perhaps because it was sent as RFC?

- Greg

> > -----Original Message-----
> > From: ovs-dev-bounces at openvswitch.org [mailto:ovs-dev-
> > bounces at openvswitch.org] On Behalf Of antonio.fischetti at intel.com
> > Sent: Thursday, May 25, 2017 12:01 PM
> > To: dev at openvswitch.org
> > Subject: [ovs-dev] [PATCH RFC] Conntrack: Avoid recirculation for
> > established connections.
> > 
> > From: Antonio Fischetti <antonio.fischetti at intel.com>
> > 
> > With conntrack enabled, packets get recirculated and this impacts
> > the performance with thousands of active concurrent connections.
> > 
> > This patch is aimed at avoiding recirculation for packets belonging to
> > established connections in steady state. This is achieved by
> > manipulating the megaflows and actions. In this case the actions of the
> > megaflow of recirculated packet is initially updated with new 'CT_SKIP'
> > action and later updated with the actions of the megaflow of the
> > original packet(to handle zones). There after the EMC entry
> > belonging to the original packet is updated to point to the 'megaflow' of
> > the recirculated packet there by avoiding recirculation.
> > 
> > Signed-off-by: Antonio Fischetti <antonio.fischetti at intel.com>
> > Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com>
> > Co-authored-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy at intel.com>
> > ---
> > - ~10% performance improvement is observed in our testing with UDP
> > streams.
> > - Limited testing is done with RFC patch and the tests include hundreds of
> >   concurrent active TCP connections.
> > - This is very early implementation and we are posting here to get initial
> >   feedback.
> > - This RFC patch is implemented leveraging EMC, but optimization could be
> >   extended to dpcls as well to handle higher flows.
> > - No VXLAN testing is done with this patch.
> > 
> >  datapath/linux/compat/include/linux/openvswitch.h |   1 +
> >  lib/dpif-netdev.c                                 | 148
> > ++++++++++++++++++++--
> >  lib/packets.h                                     |   2 +
> >  3 files changed, 143 insertions(+), 8 deletions(-)
> > 
> > diff --git a/datapath/linux/compat/include/linux/openvswitch.h
> > b/datapath/linux/compat/include/linux/openvswitch.h
> > index d22102e..6dc5674 100644
> > --- a/datapath/linux/compat/include/linux/openvswitch.h
> > +++ b/datapath/linux/compat/include/linux/openvswitch.h
> > @@ -762,6 +762,7 @@ enum ovs_ct_attr {
> >  	OVS_CT_ATTR_NAT,        /* Nested OVS_NAT_ATTR_* */
> >  	OVS_CT_ATTR_FORCE_COMMIT,  /* No argument */
> >  	OVS_CT_ATTR_EVENTMASK,  /* u32 mask of IPCT_* events. */
> > +	OVS_CT_ATTR_SKIP,       /* No argument, does not invoke CT. */
> >  	__OVS_CT_ATTR_MAX
> >  };
> > 
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index b50164b..fbbb42e 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -2101,7 +2101,8 @@ emc_probabilistic_insert(struct dp_netdev_pmd_thread
> > *pmd,
> >  }
> > 
> >  static inline struct dp_netdev_flow *
> > -emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
> > +emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key,
> > +        void **pfound_entry)
> >  {
> >      struct emc_entry *current_entry;
> > 
> > @@ -2110,11 +2111,13 @@ emc_lookup(struct emc_cache *cache, const struct
> > netdev_flow_key *key)
> >              && emc_entry_alive(current_entry)
> >              && netdev_flow_key_equal_mf(&current_entry->key, &key->mf)) {
> > 
> > +            *pfound_entry = current_entry;
> >              /* We found the entry with the 'key->mf' miniflow */
> >              return current_entry->flow;
> >          }
> >      }
> > 
> > +    *pfound_entry = NULL;
> >      return NULL;
> >  }
> > 
> > @@ -4583,10 +4586,12 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
> >          key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
> > 
> >          /* If EMC is disabled skip emc_lookup */
> > -        flow = (cur_min == 0) ? NULL: emc_lookup(flow_cache, key);
> > +        flow = (cur_min == 0) ? NULL:
> > +                emc_lookup(flow_cache, key, &packet->md.prev_emc_entry);
> >          if (OVS_LIKELY(flow)) {
> >              dp_netdev_queue_batches(packet, flow, &key->mf, batches,
> >                                      n_batches);
> > +            packet->md.prev_flow = flow;
> >          } else {
> >              /* Exact match cache missed. Group missed packets together at
> >               * the beginning of the 'packets' array. */
> > @@ -4595,6 +4600,7 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
> >               * must be returned to the caller. The next key should be
> > extracted
> >               * to 'keys[n_missed + 1]'. */
> >              key = &keys[++n_missed];
> > +            packet->md.prev_flow = NULL;
> >          }
> >      }
> > 
> > @@ -4604,6 +4610,9 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
> >      return dp_packet_batch_size(packets_);
> >  }
> > 
> > +#define CT_PREPEND_ACTION_LEN 0x0C
> > +#define CT_PREPEND_ACTION_SPARE_LEN 32
> > +#define CT_TOT_ACTIONS_LEN (CT_PREPEND_ACTION_LEN +
> > CT_PREPEND_ACTION_SPARE_LEN)
> >  static inline void
> >  handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
> >                       struct dp_packet *packet,
> > @@ -4663,12 +4672,44 @@ handle_packet_upcall(struct dp_netdev_pmd_thread
> > *pmd,
> >          ovs_mutex_lock(&pmd->flow_mutex);
> >          netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
> >          if (OVS_LIKELY(!netdev_flow)) {
> > -            netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
> > -                                             add_actions->data,
> > -                                             add_actions->size);
> > +            /* Recirculated packets that belong to established
> > connections
> > +             * are treated differently.  A place-holder is prepended to
> > the
> > +             * list of actions. */
> > +            if (!(packet->md.ct_state & CS_INVALID) &&
> > +                 (packet->md.ct_state & CS_TRACKED) &&
> > +                 (packet->md.ct_state & CS_ESTABLISHED) &&
> > +                 (packet->md.recirc_id)) {
> > +
> > +                /* Prepend an OVS_CT_ATTR_SKIP to the list of actions
> > inside
> > +                 * add_actions.  It does not really invoke the ConnTrack
> > module,
> > +                 * it's a Ct action that works as a place-holder.  It
> > will be
> > +                 * overwritten inside emc_patch_established_conn() with
> > +                 * the proper ct(zone=X) action. */
> > +                struct dp_netdev_actions *new_actions;
> > +                uint8_t tmp_action[CT_TOT_ACTIONS_LEN] = {
> > +                        0x0C, 0x00, 0x0C, 0x00,
> > +                        0x06, 0x00, 0x09, 0x00,
> > +                        0x00, 0x00, 0x00, 0x00,
> > +                        /* will append here add_actions. */
> > +                };
> > +                memcpy(&tmp_action[CT_PREPEND_ACTION_LEN],
> > +                        add_actions->data,
> > +                        add_actions->size);
> > +                new_actions = dp_netdev_actions_create(
> > +                        (const struct nlattr *)tmp_action,
> > +                        CT_PREPEND_ACTION_LEN + add_actions->size);
> > +                netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
> > +                                                 new_actions->actions,
> > +                                                 new_actions->size);
> > +            } else {
> > +                netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
> > +                                                 add_actions->data,
> > +                                                 add_actions->size);
> > +            }
> >          }
> >          ovs_mutex_unlock(&pmd->flow_mutex);
> > -        emc_probabilistic_insert(pmd, key, netdev_flow);
> > +        /* For this RFC, probabilistic emc insertion is disabled. */
> > +        emc_insert(&pmd->flow_cache, key, netdev_flow);
> >      }
> >  }
> > 
> > @@ -4761,7 +4802,8 @@ fast_path_processing(struct dp_netdev_pmd_thread
> > *pmd,
> > 
> >          flow = dp_netdev_flow_cast(rules[i]);
> > 
> > -        emc_probabilistic_insert(pmd, &keys[i], flow);
> > +        /* For testing purposes the emc_insert is restored here. */
> > +        emc_insert(&pmd->flow_cache, &keys[i], flow);
> >          dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches,
> > n_batches);
> >      }
> > 
> > @@ -4998,6 +5040,78 @@ dp_execute_userspace_action(struct
> > dp_netdev_pmd_thread *pmd,
> >      }
> >  }
> > 
> > +/* Search into EMC to retrieve the entry related to the original packet
> > + * and the entry related to the recirculated packet.
> > + * If both EMC entries are alive, then:
> > + *  - The flow actions of the recirculated packet are updated with
> > + *    'ct(zone=N)' as retrieved from the actions of the original flow.
> > + *  - The EMC orig entry flow is updated with the flow pointer of recirc
> > entry.
> > + */
> > +static inline void
> > +emc_patch_established_conn(struct dp_netdev_pmd_thread *pmd,
> > +        struct dp_packet *packet, long long now OVS_UNUSED)
> > +{
> > +    struct emc_cache *cache = &pmd->flow_cache;
> > +    struct netdev_flow_key key;
> > +    struct emc_entry *orig_entry; /* EMC entry hit by the original
> > packet. */
> > +    struct emc_entry *recirc_entry; /* EMC entry for recirculated packet.
> > */
> > +    uint32_t old_hash;
> > +
> > +    if (!packet->md.prev_emc_entry) {
> > +        return;
> > +    }
> > +
> > +    orig_entry = packet->md.prev_emc_entry;
> > +    if (!emc_entry_alive(orig_entry)) {
> > +        return;
> > +    }
> > +
> > +    /* Check that the original EMC entry was not overwritten. */
> > +    struct dp_netdev_flow *orig_flow = orig_entry->flow;
> > +    if (packet->md.prev_flow && (packet->md.prev_flow != orig_flow)) {
> > +       return;
> > +    }
> > +
> > +    /* TODO as we are calling miniflow_extract now, can we avoid to
> > invoke
> > +     * it again when we'll classify this recirculated packet? */
> > +    miniflow_extract(packet, &key.mf);
> > +    key.len = 0; /* Not computed yet. */
> > +    old_hash = dp_packet_get_rss_hash(packet);
> > +    key.hash = dpif_netdev_packet_get_rss_hash(packet, NULL);
> > +    dp_packet_set_rss_hash(packet, old_hash);
> > +
> > +    EMC_FOR_EACH_POS_WITH_HASH(cache, recirc_entry, key.hash) {
> > +        if (recirc_entry->key.hash == key.hash
> > +            && emc_entry_alive(recirc_entry)
> > +            && netdev_flow_key_equal_mf(&recirc_entry->key, &key.mf)) {
> > +
> > +            if (orig_entry->flow == recirc_entry->flow) {
> > +                /* Nothing to do, already patched by a packet of this
> > +                 * same batch. */
> > +                return;
> > +            }
> > +            /* We found the entry related to the recirculated packet.
> > +             * Retrieve the actions for orig and recirc entries. * */
> > +            struct dp_netdev_actions * orig_actions =
> > +                    dp_netdev_flow_get_actions(orig_entry->flow);
> > +            struct dp_netdev_actions * recirc_actions =
> > +                    dp_netdev_flow_get_actions(recirc_entry->flow);
> > +
> > +            /* The orig entry action contains a CT action with the zone
> > info. */
> > +            struct nlattr *p = &orig_actions->actions[0];
> > +
> > +            /* Overwrite the CT Skip action of recirc entry with
> > ct(zone=N). */
> > +            memcpy(recirc_actions->actions, p, p->nla_len);
> > +
> > +            /* Update orig EMC entry. */
> > +            orig_entry->flow = recirc_entry->flow;
> > +            dp_netdev_flow_ref(orig_entry->flow);
> > +
> > +            return;
> > +        }
> > +    }
> > +}
> > +
> >  static void
> >  dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
> >                const struct nlattr *a, bool may_steal)
> > @@ -5024,7 +5138,6 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
> > *packets_,
> >              } else {
> >                  tx_qid = pmd->static_tx_qid;
> >              }
> > -
> >              netdev_send(p->port->netdev, tx_qid, packets_, may_steal,
> >                          dynamic_txqs);
> >              return;
> > @@ -5145,10 +5258,21 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
> > *packets_,
> >              struct dp_packet *packet;
> >              DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
> >                  packet->md.recirc_id = nl_attr_get_u32(a);
> > +
> > +                if (!(packet->md.ct_state & CS_INVALID) &&
> > +                     (packet->md.ct_state & CS_TRACKED) &&
> > +                     (packet->md.ct_state & CS_ESTABLISHED)) {
> > +                    (*depth)++;
> > +                    emc_patch_established_conn(pmd, packet, now);
> > +                    (*depth)--;
> > +                }
> >              }
> > 
> >              (*depth)++;
> >              dp_netdev_recirculate(pmd, packets_);
> > +            DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
> > +                packet->md.recirc_id = 0;
> > +            }
> >              (*depth)--;
> > 
> >              return;
> > @@ -5166,6 +5290,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
> > *packets_,
> >          const char *helper = NULL;
> >          const uint32_t *setmark = NULL;
> >          const struct ovs_key_ct_labels *setlabel = NULL;
> > +        bool skip_ct = false;
> > 
> >          NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
> >                                   nl_attr_get_size(a)) {
> > @@ -5194,6 +5319,9 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
> > *packets_,
> >                  /* Silently ignored, as userspace datapath does not
> > generate
> >                   * netlink events. */
> >                  break;
> > +            case OVS_CT_ATTR_SKIP:
> > +                skip_ct = true;
> > +                break;
> >              case OVS_CT_ATTR_NAT:
> >              case OVS_CT_ATTR_UNSPEC:
> >              case __OVS_CT_ATTR_MAX:
> > @@ -5201,6 +5329,10 @@ dp_execute_cb(void *aux_, struct dp_packet_batch
> > *packets_,
> >              }
> >          }
> > 
> > +        if (OVS_UNLIKELY(skip_ct)) {
> > +            break;
> > +        }
> > +
> >          conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type,
> > force,
> >                            commit, zone, setmark, setlabel, helper);
> >          break;
> > diff --git a/lib/packets.h b/lib/packets.h
> > index 7dbf6dd..2c46a15 100644
> > --- a/lib/packets.h
> > +++ b/lib/packets.h
> > @@ -112,6 +112,8 @@ struct pkt_metadata {
> >      struct flow_tnl tunnel;     /* Encapsulating tunnel parameters. Note
> > that
> >                                   * if 'ip_dst' == 0, the rest of the
> > fields may
> >                                   * be uninitialized. */
> > +    void *prev_emc_entry;       /* EMC entry that this packet matched. */
> > +    void *prev_flow;            /* Flow pointed by the matching EMC
> > entry. */
> >  };
> > 
> >  static inline void
> > --
> > 2.4.11
> > 
> > _______________________________________________
> > dev mailing list
> > dev at openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev





More information about the dev mailing list