[ovs-dev] [RFC PATCH v2] Pipeline packet processing in OVS using FVL flow director.

Mark Bloch markb at mellanox.com
Thu Feb 9 11:28:16 UTC 2017


Hi Sugesh,

First of all, thanks for the respin of this patch.

The overall idea of using HW classification abilities to do offloads
is something we would also like to tackle, so thank you for being first!  

I have a few comments, please see inline.

On 2/6/2017 7:39 PM, Sugesh Chandran wrote:
> The patch uses a pipeline model of packet processing in dpif-netdev.
> Packets are processed by either normal or hardware pipeline based on flow-
> director ID on the packet. The extendable model allows to enable any type
> of partial offloads(Packet types, flow director) in a need basis.
> The patch improved VxLAN decapsulation performance by ~62% .As a caveat, the
> default software path performance is reduced by ~6-7% due to the
> pipeline handling overhead.
> 
> Following major changes are introduced by this commit.
> 
> 1) Added a netdev specific pipeline selection logic on packet reception.
> 2) Every packet carries a 64 bit pipeline metadata over the lifetime of packet
> in OVS.
> 3) Selective miniflow extract logic is introduced for pipeline based miniflow
> extract.
> 4) Flow insert logic is modified to replace the default s/w rules of vxlan
> tunneling with (h/w + s/w) rules.
> 5) Action logic is modified to perform the light-weight tunnel pop for the
> hardware pipeline.
> 
> TODO::
> 1) Flow insertions are handled by the PMD threads at the moment. Its necessary
> to define a handler thread to install hw+s/w flows to avoid blocking PMD
> threads for longer period.
> 2) Hardware flow deletion is not implemented in this patch. Can add when the
> flow insert approach is finalized.
> 3) hardware specific rules are populated only at emc level. dp_classifier also
> need to be populated accordingly.
> 
> Signed-off-by: Sugesh Chandran <sugesh.chandran at intel.com>
> 
> ---
> v2
> Rebased to latest OVS source tree
> ---
>   include/openvswitch/flow.h    |   9 +-
>   include/openvswitch/packets.h |  15 ++-
>   lib/automake.mk               |   8 +-
>   lib/dpdk-i40e-ofld.c          | 207 ++++++++++++++++++++++++++++++++++++
>   lib/dpdk-i40e-ofld.h          |  73 +++++++++++++
>   lib/dpif-netdev.c             | 237 ++++++++++++++++++++++++++++++++++++++----
>   lib/flow.c                    |  71 +++++++++++--
>   lib/hw-pipeline.c             |  75 +++++++++++++
>   lib/hw-pipeline.h             |  52 +++++++++
>   lib/match.c                   |   2 +-
>   lib/netdev-bsd.c              |   1 +
>   lib/netdev-dpdk.c             |  34 +++++-
>   lib/netdev-dummy.c            |   1 +
>   lib/netdev-linux.c            |   1 +
>   lib/netdev-native-tnl.c       |  42 ++++++++
>   lib/netdev-native-tnl.h       |   2 +-
>   lib/netdev-provider.h         |   6 ++
>   lib/netdev-vport.c            |   3 +-
>   lib/nx-match.c                |   2 +-
>   lib/odp-util.h                |   2 +-
>   lib/ofp-util.c                |   2 +-
>   lib/packets.h                 |   8 ++
>   ofproto/ofproto-dpif-rid.h    |   2 +-
>   ofproto/ofproto-dpif-xlate.c  |   2 +-
>   24 files changed, 801 insertions(+), 56 deletions(-)
>   create mode 100644 lib/dpdk-i40e-ofld.c
>   create mode 100644 lib/dpdk-i40e-ofld.h
>   create mode 100644 lib/hw-pipeline.c
>   create mode 100644 lib/hw-pipeline.h
> 
> diff --git a/include/openvswitch/flow.h b/include/openvswitch/flow.h
> index df80dfe..3639fc0 100644
> --- a/include/openvswitch/flow.h
> +++ b/include/openvswitch/flow.h
> @@ -23,7 +23,7 @@
>   /* This sequence number should be incremented whenever anything involving flows
>    * or the wildcarding of flows changes.  This will cause build assertion
>    * failures in places which likely need to be updated. */
> -#define FLOW_WC_SEQ 36
> +#define FLOW_WC_SEQ 37
> 
>   /* Number of Open vSwitch extension 32-bit registers. */
>   #define FLOW_N_REGS 16
> @@ -99,6 +99,9 @@ struct flow {
>       uint32_t conj_id;           /* Conjunction ID. */
>       ofp_port_t actset_output;   /* Output port in action set. */
> 
> +    uint16_t pipeline_id;
> +    uint16_t pipeline_state;
> +    uint8_t pad0[4]; /* Pad to make pipeline 64 bit */
>       /* L2, Order the same as in the Ethernet header! (64-bit aligned) */
>       struct eth_addr dl_dst;     /* Ethernet destination address. */
>       struct eth_addr dl_src;     /* Ethernet source address. */
> @@ -135,8 +138,8 @@ BUILD_ASSERT_DECL(sizeof(struct flow_tnl) % sizeof(uint64_t) 
> == 0);
> 
>   /* Remember to update FLOW_WC_SEQ when changing 'struct flow'. */
>   BUILD_ASSERT_DECL(offsetof(struct flow, igmp_group_ip4) + sizeof(uint32_t)
> -                  == sizeof(struct flow_tnl) + 248
> -                  && FLOW_WC_SEQ == 36);
> +                  == sizeof(struct flow_tnl) + 256
> +                  && FLOW_WC_SEQ == 37);
> 
>   /* Incremental points at which flow classification may be performed in
>    * segments.
> diff --git a/include/openvswitch/packets.h b/include/openvswitch/packets.h
> index 5d97309..26fbc87 100644
> --- a/include/openvswitch/packets.h
> +++ b/include/openvswitch/packets.h
> @@ -19,6 +19,13 @@
> 
>   #include <netinet/in.h>
>   #include "openvswitch/tun-metadata.h"
> +/* Unfortunately, a "struct flow" sometimes has to handle OpenFlow port
> + * numbers and other times datapath (dpif) port numbers.  This union allows
> + * access to both. */
> +union flow_in_port {
> +    odp_port_t odp_port;
> +    ofp_port_t ofp_port;
> +};
> 
>   /* Tunnel information used in flow key and metadata. */
>   struct flow_tnl {
> @@ -53,12 +60,4 @@ struct flow_tnl {
> 
>   #define FLOW_TNL_F_MASK ((1 << 4) - 1)
> 
> -/* Unfortunately, a "struct flow" sometimes has to handle OpenFlow port
> - * numbers and other times datapath (dpif) port numbers.  This union allows
> - * access to both. */
> -union flow_in_port {
> -    odp_port_t odp_port;
> -    ofp_port_t ofp_port;
> -};
> -
>   #endif /* packets.h */
> diff --git a/lib/automake.mk b/lib/automake.mk
> index abc9d0d..563e521 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -370,8 +370,12 @@ endif
> 
>   if DPDK_NETDEV
>   lib_libopenvswitch_la_SOURCES += \
> -       lib/dpdk.c \
> -       lib/netdev-dpdk.c
> +    lib/dpdk-i40e-ofld.c \
> +    lib/dpdk-i40e-ofld.h \
> +    lib/dpdk.c \
> +    lib/netdev-dpdk.c \
> +    lib/hw-pipeline.c \
> +    lib/hw-pipeline.h \
>   else
>   lib_libopenvswitch_la_SOURCES += \
>           lib/dpdk-stub.c
> diff --git a/lib/dpdk-i40e-ofld.c b/lib/dpdk-i40e-ofld.c
> new file mode 100644
> index 0000000..8c8985d
> --- /dev/null
> +++ b/lib/dpdk-i40e-ofld.c
> @@ -0,0 +1,207 @@
> +/*
> + * Copyright (c) 2016 Intel Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <config.h>
> +
> +#include "dpdk-i40e-ofld.h"
> +#include "errno.h"
> +#include "ovs-thread.h"
> +#include "openvswitch/vlog.h"
> +#include "netdev-provider.h"
> +
> +/* Hardware specific functions to configure i40e NIC to handle VxLAN packets*/
> +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> +VLOG_DEFINE_THIS_MODULE(dpdk_hw_ofld);
> +
> +#define VXLAN_DST_PORT          4789
> +#define VXLAN_HLEN                  50
> +#define MAX_FDIR_RULES          8000
> +
> +static uint32_t total_fdir_ids;
> +static struct ovs_mutex hw_ofld_mutex = OVS_MUTEX_INITIALIZER;
> +
> +/*
> + * Returns '0' if FDIR IDs reaches max limit. Only 8000 entries are
> + * supported in FVL.
> + */
> +static inline uint32_t
> +i40e_fdir_entry_cnt_inc(void)
> +{
> +    if (total_fdir_ids < MAX_FDIR_RULES) {
> +        ovs_mutex_lock(&hw_ofld_mutex);
> +        total_fdir_ids++;
> +        ovs_mutex_unlock(&hw_ofld_mutex);
> +        return (total_fdir_ids);
> +    }
> +    return 0;
> +}

Looking at the patch, I see a lot of dead code and entire sections
commented out. Is there a reason you've included them?

> +static inline void
> +i40e_fdir_entry_cnt_decr(void)
> +{
> +    ovs_mutex_lock(&hw_ofld_mutex);
> +    total_fdir_ids ? total_fdir_ids-- : 0;
> +    ovs_mutex_unlock(&hw_ofld_mutex);
> +}
> +
> +/*
> + * Release the hardware offloading functionality from the dpdk-port.
> + */
> +int
> +dpdk_hw_ofld_port_release(struct netdev_dpdk *dpdk_port)
> +{
> +    ovs_mutex_lock(&hw_ofld_mutex);
> +    //set_i40e_ofld_flag(dpdk_port, 0);
> +    ovs_mutex_unlock(&hw_ofld_mutex);
> +    return 0;
> +}
> +
> +int
> +dpdk_eth_dev_hw_ofld_init(struct netdev_dpdk *dev,
> +                                        int n_rxq, int n_txq,
> +                                        struct rte_eth_conf *port_conf)
> +{
> +    int err = 0;
> +    struct rte_eth_dev_info info;
> +    uint16_t vendor_id, device_id;
> +
> +    rte_eth_dev_info_get(get_dpdk_port_id(dev), &info);
> +    vendor_id = info.pci_dev->id.vendor_id;
> +    device_id = info.pci_dev->id.device_id;
> +    /* Configure vxlan offload only if its FVL NIC */
> +    if (vendor_id != I40E_INTEL_VENDOR_ID || device_id !=
> +                                            I40E_DEV_ID_SFP_XL710) {
> +        VLOG_INFO("Failed to configure NIC, unsupported NIC");
> +        err = rte_eth_dev_configure(get_dpdk_port_id(dev),
> +                                    n_rxq, n_txq, port_conf);
> +        return err;
> +    }
> +    /* Configure FVL FDIR VxLAN tunnel handing */
> +    port_conf->fdir_conf.mode = RTE_FDIR_MODE_PERFECT;
> +    port_conf->fdir_conf.status = RTE_FDIR_REPORT_STATUS_ALWAYS;
> +    port_conf->fdir_conf.flex_conf.nb_payloads = 1;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].type = RTE_ETH_L4_PAYLOAD;
> +    /* Need to initilize all the 16 flex bytes,no matter;
> +     * what we really using, possibly a DPDK bug?? */
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[0] = 0;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[1] = 1;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[2] = 2;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[3] = 3;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[4] = 4;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[5] = 5;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[6] = 6;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[7] = 7;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[8] = 8;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[9] = 9;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[10] = 10;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[11] = 11;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[12] = 12;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[13] = 13;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[14] = 14;
> +    port_conf->fdir_conf.flex_conf.flex_set[0].src_offset[15] = 15;
> +    err = rte_eth_dev_configure(get_dpdk_port_id(dev),
> +                                n_rxq, n_txq, port_conf);
> +    if (err) {
> +        VLOG_ERR("Failed to configure DPDK port with hardware offload");
> +        return err;
> +    }
> +    /*Clean all FDIR entries if any */
> +    err = rte_eth_dev_filter_ctrl(get_dpdk_port_id(dev),
> +            RTE_ETH_FILTER_FDIR, RTE_ETH_FILTER_FLUSH, NULL);
> +    VLOG_INFO("Configured port with FDIR , %d", err);
> +    return err;
> +}
> +
> +/*
> + * Install rules for VxLAN packets in hardware
> + */
> +int
> +set_up_hw_offload_port_rule(struct netdev *netdev__,
> +                                const struct flow *flow,
> +                                /*const uint32_t hw_flow_id,*/
> +                                const bool is_add_rule)
> +{
> +    int err = 0;
> +    uint8_t flexbytes[RTE_ETH_FDIR_MAX_FLEXLEN] = { 0 };
> +    uint32_t *vni;
> +    enum rte_filter_op filter_op;
> +    struct rte_eth_fdir_filter entry = { 0 };
> +    struct netdev_dpdk *netdev;
> +
> +    netdev = netdev_dpdk_cast(netdev__);
> +    /*if (is_i40e_ofld_enable(netdev)) {*/
> +        entry.soft_id = (flow->tunnel.tun_id >>32);
> +        if (!entry.soft_id) {
> +            VLOG_DBG("Invalid flow ID, Cant install rule in the NIC for "
> +                             "hardware offload");
> +            err = ECANCELED;
> +            return err;
> +        }
> +        /* Install rules in NIC only for VxLAN flows */
> +        if (ntohs(flow->tp_dst) != VXLAN_DST_PORT) {
> +            return 0;
> +        }
> +
> +        entry.input.flow_ext.vlan_tci = 0; //! ignored by i40e fdir
> +        entry.input.flow_ext.is_vf = 0;
> +        entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP;
> +        entry.input.flow.udp4_flow.ip.src_ip = flow->nw_src;
> +        entry.input.flow.udp4_flow.ip.dst_ip = flow->nw_dst;
> +        entry.input.flow.udp4_flow.ip.tos = flow->nw_tos;
> +        entry.input.flow.udp4_flow.ip.ttl = flow->nw_ttl;
> +        entry.input.flow.udp4_flow.ip.proto = 17; /* UDP */
> +
> +        entry.input.flow.udp4_flow.dst_port = flow->tp_dst;
> +        entry.input.flow.udp4_flow.src_port = flow->tp_src;
> +        vni = (uint32_t *)&flexbytes[4];
> +        //*vni = flow->tunnel.tun_id << 8;
> +        *vni = flow->tunnel.tun_id;
> +        memcpy(entry.input.flow_ext.flexbytes, flexbytes,
> +                      RTE_ETH_FDIR_MAX_FLEXLEN);
> +
> +        entry.action.behavior = RTE_ETH_FDIR_PASSTHRU;
> +         //entry.action.report_status = RTE_ETH_FDIR_REPORT_ID_FLEX_4;
> +        entry.action.report_status = RTE_ETH_FDIR_REPORT_ID;
> +        entry.action.rx_queue = 0;
> +        entry.action.flex_off = 0;  /* use 0 by default */
> +        filter_op = is_add_rule ? RTE_ETH_FILTER_ADD :
> +                                              RTE_ETH_FILTER_DELETE;
> +        err = rte_eth_dev_filter_ctrl(get_dpdk_port_id(netdev),
> +                 RTE_ETH_FILTER_FDIR, filter_op, &entry);
> +
> +        /*
> +         * XXX : Delayed the max limit check for flow director entries after
> +         * the configuration. Anyway the rte_eth_dev_filter_ctrl will fail if
> +         * max limit reaches. This can be used for tracking.
> +         */
> +        if (is_add_rule) {
> +            if (!i40e_fdir_entry_cnt_inc()) {
> +                VLOG_DBG("Cant configure rule on NIC, Flow director "
> +                        "entries hits max limit");
> +            }
> +        }
> +        else {
> +            i40e_fdir_entry_cnt_decr();
> +        }
> +        if (err < 0) {
> +            VLOG_ERR("flow director programming error in NIC: (%d)\n", err);
> +            return err;
> +        }
> +    //}
> +    return err;
> +}
> +
> +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> diff --git a/lib/dpdk-i40e-ofld.h b/lib/dpdk-i40e-ofld.h
> new file mode 100644
> index 0000000..e51556c
> --- /dev/null
> +++ b/lib/dpdk-i40e-ofld.h
> @@ -0,0 +1,73 @@
> +/*
> + * Copyright (c) 2016 Intel Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef DPDK_I40E_OFLD_H_
> +#define DPDK_I40E_OFLD_H_
> +
> +#include <config.h>
> +
> +#include "dp-packet.h"
> +#include "netdev.h"
> +#include "rte_ethdev.h"
> +
> +/*
> + * Macro to enable/disable HW OFFLOAD feature for DPDK.
> + * 1 :- Enable HW_OFFLOAD support in OVS
> + * 0 :- Disable HW_OFFLOAD support in OVS
> + */
> +#define DPDK_I40E_TNL_OFFLOAD_ENABLE        1
> +#ifdef DPDK_I40E_TNL_OFFLOAD_ENABLE
> +
> +struct netdev_dpdk;
> +struct dp_netdev_pmd_thread;
> +struct dp_netdev_flow;
> +
> +#define I40E_DEV_ID_SFP_XL710 0x1572
> +#define I40E_INTEL_VENDOR_ID        0x8086
> +
> +struct netdev_dpdk *netdev_dpdk_cast(const struct netdev *netdev);
> +extern inline int get_dpdk_port_id(struct netdev_dpdk *dpdk_port);
> +int dpdk_eth_dev_hw_ofld_init(struct netdev_dpdk *dev, int n_rxq, int n_txq,
> +                              struct rte_eth_conf *port_conf);
> +int dpdk_hw_ofld_port_release(struct netdev_dpdk *dpdk_port);
> +int set_up_hw_offload_port_rule(struct netdev *netdev__,
> +                                const struct flow *flow,
> +                                /* const uint32_t hw_flow_id, */
> +                                const bool is_add_rule);
> +const struct dp_netdev_flow *lookup_hw_offload_flow_for_fdirid(
> +                            const struct dp_netdev_pmd_thread *pmd,
> +                            struct rte_mbuf *mbuf, uint32_t flow_id);
> +
> +static inline uint32_t
> +get_fdir_flow_id(struct dp_packet *packet)
> +{
> +    struct rte_mbuf *mbuf;
> +    uint32_t flow_id =0;
> +    mbuf = (struct rte_mbuf *)packet;
> +    flow_id = mbuf->hash.fdir.hi;
> +    mbuf->hash.fdir.hi = 0;
> +    return flow_id;
> +}
> +
> +static inline void
> +reset_fdir_flow_id(struct dp_packet *packet)
> +{
> +    struct rte_mbuf *mbuf;
> +    mbuf = (struct rte_mbuf *)packet;
> +    mbuf->hash.fdir.hi = 0;
> +}
> +#endif //DPDK_I40E_TNL_OFFLOAD_ENABLE
> +#endif /* DPDK_I40E_OFLD_H_ */
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 0be5db5..6bc2139 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -76,6 +76,8 @@
>   #include "tnl-ports.h"
>   #include "unixctl.h"
>   #include "util.h"
> +#include "hw-pipeline.h"
> +#include "netdev-provider.h"
> 
>   VLOG_DEFINE_THIS_MODULE(dpif_netdev);
> 
> @@ -384,6 +386,58 @@ struct dp_netdev_flow {
>       /* 'cr' must be the last member. */
>   };
> 
> +struct pipeline_flow {
> +    struct dp_netdev_flow flow;
> +    struct netdev_flow_key key;
> +};
> +struct pipeline_flow_queue {
> +    /* XXX: its necessary to have queue per pipeline in the future.
> +     * For now its only for one pipeline.
> +     */
> +    struct pipeline_flow  ppl_flow[MAX_PIPELINE_FLOW];
> +    int front;
> +    int back;
> +};
> +static struct pipeline_flow_queue flow_queue = { 0 };
> +
> +static void enqueue_flow(struct dp_netdev_flow *flow, struct netdev_flow_key *key)
> +{
> +    /* Enqueue the flow into pipeline queue. Dont care the case of overwrite
> +     * This has to be atomic, but anyway the flow insert is handled by PMD itself.
> +     * So no chance of preemption.
> +     */
> +    struct pipeline_flow *ppl_flow = &flow_queue.ppl_flow[flow_queue.front];
> +    memcpy(&ppl_flow->flow, flow, sizeof ppl_flow->flow);
> +    memcpy(&ppl_flow->key, key, sizeof ppl_flow->key);
> +    flow_queue.front++;
> +    if (flow_queue.front >= MAX_PIPELINE_FLOW) {
> +        /* Reset the index for read from start */
> +        flow_queue.front  =0;
> +    }
> +}
> +
> +static struct pipeline_flow *dequeue_flow(void)
> +{
> +    struct pipeline_flow *ppl_flow = &flow_queue.ppl_flow[flow_queue.back];
> +    if(!ppl_flow->flow.pmd_id) {
> +        VLOG_DBG("The queue is empty, cannot read");
> +        return NULL;
> +    }
> +    flow_queue.back++;
> +    if (flow_queue.back >= MAX_PIPELINE_FLOW) {
> +        /* Reset the back pointer for the proper read */
> +        flow_queue.back = 0;
> +    }
> +    return ppl_flow;
> +}
> +
> +static inline void del_pipeline_flow_in_q(struct pipeline_flow *ppl_flow)
> +{
> +   if(ppl_flow){
> +       memset(ppl_flow, 0, sizeof *ppl_flow);
> +   }
> +}
> +
>   static void dp_netdev_flow_unref(struct dp_netdev_flow *);
>   static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
>   static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
> @@ -423,7 +477,7 @@ struct dp_netdev_pmd_cycles {
> 
>   struct polled_queue {
>       struct netdev_rxq *rx;
> -    odp_port_t port_no;
> +    struct dp_netdev_port *port;
>   };
> 
>   /* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
> @@ -568,7 +622,7 @@ static void dp_netdev_execute_actions(struct 
> dp_netdev_pmd_thread *pmd,
>                                         size_t actions_len,
>                                         long long now);
>   static void dp_netdev_input(struct dp_netdev_pmd_thread *,
> -                            struct dp_packet_batch *, odp_port_t port_no);
> +                            struct dp_packet_batch *, struct dp_netdev_port *port);
>   static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
>                                     struct dp_packet_batch *);
> 
> @@ -619,6 +673,21 @@ static int dpif_netdev_xps_get_tx_qid(const struct 
> dp_netdev_pmd_thread *pmd,
>   static inline bool emc_entry_alive(struct emc_entry *ce);
>   static void emc_clear_entry(struct emc_entry *ce);
> 
> +static inline void
> +emc_insert(struct dp_netdev_pmd_thread *pmd, const struct netdev_flow_key *key,
> +           struct dp_netdev_flow *flow);
> +static inline void
> +emc_hw_insert(struct dp_netdev_pmd_thread *pmd, const struct netdev_flow_key *key,
> +           struct dp_netdev_flow *flow);
> +/*
> + * XXX :: Do not change the order of following flow_insert function set. Its 
> depends on
> + * the order of enum pipeline_id.
> + */
> +static pipeline_dp_flow_insert ppl_dp_flow_insert[] = {
> +        emc_insert,
> +        emc_hw_insert //HW_OFFLOAD_PIPE_LINE
> +};
> +
>   static void
>   emc_cache_init(struct emc_cache *flow_cache)
>   {
> @@ -1915,11 +1984,12 @@ emc_change_entry(struct emc_entry *ce, struct 
> dp_netdev_flow *flow,
>   }
> 
>   static inline void
> -emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
> +emc_insert(struct dp_netdev_pmd_thread *pmd, const struct netdev_flow_key *key,
>              struct dp_netdev_flow *flow)
>   {
>       struct emc_entry *to_be_replaced = NULL;
>       struct emc_entry *current_entry;
> +    struct emc_cache *cache = &pmd->flow_cache;
> 
>       EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
>           if (netdev_flow_key_equal(&current_entry->key, key)) {
> @@ -1943,6 +2013,100 @@ emc_insert(struct emc_cache *cache, const struct 
> netdev_flow_key *key,
>       emc_change_entry(to_be_replaced, flow, key);
>   }
> 
> +/* Flow rule insertion for hardware offload */
> +static inline void
> +emc_hw_insert(struct dp_netdev_pmd_thread *pmd, const struct netdev_flow_key *key,
> +           struct dp_netdev_flow *flow)
> +{
> +    struct emc_cache *cache = &pmd->flow_cache;
> +    struct flow *sw_flow = &flow->flow;
> +    struct flow_tnl *tnl_md = &sw_flow->tunnel;
> +    emc_insert(pmd, key, flow);
> +    /* Set up the hardware flow to insert into hardware */
> +    if(tnl_md->ip_dst) {
> +        struct flow in_flow = { 0 };
> +        struct netdev_flow_key in_key = { 0 };
> +        /*
> +         * just enqueue the flow, need it later at the time of outer flow 
> insertion.
> +         */
> +        enqueue_flow(flow, key);
> +        /* Insert the inner flow for pipeline */
> +        in_flow.dl_dst = sw_flow->dl_dst;
> +        in_flow.dl_src = sw_flow->dl_src;
> +        in_flow.pipeline_id = HW_OFFLOAD_PIPELINE;
> +        in_flow.pipeline_state = PIPELINE_ACTIVE;
> +        in_flow.in_port = sw_flow->in_port;
> +        in_key.len = 0;
> +        in_key.hash = key->hash;

Why are you setting len == 0 and taking the hash from the sw_flow?
Why not use netdev_flow_key_size() to set the len?
As for the hash, hash_finish(sw_flow->in_port.odp_port, 42) could be used.
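Something along these lines inside emc_hw_insert(), as a rough sketch only
(netdev_flow_key_size()/miniflow_n_values() are the existing helpers in
dpif-netdev.c and flow.h; the 42 basis for hash_finish() is arbitrary):

    miniflow_map_init(&in_key.mf, &in_flow);
    miniflow_init(&in_key.mf, &in_flow);
    in_key.len = netdev_flow_key_size(miniflow_n_values(&in_key.mf));
    in_key.hash = hash_finish(odp_to_u32(sw_flow->in_port.odp_port), 42);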

> +        miniflow_map_init(&in_key.mf, &in_flow);
> +        miniflow_init(&in_key.mf, &in_flow);
> +        memcpy(&flow->flow, &in_flow, sizeof(in_flow));
> +        emc_insert(pmd, &in_key, flow);
> +    }
> +    else {
> +        /* Insert the outer flow using the flow data that stored last time. */
> +        struct pipeline_flow *ppl_flow;
> +        struct flow *old_flow;
> +        struct flow out_flow = { 0 };
> +        struct flow hw_flow = *sw_flow;
> +        struct netdev_flow_key out_key = { 0 };
> +        ppl_flow = dequeue_flow();

Your queue might be empty, so ppl_flow could be NULL here. The '!old_flow'
check below won't catch that, since &ppl_flow->flow.flow is never NULL.
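A minimal guard, as a sketch, would be something like:

    ppl_flow = dequeue_flow();
    if (!ppl_flow) {
        VLOG_DBG("Pipeline flow queue is empty, skipping hardware rule");
        return;
    }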

> +        old_flow = &ppl_flow->flow.flow;
> +        if(!old_flow) {
> +            VLOG_DBG("NULL OLD FLOW, cannot do much");
> +            goto out;
> +        }
> +        if (old_flow->tunnel.ip_dst != sw_flow->nw_dst ||
> +                old_flow->tunnel.ip_src != sw_flow->nw_src) {
> +            /* Looks like the tunnel is missing in queue */
> +            VLOG_DBG("Cannot find the tunnel information in the queue, Cannot 
> insert "
> +                    "hardware rule");
> +            goto out;
> +        }

Why do you assume the order of flow insertion? Shouldn't you check the
entire queue for a possible match?
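For example, something like this instead of relying on FIFO order (rough
sketch, reusing the same 'pmd_id != 0' occupancy convention that
dequeue_flow() uses):

    struct pipeline_flow *cand;
    int i;

    ppl_flow = NULL;
    for (i = 0; i < MAX_PIPELINE_FLOW; i++) {
        cand = &flow_queue.ppl_flow[i];
        /* Match the queued inner flow against this outer flow's addresses. */
        if (cand->flow.pmd_id
            && cand->flow.flow.tunnel.ip_dst == sw_flow->nw_dst
            && cand->flow.flow.tunnel.ip_src == sw_flow->nw_src) {
            ppl_flow = cand;
            break;
        }
    }
    if (!ppl_flow) {
        VLOG_DBG("No matching tunnel entry in the queue");
        goto out;
    }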

> +        /* Insert outer flow now */
> +        out_flow.pipeline_id = HW_OFFLOAD_PIPELINE;
> +        out_flow.pipeline_state = PIPELINE_ACTIVE;
> +        out_flow.in_port = sw_flow->in_port;
> +        out_flow.dl_dst = sw_flow->dl_dst;
> +        out_flow.dl_src = sw_flow->dl_src;
> +        out_key.len = 0;
> +        out_key.hash = key->hash;
> +        miniflow_map_init(&out_key.mf, &out_flow);
> +        miniflow_init(&out_key.mf, &out_flow);
> +        memcpy(&flow->flow, &out_flow, sizeof(out_flow));
> +        emc_insert(pmd, &out_key, flow);
> +        {
> +            struct dp_netdev_port *dp_port;
> +            uint32_t err;
> +            /* Program the NICs */
> +            hw_flow.tunnel.tun_id = old_flow->tunnel.tun_id;
> +            err = get_port_by_number(pmd->dp, hw_flow.in_port.odp_port, &dp_port);
> +            if (err) {
> +                VLOG_ERR("Cannot get the port information, Failed to configure "
> +                                    "hardware offload");
> +                goto out;
> +            }
> +            set_up_hw_offload_port_rule(dp_port->netdev, &hw_flow, 1);
> +        }
> +out:
> +        del_pipeline_flow_in_q(ppl_flow);
> +    }
> +}
> +
> +/* Flow rule insertion into the emc
> + * Decides what function is going to insert the rule.
> + */
> +static inline void
> +dp_emc_flow_insert(struct dp_packet *packet, struct dp_netdev_pmd_thread *pmd,
> +        const struct netdev_flow_key *key, struct dp_netdev_flow *flow)
> +{
> +    struct pipeline_md *ppl_md;
> +    ppl_md = &packet->md.ppl_md;
> +    ppl_dp_flow_insert[ppl_md->id](pmd, key, flow);
> +}
> +
>   static inline struct dp_netdev_flow *
>   emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
>   {
> @@ -2710,7 +2874,7 @@ dpif_netdev_operate(struct dpif *dpif, struct dpif_op 
> **ops, size_t n_ops)
>               break;
> 
>           case DPIF_OP_FLOW_DEL:
> -            op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
> +            //op->error = dpif_netdev_flow_del(dpif, &op->u.flow_del);
>               break;
> 
>           case DPIF_OP_EXECUTE:
> @@ -2903,7 +3067,7 @@ cycles_count_end(struct dp_netdev_pmd_thread *pmd,
>   static void
>   dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>                              struct netdev_rxq *rx,
> -                           odp_port_t port_no)
> +                           struct dp_netdev_port *port)
>   {
>       struct dp_packet_batch batch;
>       int error;
> @@ -2916,7 +3080,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>           *recirc_depth_get() = 0;
> 
>           cycles_count_start(pmd);
> -        dp_netdev_input(pmd, &batch, port_no);
> +        dp_netdev_input(pmd, &batch, port);
>           cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
>       } else if (error != EAGAIN && error != EOPNOTSUPP) {
>           static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> @@ -3396,7 +3560,7 @@ dpif_netdev_run(struct dpif *dpif)
> 
>                   for (i = 0; i < port->n_rxq; i++) {
>                       dp_netdev_process_rxq_port(non_pmd, port->rxqs[i].rx,
> -                                               port->port_no);
> +                                               port);
>                   }
>               }
>           }
> @@ -3503,7 +3667,7 @@ pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
>       i = 0;
>       HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
>           poll_list[i].rx = poll->rxq->rx;
> -        poll_list[i].port_no = poll->rxq->port->port_no;
> +        poll_list[i].port = poll->rxq->port;
>           i++;
>       }
> 
> @@ -3553,7 +3717,7 @@ reload:
>       for (;;) {
>           for (i = 0; i < poll_cnt; i++) {
>               dp_netdev_process_rxq_port(pmd, poll_list[i].rx,
> -                                       poll_list[i].port_no);
> +                                       poll_list[i].port);
>           }
> 
>           if (lc++ > 1024) {
> @@ -4078,15 +4242,28 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
>                  struct dp_packet_batch *packets_,
>                  struct netdev_flow_key *keys,
>                  struct packet_batch_per_flow batches[], size_t *n_batches,
> -               bool md_is_valid, odp_port_t port_no)
> +               bool md_is_valid, struct dp_netdev_port *port)
>   {
>       struct emc_cache *flow_cache = &pmd->flow_cache;
>       struct netdev_flow_key *key = &keys[0];
> -    size_t n_missed = 0, n_dropped = 0;
> +    size_t i, n_missed = 0, n_dropped = 0;
> +    struct dp_packet **packets = packets_->packets;
>       struct dp_packet *packet;
>       const size_t size = dp_packet_batch_size(packets_);
> -    int i;
> -
> +    int cnt = packets_->count;
> +    struct pipeline_md *ppl_md;
> +    odp_port_t port_no = 0;
> +    struct netdev *netdev =NULL;
> +    inline void
> +   (* get_packet_pipeline_ptr)(struct netdev *netdev, struct dp_packet *packet,
> +                                            struct pipeline_md *ppl_md);
> +
> +    get_packet_pipeline_ptr = &get_packet_pipeline_no_op;
> +    if(port) {
> +        port_no = port->port_no;
> +        netdev = port->netdev;
> +        get_packet_pipeline_ptr = &get_packet_pipeline;
> +    }
>       DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, packets_) {
>           struct dp_netdev_flow *flow;
> 
> @@ -4106,7 +4283,10 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
>           if (!md_is_valid) {
>               pkt_metadata_init(&packet->md, port_no);
>           }
> -        miniflow_extract(packet, &key->mf);
> +        /* Call the miniflow extract for the specific pipeline */
> +        ppl_md = &packet->md.ppl_md;
> +        (* get_packet_pipeline_ptr)(netdev, packet, ppl_md);
> +        ppl_mf_extract[ppl_md->id](packet, &key->mf);
>           key->len = 0; /* Not computed yet. */
>           key->hash = dpif_netdev_packet_get_rss_hash(packet, &key->mf);
> 
> @@ -4139,9 +4319,19 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, 
> struct dp_packet *packet,
>       struct ofpbuf *add_actions;
>       struct dp_packet_batch b;
>       struct match match;
> +    struct pipeline_md *ppl_md = &packet->md.ppl_md;
>       ovs_u128 ufid;
>       int error;
> 
> +    if (ppl_md->id && ppl_md->state) {
> +        /* Upcall for a different active pipeline than software pipeline is not
> +         * allowed.
> +         */
> +        VLOG_INFO("Cannot make upcall on packet from pipeline %d", ppl_md->id);
> +        dp_packet_delete(packet);
> +        (*lost_cnt)++;
> +        return;
> +    }
>       match.tun_md.valid = false;
>       miniflow_expand(&key->mf, &match.flow);
> 
> @@ -4193,8 +4383,8 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, 
> struct dp_packet *packet,
>                                                add_actions->size);
>           }
>           ovs_mutex_unlock(&pmd->flow_mutex);
> -
> -        emc_insert(&pmd->flow_cache, key, netdev_flow);
> +        dp_emc_flow_insert(packet, pmd, key, netdev_flow);
> +        //emc_insert(&pmd->flow_cache, key, netdev_flow);
>       }
>   }
> 
> @@ -4217,7 +4407,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
>       struct dpcls *cls;
>       struct dpcls_rule *rules[PKT_ARRAY_SIZE];
>       struct dp_netdev *dp = pmd->dp;
> -    struct emc_cache *flow_cache = &pmd->flow_cache;
> +    //struct emc_cache *flow_cache = &pmd->flow_cache;
>       int miss_cnt = 0, lost_cnt = 0;
>       int lookup_cnt = 0, add_lookup_cnt;
>       bool any_miss;
> @@ -4288,7 +4478,8 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
> 
>           flow = dp_netdev_flow_cast(rules[i]);
> 
> -        emc_insert(flow_cache, &keys[i], flow);
> +        dp_emc_flow_insert(packet, pmd, &keys[i], flow);
> +        //emc_insert(flow_cache, &keys[i], flow);
>           dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
>       }
> 
> @@ -4307,7 +4498,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
>   static void
>   dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
>                     struct dp_packet_batch *packets,
> -                  bool md_is_valid, odp_port_t port_no)
> +                  bool md_is_valid, struct dp_netdev_port *port)
>   {
>       int cnt = packets->count;
>   #if !defined(__CHECKER__) && !defined(_WIN32)
> @@ -4324,7 +4515,7 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
> 
>       n_batches = 0;
>       emc_processing(pmd, packets, keys, batches, &n_batches,
> -                            md_is_valid, port_no);
> +                            md_is_valid, port);
>       if (!dp_packet_batch_is_empty(packets)) {
>           /* Get ingress port from first packet's metadata. */
>           in_port = packets->packets[0]->md.in_port.odp_port;
> @@ -4353,16 +4544,16 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
>   static void
>   dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
>                   struct dp_packet_batch *packets,
> -                odp_port_t port_no)
> +                struct dp_netdev_port *port)
>   {
> -    dp_netdev_input__(pmd, packets, false, port_no);
> +    dp_netdev_input__(pmd, packets, false, port);
>   }
> 
>   static void
>   dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
>                         struct dp_packet_batch *packets)
>   {
> -    dp_netdev_input__(pmd, packets, true, 0);
> +    dp_netdev_input__(pmd, packets, true, NULL);
>   }
> 
>   struct dp_netdev_execute_aux {
> diff --git a/lib/flow.c b/lib/flow.c
> index fb7bfeb..c1cf582 100644
> --- a/lib/flow.c
> +++ b/lib/flow.c
> @@ -40,6 +40,8 @@
>   #include "random.h"
>   #include "unaligned.h"
>   #include "util.h"
> +#include "hw-pipeline.h"
> +#include "dpdk-i40e-ofld.h"
> 
>   COVERAGE_DEFINE(flow_extract);
>   COVERAGE_DEFINE(miniflow_malloc);
> @@ -125,7 +127,7 @@ struct mf_ctx {
>    * away.  Some GCC versions gave warnings on ALWAYS_INLINE, so these are
>    * defined as macros. */
> 
> -#if (FLOW_WC_SEQ != 36)
> +#if (FLOW_WC_SEQ != 37)
>   #define MINIFLOW_ASSERT(X) ovs_assert(X)
>   BUILD_MESSAGE("FLOW_WC_SEQ changed: miniflow_extract() will have runtime "
>                  "assertions enabled. Consider updating FLOW_WC_SEQ after "
> @@ -547,6 +549,56 @@ flow_extract(struct dp_packet *packet, struct flow *flow)
>       miniflow_expand(&m.mf, flow);
>   }
> 
> +void
> +hw_fvl_mf_extract(struct dp_packet *packet, struct miniflow *dst)
> +{
> +    uint64_t hw_flow_id;
> +    uint64_t *values = miniflow_values(dst);
> +    const struct pkt_metadata *md = &packet->md;
> +    const struct pipeline_md *ppl_md = &md->ppl_md;
> +    struct mf_ctx mf = { FLOWMAP_EMPTY_INITIALIZER, values,
> +                         values + FLOW_U64S };
> +    miniflow_push_uint32(mf, dp_hash, md->dp_hash);

dp_hash should be zero at this point. I'm guessing you are doing this because in_port
is in the upper 32 bits, so for miniflow_push to work, you need to push dp_hash first.
Why not do: miniflow_push_uint32(mf, dp_hash, 0)?

> +    miniflow_push_uint32(mf, in_port, odp_to_u32(md->in_port.odp_port));
> +    //miniflow_push_uint16(mf, pipeline_id, HW_OFFLOAD_PIPELINE);
> +    //miniflow_push_uint16(mf, pipeline_state, PIPELINE_ACTIVE);
> +    miniflow_push_uint32(mf, pipeline_id,
> +                                       (HW_OFFLOAD_PIPELINE<<16 | 
> PIPELINE_ACTIVE));
> +    miniflow_pad_to_64(mf, pipeline_state);
> +
> +    /*hw_flow_id = get_fdir_flow_id(packet);
> +    if(!hw_flow_id) {*/
> +        const void *data = dp_packet_data(packet);
> +        miniflow_push_macs(mf, dl_dst, data);

The if around this block is commented out, so we are always taking the MAC addresses.
In the next version of DPDK there will be a new API, rte_flow.
With that API it is possible to create rules that match: l2 -> ip -> udp -> vxlan -> l2 -> ip,
so if, when inserting to hardware, we use both the inner and outer headers, we can tag those
packets (rough sketch below), and the hash would then be done on: tag, in_port,
pipeline_active + hw_offload_pipeline. This way we can avoid taking the MAC addresses.
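Roughly, and only to show the shape of such a rule (untested; the eth/ip/udp/vxlan
spec structs, 'tag' and 'port_id' are placeholders to be filled from the flow
being offloaded):

    struct rte_flow_attr attr = { .ingress = 1 };
    struct rte_flow_item pattern[] = {
        { .type = RTE_FLOW_ITEM_TYPE_ETH },
        { .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &outer_ip },
        { .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &outer_udp },  /* dst 4789 */
        { .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &vni },
        { .type = RTE_FLOW_ITEM_TYPE_ETH, .spec = &inner_eth },
        { .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &inner_ip },
        { .type = RTE_FLOW_ITEM_TYPE_END },
    };
    struct rte_flow_action_mark mark = { .id = tag };
    struct rte_flow_action actions[] = {
        { .type = RTE_FLOW_ACTION_TYPE_MARK, .conf = &mark },
        { .type = RTE_FLOW_ACTION_TYPE_END },
    };
    struct rte_flow_error error;
    struct rte_flow *rule = rte_flow_create(port_id, &attr, pattern,
                                            actions, &error);

The MARK id should then show up in mbuf->hash.fdir.hi with PKT_RX_FDIR_ID set,
much like the flow director report the patch already relies on.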

> +    /*}*/
> +    dst->map = mf.map;
> +}
> +
> +void
> +hw_vlan_mf_extract(struct dp_packet *packet, struct miniflow *dst)
> +{
> +    ovs_be16 vlan_tci;
> +    ovs_be16 dl_type;
> +    const struct pkt_metadata *md = &packet->md;
> +    const void *data = dp_packet_data(packet);
> +    size_t size = dp_packet_size(packet);
> +    uint64_t *values = miniflow_values(dst);
> +    struct mf_ctx mf = { FLOWMAP_EMPTY_INITIALIZER, values,
> +                         values + FLOW_U64S };
> +    miniflow_push_uint32(mf, dp_hash, md->dp_hash);
> +    miniflow_push_uint32(mf, in_port, odp_to_u32(md->in_port.odp_port));
> +
> +    miniflow_push_uint16(mf, pipeline_id, HW_OFFLOAD_PIPELINE);
> +    miniflow_push_uint16(mf, pipeline_state, PIPELINE_ACTIVE);
> +    miniflow_pad_to_64(mf, pipeline_state);
> +    miniflow_push_macs(mf, dl_dst, data);
> +    vlan_tci = parse_vlan(&data, &size);
> +    dl_type = parse_ethertype(&data, &size);
> +    miniflow_push_be16(mf, dl_type, dl_type);
> +    miniflow_push_be16(mf, vlan_tci, vlan_tci);
> +    dst->map = mf.map;
> +}

Why is the vlan code (hw_vlan_mf_extract) included? Its only caller is commented out.

> +
>   /* Caller is responsible for initializing 'dst' with enough storage for
>    * FLOW_U64S * 8 bytes. */
>   void
> @@ -869,7 +921,7 @@ flow_get_metadata(const struct flow *flow, struct match 
> *flow_metadata)
>   {
>       int i;
> 
> -    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 36);
> +    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 37);
> 
>       match_init_catchall(flow_metadata);
>       if (flow->tunnel.tun_id != htonll(0)) {
> @@ -1275,7 +1327,7 @@ void flow_wildcards_init_for_packet(struct flow_wildcards *wc,
>       memset(&wc->masks, 0x0, sizeof wc->masks);
> 
>       /* Update this function whenever struct flow changes. */
> -    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 36);
> +    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 37);
> 
>       if (flow_tnl_dst_is_set(&flow->tunnel)) {
>           if (flow->tunnel.flags & FLOW_TNL_F_KEY) {
> @@ -1319,6 +1371,7 @@ void flow_wildcards_init_for_packet(struct flow_wildcards *wc,
>       WC_MASK_FIELD(wc, ct_label);
>       WC_MASK_FIELD(wc, recirc_id);
>       WC_MASK_FIELD(wc, dp_hash);
> +    //WC_MASK_FIELD(wc, pipeline_id);
Why is this left commented out?
>       WC_MASK_FIELD(wc, in_port);
> 
>       /* actset_output wildcarded. */
> @@ -1393,7 +1446,7 @@ void
>   flow_wc_map(const struct flow *flow, struct flowmap *map)
>   {
>       /* Update this function whenever struct flow changes. */
> -    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 36);
> +    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 37);
> 
>       flowmap_init(map);
> 
> @@ -1416,6 +1469,7 @@ flow_wc_map(const struct flow *flow, struct flowmap *map)
>       FLOWMAP_SET(map, recirc_id);
>       FLOWMAP_SET(map, dp_hash);
>       FLOWMAP_SET(map, in_port);
> +    //FLOWMAP_SET(map, pipeline_id);
Same here - why is this commented out?
>       FLOWMAP_SET(map, dl_dst);
>       FLOWMAP_SET(map, dl_src);
>       FLOWMAP_SET(map, dl_type);
> @@ -1477,12 +1531,13 @@ void
>   flow_wildcards_clear_non_packet_fields(struct flow_wildcards *wc)
>   {
>       /* Update this function whenever struct flow changes. */
> -    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 36);
> +    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 37);
> 
>       memset(&wc->masks.metadata, 0, sizeof wc->masks.metadata);
>       memset(&wc->masks.regs, 0, sizeof wc->masks.regs);
>       wc->masks.actset_output = 0;
>       wc->masks.conj_id = 0;
> +    wc->masks.pipeline_id = 0;
>   }
> 
>   /* Returns true if 'wc' matches every packet, false if 'wc' fixes any bits or
> @@ -1621,7 +1676,7 @@ flow_wildcards_set_xxreg_mask(struct flow_wildcards *wc, 
> int idx,
>   uint32_t
>   miniflow_hash_5tuple(const struct miniflow *flow, uint32_t basis)
>   {
> -    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 36);
> +    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 37);
>       uint32_t hash = basis;
> 
>       if (flow) {
> @@ -1668,7 +1723,7 @@ ASSERT_SEQUENTIAL(ipv6_src, ipv6_dst);
>   uint32_t
>   flow_hash_5tuple(const struct flow *flow, uint32_t basis)
>   {
> -    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 36);
> +    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 37);
>       uint32_t hash = basis;
> 
>       if (flow) {
> @@ -2136,7 +2191,7 @@ flow_push_mpls(struct flow *flow, int n, ovs_be16 
> mpls_eth_type,
> 
>           if (clear_flow_L3) {
>               /* Clear all L3 and L4 fields and dp_hash. */
> -            BUILD_ASSERT(FLOW_WC_SEQ == 36);
> +            BUILD_ASSERT(FLOW_WC_SEQ == 37);
>               memset((char *) flow + FLOW_SEGMENT_2_ENDS_AT, 0,
>                      sizeof(struct flow) - FLOW_SEGMENT_2_ENDS_AT);
>               flow->dp_hash = 0;
> diff --git a/lib/hw-pipeline.c b/lib/hw-pipeline.c
> new file mode 100644
> index 0000000..378f45f
> --- /dev/null
> +++ b/lib/hw-pipeline.c
> @@ -0,0 +1,75 @@
> +/*
> + * hw-pipeline.c
> + *
> + *  Created on: 13 Oct 2016
> + *      Author: sugeshch
> + */
> +#include <config.h>
> +#include "hw-pipeline.h"
> +#include "openvswitch/vlog.h"
> +#include "netdev-provider.h"
> +#include "dp-packet.h"
> +VLOG_DEFINE_THIS_MODULE(hw_pipeline);
> +
> +/*
> + * XXX: Do not change the order of following list of miniflow extract functions.
> + * The index is being mapped to the pipeline_id. The changes in the order must
> + * update the enum pipeline_id.
> + */
> +pipeline_mf_extract ppl_mf_extract[] = {
> +        miniflow_extract,
> +        hw_ofld_tunnel_mf_extract   //HW_OFFLOAD_PIPE_LINE
> +};
> +
> +static inline void
> +init_packet_pipeline_md(struct pkt_metadata *md, enum pipeline_id ppl_id,
> +                                              bool status)
> +{
> +    md->ppl_md.id = ppl_id;
> +    md->ppl_md.state = status;
> +}
> +
> +
> +inline void
> +get_packet_pipeline_no_op(struct netdev *netdev, struct dp_packet *packet,
> +                                        struct pipeline_md *ppl_md)
> +{
> +    /* Does nothing. Simple no-op */
> +}
> +
> +inline void
> +get_packet_pipeline(struct netdev *netdev, struct dp_packet *packet,
> +                                        struct pipeline_md *ppl_md)
> +{
> +    if(netdev->netdev_class->get_pipeline) {
> +        /* The port has specific pipeline function to determine the pipeline 
> and status */
> +        netdev->netdev_class->get_pipeline(netdev, packet, ppl_md);
> +    }
> +}
> +
> +/*
> + * hardware offload miniflow extract function
> + */
> +void
> +hw_ofld_tunnel_mf_extract(struct dp_packet *packet, struct miniflow *mf)
> +{
> +    // first check if packet has the pipeline_id set, if yes, check if the 
> pipeline is active,
> +    //if yes, then do the specific miniflow extract than default and return.
> +    // Packet received on the physical port doesnt have the pipeline. so call 
> the port's pipeline define function. the function takes the packet as input.
> +    // In DPDK this function must be defined. packet metadata has to carry the
> +    //pipeline id and status for processing.
> +    // Default should be the sofware miniflow extract.
> +    struct pipeline_md *ppl_md;
> +    ppl_md = &packet->md.ppl_md;
> +    if ((!ppl_md->id || ppl_md->state == PIPELINE_INACTIVE)) {
> +        /* if the pipeline id is not set, then do the default miniflow extract
> +         * Also when pipeline state is inactive, do the default miniflow extract
> +         */
> +        /* Default miniflow extract */
> +        miniflow_extract(packet, mf);
> +        return;
> +    }
> +    //hw_vlan_mf_extract(packet, mf);
> +    hw_fvl_mf_extract(packet, mf);
> +}
> +
> diff --git a/lib/hw-pipeline.h b/lib/hw-pipeline.h
> new file mode 100644
> index 0000000..3690d99
> --- /dev/null
> +++ b/lib/hw-pipeline.h
> @@ -0,0 +1,52 @@
> +/*
> + * hw-pipeline.h
> + *
> + *  Created on: 13 Oct 2016
> + *      Author: sugeshch
> + */
> +
> +#ifndef LIB_HW_PIPELINE_H_
> +#define LIB_HW_PIPELINE_H_
> +
> +#include "flow.h"
> +
> +enum pipeline_id {
> +        DEFAULT_SW_PIPELINE = 0,
> +        HW_OFFLOAD_PIPELINE
> +};
> +
> +enum pipeline_state {
> +    PIPELINE_INACTIVE = 0,
> +    PIPELINE_ACTIVE
> +};
> +
> +void hw_ofld_tunnel_mf_extract(struct dp_packet *packet, struct miniflow *mf);
> +
> +/**** FORWARD References ****/
> +struct netdev;
> +struct dp_packet;
> +struct emc_cache;
> +struct netdev_flow_key;
> +struct dp_netdev_flow;
> +struct pipeline_flow_batch;
> +/***************************/
> +
> +#define MAX_PIPELINE_FLOW   5
> +
> +void
> +get_packet_pipeline_no_op(struct netdev *netdev, struct dp_packet *packet,
> +                                        struct pipeline_md *ppl_md);
> +void get_packet_pipeline(struct netdev *netdev, struct dp_packet *packet,
> +                                        struct pipeline_md *ppl_md);
> +/*
> + * List of extrat function corresponds to the pipeline_id. Index of function is 
> directly
> + *  mapped to the pipe_line_id enum.
> + */
> +typedef void (*pipeline_mf_extract)(struct dp_packet *packet, struct miniflow *mf);
> +extern pipeline_mf_extract ppl_mf_extract[];
> +
> +typedef void (*pipeline_dp_flow_insert)(struct dp_netdev_pmd_thread *pmd,
> +                                        const struct netdev_flow_key *key,
> +                                        struct dp_netdev_flow *flow);
> +
> +#endif /* LIB_HW_PIPELINE_H_ */
> diff --git a/lib/match.c b/lib/match.c
> index 3fcaec5..57529f5 100644
> --- a/lib/match.c
> +++ b/lib/match.c
> @@ -1075,7 +1075,7 @@ match_format(const struct match *match, struct ds *s, int 
> priority)
> 
>       int i;
> 
> -    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 36);
> +    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 37);
> 
>       if (priority != OFP_DEFAULT_PRIORITY) {
>           ds_put_format(s, "%spriority=%s%d,",
> diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c
> index 94c515d..e76988a 100644
> --- a/lib/netdev-bsd.c
> +++ b/lib/netdev-bsd.c
> @@ -1492,6 +1492,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum 
> netdev_flags off,
>       CONSTRUCT,                                       \
>       netdev_bsd_destruct,                             \
>       netdev_bsd_dealloc,                              \
> +    NULL, /* get_pipeline */                         \
>       NULL, /* get_config */                           \
>       NULL, /* set_config */                           \
>       NULL, /* get_tunnel_config */                    \
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 94568a1..a1a4cd3 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -55,6 +55,8 @@
>   #include "unaligned.h"
>   #include "timeval.h"
>   #include "unixctl.h"
> +#include "hw-pipeline.h"
> +#include "dpdk-i40e-ofld.h"
> 
>   VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
>   static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
> @@ -143,7 +145,7 @@ BUILD_ASSERT_DECL((MAX_NB_MBUF / 
> ROUND_DOWN_POW2(MAX_NB_MBUF/MIN_NB_MBUF))
>   #define VHOST_ENQ_RETRY_NUM 8
>   #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
> 
> -static const struct rte_eth_conf port_conf = {
> +static struct rte_eth_conf port_conf = {
>       .rxmode = {
>           .mq_mode = ETH_MQ_RX_RSS,
>           .split_hdr_size = 0,
> @@ -417,6 +419,11 @@ is_dpdk_class(const struct netdev_class *class)
>              || class->init == netdev_dpdk_vhost_class_init;
>   }
> 
> +inline int get_dpdk_port_id(struct netdev_dpdk *dpdk_port)
> +{
> +    return dpdk_port->port_id;
> +}
> +
>   /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
>    * aligned at 1k or less. If a declared mbuf size is not a multiple of this
>    * value, insufficient buffers are allocated to accomodate the packet in its
> @@ -656,7 +663,8 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, 
> int n_txq)
>               VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
>           }
> 
> -        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
> +        //diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &conf);
> +        diag = dpdk_eth_dev_hw_ofld_init(dev, n_rxq, n_txq, &conf);
>           if (diag) {
>               VLOG_WARN("Interface %s eth_dev setup error %s\n",
>                         dev->up.name, rte_strerror(-diag));
> @@ -789,7 +797,7 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>       return 0;
>   }
> 
> -static struct netdev_dpdk *
> +struct netdev_dpdk *
>   netdev_dpdk_cast(const struct netdev *netdev)
>   {
>       return CONTAINER_OF(netdev, struct netdev_dpdk, up);
> @@ -1062,6 +1070,24 @@ netdev_dpdk_dealloc(struct netdev *netdev)
>       rte_free(dev);
>   }
> 
> +static void
> +netdev_dpdk_get_pipeline(const struct netdev *netdev, struct dp_packet *packet,
> +                                                  void *pipeline_res)
> +{
> +    struct pipeline_md *ppl_md = pipeline_res;
> +    struct rte_mbuf *mbuf;
> +    /* TODO :: Have to look at the packet as well to decide the pipeline 
> status. Now lets
> +     * hardcord it
> +     */
> +    ppl_md->id = HW_OFFLOAD_PIPELINE;
> +
> +    /* DPDK pipeline is defined by the ol_flags n the packet,
> +     */
> +    mbuf = (struct rte_mbuf *)packet;
> +    ppl_md->state = (mbuf->ol_flags & PKT_RX_FDIR_ID)? PIPELINE_ACTIVE :
> +                                 PIPELINE_INACTIVE;
> +}
> +
>   static int
>   netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args)
>   {
> @@ -1602,7 +1628,6 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct 
> dp_packet_batch *batch)
>       }
> 
>       batch->count = nb_rx;
> -
>       return 0;
>   }
> 
> @@ -3255,6 +3280,7 @@ unlock:
>       CONSTRUCT,                                                \
>       DESTRUCT,                                                 \
>       netdev_dpdk_dealloc,                                      \
> +    netdev_dpdk_get_pipeline,                               \
>       netdev_dpdk_get_config,                                   \
>       SET_CONFIG,                                               \
>       NULL,                       /* get_tunnel_config */       \
> diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
> index 0657434..c16b2fe 100644
> --- a/lib/netdev-dummy.c
> +++ b/lib/netdev-dummy.c
> @@ -1353,6 +1353,7 @@ netdev_dummy_update_flags(struct netdev *netdev_,
>       netdev_dummy_construct,                                     \
>       netdev_dummy_destruct,                                      \
>       netdev_dummy_dealloc,                                       \
> +    NULL, /* get_pipeline */                                       \
>       netdev_dummy_get_config,                                    \
>       netdev_dummy_set_config,                                    \
>       NULL,                       /* get_tunnel_config */         \
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index 9ff1333..71304f9 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -2774,6 +2774,7 @@ netdev_linux_update_flags(struct netdev *netdev_, enum 
> netdev_flags off,
>       CONSTRUCT,                                                  \
>       netdev_linux_destruct,                                      \
>       netdev_linux_dealloc,                                       \
> +    NULL,                       /* get_pipeline */              \
>       NULL,                       /* get_config */                \
>       NULL,                       /* set_config */                \
>       NULL,                       /* get_tunnel_config */         \
> diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
> index c730e72..7935fab 100644
> --- a/lib/netdev-native-tnl.c
> +++ b/lib/netdev-native-tnl.c
> @@ -44,6 +44,7 @@
>   #include "unaligned.h"
>   #include "unixctl.h"
>   #include "openvswitch/vlog.h"
> +#include "hw-pipeline.h"
> 
>   VLOG_DEFINE_THIS_MODULE(native_tnl);
>   static struct vlog_rate_limit err_rl = VLOG_RATE_LIMIT_INIT(60, 5);
> @@ -512,6 +513,47 @@ err:
>       return NULL;
>   }
> 
> +struct dp_packet *
> +hw_ofld_netdev_vxlan_pop_header(struct dp_packet *packet)
> +{
> +    struct rte_mbuf *mbuf;
> +    struct pipeline_md ppl_md = (packet->md.ppl_md);
> +    unsigned int hlen;
> +    if(!ppl_md.id || ppl_md.state == PIPELINE_INACTIVE) {
> +        struct dp_packet *ret_pkt;
> +        /* Call the default pop, but retain the pipeline, in_port, hash
> +         * for future use
> +         */
> +        ret_pkt = netdev_vxlan_pop_header(packet);
> +        ret_pkt->md.ppl_md = ppl_md;
> +        //ret_pkt->md.tunnel.in_port = tnl_port;
> +        return ret_pkt;
> +    }
> +    /*
> +     * XXX:: A fair assumption that the packets in this pipeline are
> +     * ETH -->IP -->UDP-->VXLAN
> +     */
> +    hlen = sizeof(struct eth_header) + IP_HEADER_LEN;
> +    dp_packet_reset_packet(packet, hlen + VXLAN_HLEN);
> +    mbuf = (struct rte_mbuf *)packet;
> +    mbuf->ol_flags &= ~PKT_RX_FDIR_ID;
> +    return packet;
> +}
> +
> +/* VxLAN pop operation needed to be pipelined. */
> +typedef struct dp_packet * (*pipeline_vxlan_pop_hdr_fn)(struct dp_packet *packet);
> +pipeline_vxlan_pop_hdr_fn pipeline_vxlan_pop_header[] = {
> +    netdev_vxlan_pop_header,
> +    hw_ofld_netdev_vxlan_pop_header
> +};
> +
> +struct dp_packet *
> +pipeline_netdev_vxlan_pop_header(struct dp_packet *packet)
> +{
> +    struct pipeline_md *ppl_md = &(packet->md.ppl_md);
> +    return pipeline_vxlan_pop_header[ppl_md->id](packet);
> +}
> +
>   int
>   netdev_vxlan_build_header(const struct netdev *netdev,
>                             struct ovs_action_push_tnl *data,
> diff --git a/lib/netdev-native-tnl.h b/lib/netdev-native-tnl.h
> index a912ce9..4ad5c12 100644
> --- a/lib/netdev-native-tnl.h
> +++ b/lib/netdev-native-tnl.h
> @@ -56,7 +56,7 @@ netdev_vxlan_build_header(const struct netdev *netdev,
>                             const struct netdev_tnl_build_header_params *params);
> 
>   struct dp_packet *
> -netdev_vxlan_pop_header(struct dp_packet *packet);
> +pipeline_netdev_vxlan_pop_header(struct dp_packet *packet);
> 
>   static inline bool
>   netdev_tnl_is_header_ipv6(const void *header)
> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
> index 8346fc4..6fab665 100644
> --- a/lib/netdev-provider.h
> +++ b/lib/netdev-provider.h
> @@ -73,6 +73,7 @@ struct netdev {
>       int n_txq;
>       int n_rxq;
>       int ref_cnt;                        /* Times this devices was opened. */
> +    uint32_t pipeline_id; /* The id of pipeline the port associated with */
>       struct shash_node *node;            /* Pointer to element in global map. */
>       struct ovs_list saved_flags_list; /* Contains "struct netdev_saved_flags". */
>   };
> @@ -267,6 +268,11 @@ struct netdev_class {
>       void (*destruct)(struct netdev *);
>       void (*dealloc)(struct netdev *);
> 
> +    /* Get the pipeline information for the netdev. This will return the 
> pipe_line id and
> +     * status of pipeline for packet processing.
> +     */
> +    void (*get_pipeline)(const struct netdev *netdev, struct dp_packet *packet,
> +                                      void *pipeline_res);
>       /* Fetches the device 'netdev''s configuration, storing it in 'args'.
>        * The caller owns 'args' and pre-initializes it to an empty smap.
>        *
> diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
> index 2d0aa43..8c48c1a 100644
> --- a/lib/netdev-vport.c
> +++ b/lib/netdev-vport.c
> @@ -792,6 +792,7 @@ get_stats(const struct netdev *netdev, struct netdev_stats 
> *stats)
>       netdev_vport_construct,                                 \
>       netdev_vport_destruct,                                  \
>       netdev_vport_dealloc,                                   \
> +    NULL,   /* get_pipeline */                              \
>       GET_CONFIG,                                             \
>       SET_CONFIG,                                             \
>       GET_TUNNEL_CONFIG,                                      \
> @@ -873,7 +874,7 @@ netdev_vport_tunnel_register(void)
>                                          netdev_gre_pop_header),
>           TUNNEL_CLASS("vxlan", "vxlan_sys", netdev_vxlan_build_header,
>                                              netdev_tnl_push_udp_header,
> -                                           netdev_vxlan_pop_header),
> +                                           pipeline_netdev_vxlan_pop_header),
>           TUNNEL_CLASS("lisp", "lisp_sys", NULL, NULL, NULL),
>           TUNNEL_CLASS("stt", "stt_sys", NULL, NULL, NULL),
>       };
> diff --git a/lib/nx-match.c b/lib/nx-match.c
> index e9d649b..124e56b 100644
> --- a/lib/nx-match.c
> +++ b/lib/nx-match.c
> @@ -962,7 +962,7 @@ nx_put_raw(struct ofpbuf *b, enum ofp_version oxm, const 
> struct match *match,
>       int match_len;
>       int i;
> 
> -    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 36);
> +    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 37);
> 
>       /* Metadata. */
>       if (match->wc.masks.dp_hash) {
> diff --git a/lib/odp-util.h b/lib/odp-util.h
> index 42011bc..f391e2a 100644
> --- a/lib/odp-util.h
> +++ b/lib/odp-util.h
> @@ -142,7 +142,7 @@ void odp_portno_names_destroy(struct hmap *portno_names);
>    * add another field and forget to adjust this value.
>    */
>   #define ODPUTIL_FLOW_KEY_BYTES 640
> -BUILD_ASSERT_DECL(FLOW_WC_SEQ == 36);
> +BUILD_ASSERT_DECL(FLOW_WC_SEQ == 37);
> 
>   /* A buffer with sufficient size and alignment to hold an nlattr-formatted flow
>    * key.  An array of "struct nlattr" might not, in theory, be sufficiently
> diff --git a/lib/ofp-util.c b/lib/ofp-util.c
> index 0c9343e..b8872b5 100644
> --- a/lib/ofp-util.c
> +++ b/lib/ofp-util.c
> @@ -101,7 +101,7 @@ ofputil_netmask_to_wcbits(ovs_be32 netmask)
>   void
>   ofputil_wildcard_from_ofpfw10(uint32_t ofpfw, struct flow_wildcards *wc)
>   {
> -    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 36);
> +    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 37);
> 
>       /* Initialize most of wc. */
>       flow_wildcards_init_catchall(wc);
> diff --git a/lib/packets.h b/lib/packets.h
> index c4d3799..d7b1eb3 100644
> --- a/lib/packets.h
> +++ b/lib/packets.h
> @@ -90,6 +90,12 @@ flow_tnl_equal(const struct flow_tnl *a, const struct 
> flow_tnl *b)
>       return a_size == flow_tnl_size(b) && !memcmp(a, b, a_size);
>   }
> 
> +/* Refer hardware pipeline.h for more details */
> +struct pipeline_md {
> +    uint16_t id; //enum pipeline_id
> +    uint16_t state; //enum pipeline_state
> +};
> +
>   /* Datapath packet metadata */
>   struct pkt_metadata {
>       uint32_t recirc_id;         /* Recirculation id carried with the
> @@ -104,6 +110,8 @@ struct pkt_metadata {
>       uint32_t ct_mark;           /* Connection mark. */
>       ovs_u128 ct_label;          /* Connection label. */
>       union flow_in_port in_port; /* Input port. */
> +    struct pipeline_md ppl_md;
> +    uint8_t pad[4]; /*pad for the pipeline metadata */
>       struct flow_tnl tunnel;     /* Encapsulating tunnel parameters. Note that
>                                    * if 'ip_dst' == 0, the rest of the fields may
>                                    * be uninitialized. */
> diff --git a/ofproto/ofproto-dpif-rid.h b/ofproto/ofproto-dpif-rid.h
> index c357591..dfe54ff 100644
> --- a/ofproto/ofproto-dpif-rid.h
> +++ b/ofproto/ofproto-dpif-rid.h
> @@ -99,7 +99,7 @@ struct rule;
>   /* Metadata for restoring pipeline context after recirculation.  Helpers
>    * are inlined below to keep them together with the definition for easier
>    * updates. */
> -BUILD_ASSERT_DECL(FLOW_WC_SEQ == 36);
> +BUILD_ASSERT_DECL(FLOW_WC_SEQ == 37);
> 
>   struct frozen_metadata {
>       /* Metadata in struct flow. */
> diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
> index 503a347..525cdcd 100644
> --- a/ofproto/ofproto-dpif-xlate.c
> +++ b/ofproto/ofproto-dpif-xlate.c
> @@ -3087,7 +3087,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t 
> ofp_port,
> 
>       /* If 'struct flow' gets additional metadata, we'll need to zero it out
>        * before traversing a patch port. */
> -    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 36);
> +    BUILD_ASSERT_DECL(FLOW_WC_SEQ == 37);
>       memset(&flow_tnl, 0, sizeof flow_tnl);
> 
>       if (!xport) {
> -- 
> 2.7.4
> 

Mark.

