[ovs-dev] [PATCH 1/9] netdev: Extend rx_recv to pass multiple packets.

Jarno Rajahalme jrajahalme at nicira.com
Tue Mar 18 22:56:29 UTC 2014


On Mar 18, 2014, at 1:53 PM, Pravin <pshelar at nicira.com> wrote:

> DPDK can receive multiple packets but current netdev API does
> not allow that.  Following patch allows dpif-netdev receive batch
> of packet in a rx_recv() call for any netdev port.  This will be
> used by dpdk-netdev.
> 
> Signed-off-by: Pravin B Shelar <pshelar at nicira.com>
> ---
> lib/automake.mk       |    1 +
> lib/dpif-netdev.c     |   51 ++++++++++++++++++-------------------------------
> lib/dpif-netdev.h     |   40 ++++++++++++++++++++++++++++++++++++++
> lib/netdev-dummy.c    |   26 +++++++++----------------
> lib/netdev-linux.c    |   20 ++++++++++++++++---
> lib/netdev-provider.h |   24 +++++------------------
> lib/netdev.c          |   24 +++++------------------
> lib/netdev.h          |    2 +-
> lib/packets.c         |    9 +++++++++
> lib/packets.h         |    1 +
> 10 files changed, 107 insertions(+), 91 deletions(-)
> create mode 100644 lib/dpif-netdev.h
> 
> diff --git a/lib/automake.mk b/lib/automake.mk
> index e22165c..832b7f9 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -52,6 +52,7 @@ lib_libopenvswitch_la_SOURCES = \
>  	lib/dhparams.h \
>  	lib/dirs.h \
>  	lib/dpif-netdev.c \
> +	lib/dpif-netdev.h \
>  	lib/dpif-provider.h \
>  	lib/dpif.c \
>  	lib/dpif.h \
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 3bbfd2a..d78fb25 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -34,6 +34,7 @@
> #include "classifier.h"
> #include "csum.h"
> #include "dpif.h"
> +#include "dpif-netdev.h"
> #include "dpif-provider.h"
> #include "dummy.h"
> #include "dynamic-string.h"
> @@ -68,10 +69,6 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev);
> /* Configuration parameters. */
> enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
> 
> -/* Enough headroom to add a vlan tag, plus an extra 2 bytes to allow IP
> - * headers to be aligned on a 4-byte boundary.  */
> -enum { DP_NETDEV_HEADROOM = 2 + VLAN_HEADER_LEN };
> -
> /* Queues. */
> enum { N_QUEUES = 2 };          /* Number of queues for dpif_recv(). */
> enum { MAX_QUEUE_LEN = 128 };   /* Maximum number of packets per queue. */
> @@ -1443,6 +1440,7 @@ dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
> {
>     struct dp_netdev *dp = get_dp_netdev(dpif);
>     struct pkt_metadata *md = &execute->md;
> +    struct ofpbuf *packet;
>     struct flow key;
> 
>     if (execute->packet->size < ETH_HEADER_LEN ||
> @@ -1453,8 +1451,9 @@ dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
>     /* Extract flow key. */
>     flow_extract(execute->packet, md, &key);
> 
> +    packet = ofpbuf_clone(execute->packet);
>     ovs_rwlock_rdlock(&dp->port_rwlock);
> -    dp_netdev_execute_actions(dp, &key, execute->packet, md, execute->actions,
> +    dp_netdev_execute_actions(dp, &key, packet, md, execute->actions,
>                               execute->actions_len);
>     ovs_rwlock_unlock(&dp->port_rwlock);
> 

It is not clear to me why the packet should be cloned here, especially since you are already copying the data in the userspace action. Also, it seems this will leak memory in cases where there is no OUTPUT or USERSPACE action, as the dp_execute_action() will not be called in those cases (e.g., a drop flow).


> @@ -1586,12 +1585,10 @@ dp_forwarder_main(void *f_)
> {
>     struct dp_forwarder *f = f_;
>     struct dp_netdev *dp = f->dp;
> -    struct ofpbuf packet;
> 
>     f->name = xasprintf("forwarder_%u", ovsthread_id_self());
>     set_subprogram_name("%s", f->name);
> 
> -    ofpbuf_init(&packet, 0);
>     while (!latch_is_set(&dp->exit_latch)) {
>         bool received_anything;
>         int i;
> @@ -1605,25 +1602,19 @@ dp_forwarder_main(void *f_)
>                 if (port->rx
>                     && port->node.hash >= f->min_hash
>                     && port->node.hash <= f->max_hash) {
> -                    int buf_size;
> +                    struct ofpbuf *packets[MAX_RX_BATCH];
> +                    int count;
>                     int error;
> -                    int mtu;
> -
> -                    if (netdev_get_mtu(port->netdev, &mtu)) {
> -                        mtu = ETH_PAYLOAD_MAX;
> -                    }
> -                    buf_size = DP_NETDEV_HEADROOM + VLAN_ETH_HEADER_LEN + mtu;
> -
> -                    ofpbuf_clear(&packet);
> -                    ofpbuf_reserve_with_tailroom(&packet, DP_NETDEV_HEADROOM,
> -                                                 buf_size);
> 
> -                    error = netdev_rx_recv(port->rx, &packet);
> +                    error = netdev_rx_recv(port->rx, packets, &count);
>                     if (!error) {
> +                        int i;
>                         struct pkt_metadata md
>                             = PKT_METADATA_INITIALIZER(port->port_no);
> 
> -                        dp_netdev_port_input(dp, &packet, &md);
> +                        for (i = 0; i < count; i++) {
> +                            dp_netdev_port_input(dp, packets[i], &md);
> +                        }
>                         received_anything = true;
>                     } else if (error != EAGAIN && error != EOPNOTSUPP) {
>                         static struct vlog_rate_limit rl
> @@ -1659,7 +1650,6 @@ dp_forwarder_main(void *f_)
> 
>         poll_block();
>     }
> -    ofpbuf_uninit(&packet);
> 
>     free(f->name);
> 
> @@ -1741,6 +1731,7 @@ dp_netdev_port_input(struct dp_netdev *dp, struct ofpbuf *packet,
>     } else {
>         ovsthread_counter_inc(dp->n_missed, 1);
>         dp_netdev_output_userspace(dp, packet, DPIF_UC_MISS, &key, NULL);
> +        ofpbuf_delete(packet);
>     }

Maybe this should be done in both cases (see below).

> }
> 
> @@ -1767,6 +1758,7 @@ dp_netdev_output_userspace(struct dp_netdev *dp, struct ofpbuf *packet,
>         if (userdata) {
>             buf_size += NLA_ALIGN(userdata->nla_len);
>         }
> +        buf_size += packet->size;
>         ofpbuf_init(buf, buf_size);
> 
>         /* Put ODP flow. */
> @@ -1780,10 +1772,8 @@ dp_netdev_output_userspace(struct dp_netdev *dp, struct ofpbuf *packet,
>                                           NLA_ALIGN(userdata->nla_len));
>         }
> 
> -        /* Steal packet data. */
> -        ovs_assert(packet->source == OFPBUF_MALLOC);
> -        upcall->packet = *packet;
> -        ofpbuf_use(packet, NULL, 0);
> +        upcall->packet.data = ofpbuf_put(buf, packet->data, packet->size);
> +        upcall->packet.size = packet->size;
> 

I do not see a reason in this patch why the packet data should be copied here. However, I can guess that the reason for it comes apparent with the later patches?

>         seq_change(dp->queue_seq);
> 
> @@ -1825,15 +1815,8 @@ dp_execute_cb(void *aux_, struct ofpbuf *packet,
> 
>         userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
> 
> -        /* Make a copy if we are not allowed to steal the packet's data. */
> -        if (!may_steal) {
> -            packet = ofpbuf_clone_with_headroom(packet, DP_NETDEV_HEADROOM);
> -        }
>         dp_netdev_output_userspace(aux->dp, packet, DPIF_UC_ACTION, aux->key,
>                                    userdata);
> -        if (!may_steal) {
> -            ofpbuf_uninit(packet);
> -        }
>         break;
>     }
>     case OVS_ACTION_ATTR_PUSH_VLAN:
> @@ -1846,6 +1829,10 @@ dp_execute_cb(void *aux_, struct ofpbuf *packet,
>     case __OVS_ACTION_ATTR_MAX:
>         OVS_NOT_REACHED();
>     }
> +
> +    if (may_steal) {
> +        ofpbuf_delete(packet);
> +    }

You can not rely on this to happen for all the packets. It seems you could delete this and the cloning in dpif_netdev_execute(), and have dp_port_input() do the delete also after execution?

> }
> 
> static void
> diff --git a/lib/dpif-netdev.h b/lib/dpif-netdev.h
> new file mode 100644
> index 0000000..64c93ee
> --- /dev/null
> +++ b/lib/dpif-netdev.h
> @@ -0,0 +1,40 @@
> +/*
> + * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef DPIF_NETDEV_H
> +#define DPIF_NETDEV_H 1
> +
> +#include <stdbool.h>
> +#include <stddef.h>
> +#include <stdint.h>
> +#include "openvswitch/types.h"
> +#include "packets.h"
> +
> +#ifdef  __cplusplus
> +extern "C" {
> +#endif
> +
> +/* Enough headroom to add a vlan tag, plus an extra 2 bytes to allow IP
> + * headers to be aligned on a 4-byte boundary.  */
> +enum { DP_NETDEV_HEADROOM = 2 + VLAN_HEADER_LEN };
> +
> +enum { MAX_RX_BATCH = 256 };     /* Maximum number of flows in flow table. */
> +
> +#ifdef  __cplusplus
> +}
> +#endif
> +
> +#endif /* netdev.h */
> diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
> index f23fc9f..27487f9 100644
> --- a/lib/netdev-dummy.c
> +++ b/lib/netdev-dummy.c
> @@ -755,12 +755,11 @@ netdev_dummy_rx_dealloc(struct netdev_rx *rx_)
> }
> 
> static int
> -netdev_dummy_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer)
> +netdev_dummy_rx_recv(struct netdev_rx *rx_, struct ofpbuf **arr, int *c)
> {
>     struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_);
>     struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev);
>     struct ofpbuf *packet;
> -    int retval;
> 
>     ovs_mutex_lock(&netdev->mutex);
>     if (!list_is_empty(&rx->recv_queue)) {
> @@ -774,22 +773,15 @@ netdev_dummy_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer)
>     if (!packet) {
>         return EAGAIN;
>     }
> +    ovs_mutex_lock(&netdev->mutex);
> +    netdev->stats.rx_packets++;
> +    netdev->stats.rx_bytes += packet->size;
> +    ovs_mutex_unlock(&netdev->mutex);
> 
> -    if (packet->size <= ofpbuf_tailroom(buffer)) {
> -        memcpy(buffer->data, packet->data, packet->size);
> -        buffer->size += packet->size;
> -        retval = 0;
> -
> -        ovs_mutex_lock(&netdev->mutex);
> -        netdev->stats.rx_packets++;
> -        netdev->stats.rx_bytes += packet->size;
> -        ovs_mutex_unlock(&netdev->mutex);
> -    } else {
> -        retval = EMSGSIZE;
> -    }
> -    ofpbuf_delete(packet);
> -
> -    return retval;
> +    packet_set_size(packet, packet->size);

The name of this function is a bit misleading, especially since all the users already have the correct size...

> +    arr[0] = packet;
> +    *c = 1;
> +    return 0;
> }
> 
> static void
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index 75ce7c6..574a572 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -50,6 +50,7 @@
> #include "connectivity.h"
> #include "coverage.h"
> #include "dpif-linux.h"
> +#include "dpif-netdev.h"
> #include "dynamic-string.h"
> #include "fatal-signal.h"
> #include "hash.h"
> @@ -461,6 +462,7 @@ static int af_packet_sock(void);
> static bool netdev_linux_miimon_enabled(void);
> static void netdev_linux_miimon_run(void);
> static void netdev_linux_miimon_wait(void);
> +static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
> 
> static bool
> is_netdev_linux_class(const struct netdev_class *netdev_class)
> @@ -984,10 +986,19 @@ netdev_linux_rx_recv_tap(int fd, struct ofpbuf *buffer)
> }
> 
> static int
> -netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer)
> +netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf **packet, int *c)
> {
>     struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
> -    int retval;
> +    struct netdev *netdev = rx->up.netdev;
> +    struct ofpbuf *buffer;
> +    ssize_t retval;
> +    int mtu;
> +
> +    if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
> +        mtu = ETH_PAYLOAD_MAX;
> +    }
> +
> +    buffer = ofpbuf_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu, DP_NETDEV_HEADROOM);
> 
>     retval = (rx->is_tap
>               ? netdev_linux_rx_recv_tap(rx->fd, buffer)
> @@ -995,8 +1006,11 @@ netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer)
>     if (retval && retval != EAGAIN && retval != EMSGSIZE) {
>         VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
>                      ovs_strerror(errno), netdev_rx_get_name(rx_));
> +    } else {
> +        packet_set_size(buffer, buffer->size);
> +        packet[0] = buffer;
> +        *c = 1;

I don’t see the buffer being released in all the other cases (when it’s ownership does not transfer to the caller).

>     }
> -
>     return retval;
> }
> 
> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
> index 673d3ab..9bacaa0 100644
> --- a/lib/netdev-provider.h
> +++ b/lib/netdev-provider.h
> @@ -634,28 +634,14 @@ struct netdev_class {
>     void (*rx_destruct)(struct netdev_rx *);
>     void (*rx_dealloc)(struct netdev_rx *);
> 
> -    /* Attempts to receive a packet from 'rx' into the tailroom of 'buffer',
> -     * which should initially be empty.  If successful, returns 0 and
> -     * increments 'buffer->size' by the number of bytes in the received packet,
> -     * otherwise a positive errno value.  Returns EAGAIN immediately if no
> -     * packet is ready to be received.
> -     *
> -     * Must return EMSGSIZE, and discard the packet, if the received packet
> -     * is longer than 'ofpbuf_tailroom(buffer)'.
> -     *
> -     * Implementations may make use of VLAN_HEADER_LEN bytes of tailroom to
> -     * add a VLAN header which is obtained out-of-band to the packet. If
> -     * this occurs then VLAN_HEADER_LEN bytes of tailroom will no longer be
> -     * available for the packet, otherwise it may be used for the packet
> -     * itself.
> -     *
> -     * It is advised that the tailroom of 'buffer' should be
> -     * VLAN_HEADER_LEN bytes longer than the MTU to allow space for an
> -     * out-of-band VLAN header to be added to the packet.
> +    /* Attempts to receive batch of packets from 'rx' and place array of pointers
> +     * into '*pkt'. netdev is responsible for allocating buffers.
> +     * '*cnt' points to packet count for given batch. Once packets are returned
> +     * to caller, netdev should give up ownership of ofbpuf data.
>      *
>      * This function may be set to null if it would always return EOPNOTSUPP
>      * anyhow. */
> -    int (*rx_recv)(struct netdev_rx *rx, struct ofpbuf *buffer);
> +    int (*rx_recv)(struct netdev_rx *rx, struct ofpbuf **pkt, int *cnt);
> 
>     /* Registers with the poll loop to wake up from the next call to
>      * poll_block() when a packet is ready to be received with netdev_rx_recv()
> diff --git a/lib/netdev.c b/lib/netdev.c
> index e9c8d8f..a058742 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -551,10 +551,7 @@ netdev_rx_close(struct netdev_rx *rx)
>     }
> }
> 
> -/* Attempts to receive a packet from 'rx' into the tailroom of 'buffer', which
> - * must initially be empty.  If successful, returns 0 and increments
> - * 'buffer->size' by the number of bytes in the received packet, otherwise a
> - * positive errno value.
> +/* Attempts to receive batch packet from 'rx'.
>  *
>  * Returns EAGAIN immediately if no packet is ready to be received.
>  *
> @@ -562,10 +559,7 @@ netdev_rx_close(struct netdev_rx *rx)
>  * than 'ofpbuf_tailroom(buffer)'.
>  *
>  * Implementations may make use of VLAN_HEADER_LEN bytes of tailroom to
> - * add a VLAN header which is obtained out-of-band to the packet. If
> - * this occurs then VLAN_HEADER_LEN bytes of tailroom will no longer be
> - * available for the packet, otherwise it may be used for the packet
> - * itself.
> + * add a VLAN header which is obtained out-of-band to the packet.
>  *
>  * It is advised that the tailroom of 'buffer' should be
>  * VLAN_HEADER_LEN bytes longer than the MTU to allow space for an
> @@ -575,23 +569,15 @@ netdev_rx_close(struct netdev_rx *rx)
>  * This function may be set to null if it would always return EOPNOTSUPP
>  * anyhow. */
> int
> -netdev_rx_recv(struct netdev_rx *rx, struct ofpbuf *buffer)
> +netdev_rx_recv(struct netdev_rx *rx, struct ofpbuf **buffers, int *cnt)
> {
>     int retval;
> 
> -    ovs_assert(buffer->size == 0);
> -    ovs_assert(ofpbuf_tailroom(buffer) >= ETH_TOTAL_MIN);
> -
> -    retval = rx->netdev->netdev_class->rx_recv(rx, buffer);
> +    retval = rx->netdev->netdev_class->rx_recv(rx, buffers, cnt);
>     if (!retval) {
>         COVERAGE_INC(netdev_received);
> -        if (buffer->size < ETH_TOTAL_MIN) {
> -            ofpbuf_put_zeros(buffer, ETH_TOTAL_MIN - buffer->size);
> -        }
> -        return 0;
> -    } else {
> -        return retval;
>     }
> +    return retval;
> }
> 
> /* Arranges for poll_block() to wake up when a packet is ready to be received
> diff --git a/lib/netdev.h b/lib/netdev.h
> index 410c35b..bfd8f91 100644
> --- a/lib/netdev.h
> +++ b/lib/netdev.h
> @@ -161,7 +161,7 @@ void netdev_rx_close(struct netdev_rx *);
> 
> const char *netdev_rx_get_name(const struct netdev_rx *);
> 
> -int netdev_rx_recv(struct netdev_rx *, struct ofpbuf *);
> +int netdev_rx_recv(struct netdev_rx *rx, struct ofpbuf **buffers, int *cnt);
> void netdev_rx_wait(struct netdev_rx *);
> int netdev_rx_drain(struct netdev_rx *);
> 
> diff --git a/lib/packets.c b/lib/packets.c
> index 3f7d6eb..6f2fca6 100644
> --- a/lib/packets.c
> +++ b/lib/packets.c
> @@ -1012,3 +1012,12 @@ void pkt_metadata_from_flow(struct pkt_metadata *md, const struct flow *flow)
>     pkt_metadata_init(md, &flow->tunnel, flow->skb_priority,
>                            flow->pkt_mark, &flow->in_port);
> }
> +
> +void
> +packet_set_size(struct ofpbuf *b, int size)
> +{
> +    b->size = size;
> +    if (b->size < ETH_TOTAL_MIN) {
> +        ofpbuf_put_zeros(b, ETH_TOTAL_MIN - b->size);
> +    }
> +}
> diff --git a/lib/packets.h b/lib/packets.h
> index e6b3303..f41a934 100644
> --- a/lib/packets.h
> +++ b/lib/packets.h
> @@ -671,5 +671,6 @@ void packet_set_sctp_port(struct ofpbuf *, ovs_be16 src, ovs_be16 dst);
> uint16_t packet_get_tcp_flags(const struct ofpbuf *, const struct flow *);
> void packet_format_tcp_flags(struct ds *, uint16_t);
> const char *packet_tcp_flag_to_string(uint32_t flag);
> +void packet_set_size(struct ofpbuf *b, int size);
> 
> #endif /* packets.h */
> -- 
> 1.7.9.5
> 
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev




More information about the dev mailing list