[ovs-dev] [PATCH v8 01/13] netdev-dpdk: fix mbuf sizing

Eelco Chaudron echaudro at redhat.com
Mon Jun 18 11:20:22 UTC 2018



On 11 Jun 2018, at 18:21, Tiago Lam wrote:

> From: Mark Kavanagh <mark.b.kavanagh at intel.com>
>
> There are numerous factors that must be considered when calculating
> the size of an mbuf:
> - the data portion of the mbuf must be sized in accordance With Rx
>   buffer alignment (typically 1024B). So, for example, in order to
>   successfully receive and capture a 1500B packet, mbufs with a
>   data portion of size 2048B must be used.
> - in OvS, the elements that comprise an mbuf are:
>   * the dp packet, which includes a struct rte mbuf (704B)
>   * RTE_PKTMBUF_HEADROOM (128B)
>   * packet data (aligned to 1k, as previously described)
>   * RTE_PKTMBUF_TAILROOM (typically 0)
>
> Some PMDs require that the total mbuf size (i.e. the total sum of all
> of the above-listed components' lengths) is cache-aligned. To satisfy
> this requirement, it may be necessary to round up the total mbuf size
> with respect to cacheline size. In doing so, it's possible that the
> dp_packet's data portion is inadvertently increased in size, such that
> it no longer adheres to Rx buffer alignment. Consequently, the
> following property of the mbuf no longer holds true:
>
>     mbuf.data_len == mbuf.buf_len - mbuf.data_off
>
> This creates a problem in the case of multi-segment mbufs, where that
> assumption is assumed to be true for all but the final segment in an
> mbuf chain. Resolve this issue by adjusting the size of the mbuf's
> private data portion, as opposed to the packet data portion when
> aligning mbuf size to cachelines.
>
> Fixes: 4be4d22 ("netdev-dpdk: clean up mbuf initialization")
> Fixes: 31b88c9 ("netdev-dpdk: round up mbuf_size to cache_line_size")
> CC: Santosh Shukla <santosh.shukla at caviumnetworks.com>
> Signed-off-by: Mark Kavanagh <mark.b.kavanagh at intel.com>
> ---
>  lib/netdev-dpdk.c | 48 
> +++++++++++++++++++++++++++++++-----------------
>  1 file changed, 31 insertions(+), 17 deletions(-)
>
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 2e2f568..468ab36 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -82,12 +82,6 @@ static struct vlog_rate_limit rl = 
> VLOG_RATE_LIMIT_INIT(5, 20);
>                                       + (2 * VLAN_HEADER_LEN))
>  #define MTU_TO_FRAME_LEN(mtu)       ((mtu) + ETHER_HDR_LEN + 
> ETHER_CRC_LEN)
>  #define MTU_TO_MAX_FRAME_LEN(mtu)   ((mtu) + ETHER_HDR_MAX_LEN)
> -#define FRAME_LEN_TO_MTU(frame_len) ((frame_len)                    \
> -                                     - ETHER_HDR_LEN - ETHER_CRC_LEN)
> -#define MBUF_SIZE(mtu)              
> ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) \
> -                                             + sizeof(struct 
> dp_packet) \
> -                                             + RTE_PKTMBUF_HEADROOM), 
>   \
> -                                             RTE_CACHE_LINE_SIZE)
>  #define NETDEV_DPDK_MBUF_ALIGN      1024
>  #define NETDEV_DPDK_MAX_PKT_LEN     9728
>
> @@ -493,7 +487,7 @@ is_dpdk_class(const struct netdev_class *class)
>   * behaviour, which reduces performance. To prevent this, use a 
> buffer size
>   * that is closest to 'mtu', but which satisfies the aforementioned 
> criteria.
>   */
> -static uint32_t
> +static uint16_t
>  dpdk_buf_size(int mtu)
>  {
>      return ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) + 
> RTE_PKTMBUF_HEADROOM),
> @@ -585,7 +579,7 @@ dpdk_mp_do_not_free(struct rte_mempool *mp) 
> OVS_REQUIRES(dpdk_mp_mutex)
>   *  - a new mempool was just created;
>   *  - a matching mempool already exists. */
>  static struct rte_mempool *
> -dpdk_mp_create(struct netdev_dpdk *dev, int mtu)
> +dpdk_mp_create(struct netdev_dpdk *dev, uint16_t mbuf_pkt_data_len)
>  {
>      char mp_name[RTE_MEMPOOL_NAMESIZE];
>      const char *netdev_name = netdev_get_name(&dev->up);
> @@ -593,6 +587,7 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu)
>      uint32_t n_mbufs;
>      uint32_t hash = hash_string(netdev_name, 0);
>      struct rte_mempool *mp = NULL;
> +    uint16_t mbuf_size, aligned_mbuf_size, mbuf_priv_data_len;
>
>      /*
>       * XXX: rough estimation of number of mbufs required for this 
> port:
> @@ -611,13 +606,14 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu)
>          /* Full DPDK memory pool name must be unique and cannot be
>           * longer than RTE_MEMPOOL_NAMESIZE. */
>          int ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE,
> -                           "ovs%08x%02d%05d%07u",
> -                           hash, socket_id, mtu, n_mbufs);
> +                           "ovs%08x%02d%05u%07u",
> +                           hash, socket_id, mbuf_pkt_data_len, 
> n_mbufs);
>          if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) {
>              VLOG_DBG("snprintf returned %d. "
>                       "Failed to generate a mempool name for \"%s\". "
> -                     "Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
> -                     ret, netdev_name, hash, socket_id, mtu, 
> n_mbufs);
> +                     "Hash:0x%x, socket_id: %d, pkt data room:%u, 
> mbufs:%u.",
> +                     ret, netdev_name, hash, socket_id, 
> mbuf_pkt_data_len,
> +                     n_mbufs);
>              break;
>          }
>
> @@ -626,13 +622,31 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu)
>                    netdev_name, n_mbufs, socket_id,
>                    dev->requested_n_rxq, dev->requested_n_txq);
>
> +        mbuf_priv_data_len = sizeof(struct dp_packet) -
> +                                 sizeof(struct rte_mbuf);
> +        /* The size of the entire mbuf. */
> +        mbuf_size = sizeof (struct dp_packet) +
> +                                mbuf_pkt_data_len + 
> RTE_PKTMBUF_HEADROOM;
> +        /* mbuf size, rounded up to cacheline size. */
> +        aligned_mbuf_size = ROUND_UP(mbuf_size, RTE_CACHE_LINE_SIZE);
> +        /* If there is a size discrepancy, add padding to 
> mbuf_priv_data_len.
> +         * This maintains mbuf size cache alignment, while also 
> honoring RX
> +         * buffer alignment in the data portion of the mbuf. If this 
> adjustment
> +         * is not made, there is a possiblity later on that for an 
> element of
> +         * the mempool, buf, buf->data_len < (buf->buf_len - 
> buf->data_off).
> +         * This is problematic in the case of multi-segment mbufs, 
> particularly
> +         * when an mbuf segment needs to be resized (when 
> [push|popp]ing a VLAN
> +         * header, for example.
> +         */
> +        mbuf_priv_data_len += (aligned_mbuf_size - mbuf_size);
> +
>          mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ,
> -                 sizeof (struct dp_packet) - sizeof (struct 
> rte_mbuf),
> -                 MBUF_SIZE(mtu) - sizeof(struct dp_packet), 
> socket_id);
> +                mbuf_priv_data_len,
> +                mbuf_pkt_data_len + RTE_PKTMBUF_HEADROOM, socket_id);
>
>          if (mp) {
>              VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
> -                     mp_name, n_mbufs);
> +                    mp_name, n_mbufs);

While you are at it, can we add some info on the cache line alignment 
size, might be useful for debugging.

>              /* rte_pktmbuf_pool_create has done some initialization 
> of the
>               * rte_mbuf part of each dp_packet. Some OvS specific 
> fields
>               * of the packet still need to be initialized by
> @@ -693,13 +707,13 @@ static int
>  netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
>      OVS_REQUIRES(dev->mutex)
>  {
> -    uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
> +    uint16_t buf_size = dpdk_buf_size(dev->requested_mtu);
>      struct rte_mempool *mp;
>      int ret = 0;
>
>      dpdk_mp_sweep();
>
> -    mp = dpdk_mp_create(dev, FRAME_LEN_TO_MTU(buf_size));
> +    mp = dpdk_mp_create(dev, buf_size);
>      if (!mp) {
>          VLOG_ERR("Failed to create memory pool for netdev "
>                   "%s, with MTU %d on socket %d: %s\n",
> -- 
> 2.7.4


More information about the dev mailing list