[ovs-dev] [RFC v7 01/13] netdev-dpdk: fix mbuf sizing

Loftus, Ciara ciara.loftus at intel.com
Mon May 28 15:41:29 UTC 2018


> 
> From: Mark Kavanagh <mark.b.kavanagh at intel.com>
> 
> There are numerous factors that must be considered when calculating
> the size of an mbuf:
> - the data portion of the mbuf must be sized in accordance With Rx
>   buffer alignment (typically 1024B). So, for example, in order to
>   successfully receive and capture a 1500B packet, mbufs with a
>   data portion of size 2048B must be used.
> - in OvS, the elements that comprise an mbuf are:
>   * the dp packet, which includes a struct rte mbuf (704B)
>   * RTE_PKTMBUF_HEADROOM (128B)
>   * packet data (aligned to 1k, as previously described)
>   * RTE_PKTMBUF_TAILROOM (typically 0)
> 
> Some PMDs require that the total mbuf size (i.e. the total sum of all
> of the above-listed components' lengths) is cache-aligned. To satisfy
> this requirement, it may be necessary to round up the total mbuf size
> with respect to cacheline size. In doing so, it's possible that the
> dp_packet's data portion is inadvertently increased in size, such that
> it no longer adheres to Rx buffer alignment. Consequently, the
> following property of the mbuf no longer holds true:
> 
>     mbuf.data_len == mbuf.buf_len - mbuf.data_off
> 
> This creates a problem in the case of multi-segment mbufs, where that
> assumption is assumed to be true for all but the final segment in an
> mbuf chain. Resolve this issue by adjusting the size of the mbuf's
> private data portion, as opposed to the packet data portion when
> aligning mbuf size to cachelines.
> 
> Fixes: 4be4d22 ("netdev-dpdk: clean up mbuf initialization")
> Fixes: 31b88c9 ("netdev-dpdk: round up mbuf_size to cache_line_size")
> CC: Santosh Shukla <santosh.shukla at caviumnetworks.com>
> Signed-off-by: Mark Kavanagh <mark.b.kavanagh at intel.com>
> ---
>  lib/netdev-dpdk.c | 46 ++++++++++++++++++++++++++++++----------------
>  1 file changed, 30 insertions(+), 16 deletions(-)
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 87152a7..356a967 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -82,12 +82,6 @@ static struct vlog_rate_limit rl =
> VLOG_RATE_LIMIT_INIT(5, 20);
>                                       + (2 * VLAN_HEADER_LEN))
>  #define MTU_TO_FRAME_LEN(mtu)       ((mtu) + ETHER_HDR_LEN +
> ETHER_CRC_LEN)
>  #define MTU_TO_MAX_FRAME_LEN(mtu)   ((mtu) +
> ETHER_HDR_MAX_LEN)
> -#define FRAME_LEN_TO_MTU(frame_len) ((frame_len)                    \
> -                                     - ETHER_HDR_LEN - ETHER_CRC_LEN)
> -#define MBUF_SIZE(mtu)
> ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) \
> -                                             + sizeof(struct dp_packet) \
> -                                             + RTE_PKTMBUF_HEADROOM),   \
> -                                             RTE_CACHE_LINE_SIZE)
>  #define NETDEV_DPDK_MBUF_ALIGN      1024
>  #define NETDEV_DPDK_MAX_PKT_LEN     9728
> 
> @@ -491,7 +485,7 @@ is_dpdk_class(const struct netdev_class *class)
>   * behaviour, which reduces performance. To prevent this, use a buffer size
>   * that is closest to 'mtu', but which satisfies the aforementioned criteria.
>   */
> -static uint32_t
> +static uint16_t
>  dpdk_buf_size(int mtu)
>  {
>      return ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) +
> RTE_PKTMBUF_HEADROOM),
> @@ -582,7 +576,7 @@ dpdk_mp_do_not_free(struct rte_mempool *mp)
> OVS_REQUIRES(dpdk_mp_mutex)
>   *  - a new mempool was just created;
>   *  - a matching mempool already exists. */
>  static struct rte_mempool *
> -dpdk_mp_create(struct netdev_dpdk *dev, int mtu)
> +dpdk_mp_create(struct netdev_dpdk *dev, uint16_t mbuf_pkt_data_len)
>  {
>      char mp_name[RTE_MEMPOOL_NAMESIZE];
>      const char *netdev_name = netdev_get_name(&dev->up);
> @@ -590,6 +584,7 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu)
>      uint32_t n_mbufs;
>      uint32_t hash = hash_string(netdev_name, 0);
>      struct rte_mempool *mp = NULL;
> +    uint16_t mbuf_size, aligned_mbuf_size, mbuf_priv_data_len;
> 
>      /*
>       * XXX: rough estimation of number of mbufs required for this port:
> @@ -609,12 +604,13 @@ dpdk_mp_create(struct netdev_dpdk *dev, int
> mtu)
>           * longer than RTE_MEMPOOL_NAMESIZE. */
>          int ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE,
>                             "ovs%08x%02d%05d%07u",
> -                           hash, socket_id, mtu, n_mbufs);
> +                           hash, socket_id, mbuf_pkt_data_len, n_mbufs);

%u for printing the unsigned mbuf_pkt_data_len

>          if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) {
>              VLOG_DBG("snprintf returned %d. "
>                       "Failed to generate a mempool name for \"%s\". "
> -                     "Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
> -                     ret, netdev_name, hash, socket_id, mtu, n_mbufs);
> +                     "Hash:0x%x, socket_id: %d, pkt data room:%d, mbufs:%u.",
> +                     ret, netdev_name, hash, socket_id, mbuf_pkt_data_len,
> +                     n_mbufs);

Same here

>              break;
>          }
> 
> @@ -623,13 +619,31 @@ dpdk_mp_create(struct netdev_dpdk *dev, int
> mtu)
>                    netdev_name, n_mbufs, socket_id,
>                    dev->requested_n_rxq, dev->requested_n_txq);
> 
> +        mbuf_priv_data_len = sizeof(struct dp_packet) -
> +                                 sizeof(struct rte_mbuf);
> +        /* The size of the entire mbuf. */
> +        mbuf_size = sizeof (struct dp_packet) +
> +                                mbuf_pkt_data_len + RTE_PKTMBUF_HEADROOM;
> +        /* mbuf size, rounded up to cacheline size. */
> +        aligned_mbuf_size = ROUND_UP(mbuf_size, RTE_CACHE_LINE_SIZE);
> +        /* If there is a size discrepancy, add padding to mbuf_priv_data_len.
> +         * This maintains mbuf size cache alignment, while also honoring RX
> +         * buffer alignment in the data portion of the mbuf. If this adjustment
> +         * is not made, there is a possiblity later on that for an element of
> +         * the mempool, buf, buf->data_len < (buf->buf_len - buf->data_off).
> +         * This is problematic in the case of multi-segment mbufs, particularly
> +         * when an mbuf segment needs to be resized (when [push|popp]ing a
> VLAN
> +         * header, for example.
> +         */
> +        mbuf_priv_data_len += (aligned_mbuf_size - mbuf_size);
> +
>          mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ,
> -                 sizeof (struct dp_packet) - sizeof (struct rte_mbuf),
> -                 MBUF_SIZE(mtu) - sizeof(struct dp_packet), socket_id);
> +                mbuf_priv_data_len,
> +                mbuf_pkt_data_len + RTE_PKTMBUF_HEADROOM, socket_id);
> 
>          if (mp) {
>              VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
> -                     mp_name, n_mbufs);
> +                    mp_name, n_mbufs);
>              /* rte_pktmbuf_pool_create has done some initialization of the
>               * rte_mbuf part of each dp_packet. Some OvS specific fields
>               * of the packet still need to be initialized by
> @@ -690,13 +704,13 @@ static int
>  netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
>      OVS_REQUIRES(dev->mutex)
>  {
> -    uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
> +    uint16_t buf_size = dpdk_buf_size(dev->requested_mtu);
>      struct rte_mempool *mp;
>      int ret = 0;
> 
>      dpdk_mp_sweep();
> 
> -    mp = dpdk_mp_create(dev, FRAME_LEN_TO_MTU(buf_size));
> +    mp = dpdk_mp_create(dev, buf_size);
>      if (!mp) {
>          VLOG_ERR("Failed to create memory pool for netdev "
>                   "%s, with MTU %d on socket %d: %s\n",
> --
> 2.7.4
> 
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev


More information about the dev mailing list