[ovs-dev] [QoS v2 17/17] Implement QoS framework.

Justin Pettit jpettit at nicira.com
Wed Jun 16 06:25:49 UTC 2010


On Jun 8, 2010, at 1:41 PM, Ben Pfaff wrote:

> --- a/include/openflow/openflow.h
> +++ b/include/openflow/openflow.h
> 
> +/* All ones is used to indicate all queues in a port (for stats retrieval). */
> +#define OFPQ_ALL      0xffffffff
> +
> +/* Body for ofp_stats_request of type OFPST_QUEUE. */
> +struct ofp_queue_stats_request {
> +    uint16_t port_no;        /* All ports if OFPT_ALL. */

I imagine that should be OFPP_ALL.  It looks like the OpenFlow reference code has the same issue.

> +/* Body for ofp_stats_reply of type OFPST_QUEUE consists of an array of this
> + * structure type. */
> +struct ofp_queue_stats {
> +    uint16_t port_no;
> +    uint8_t pad[2];          /* Align to 32-bits. */
> +    uint32_t queue_id;       /* Queue i.d */

That dot in "i.d" looks kind of odd.  This is also in the reference code.

> +/* Action structure for ODPAT_SET_PRIORITY. */
> +struct odp_action_priority {
> +    uint16_t type;              /* ODPAT_SET_PRIORITY. */
> +    uint16_t reserved;
> +    uint16_t priority;          /* skb->priority value. */

Is there a reason this is 16 bits instead of 32?

> --- a/lib/netdev-gre.c
> +++ b/lib/netdev-gre.c
> 
> +    NULL,                       /* get_queue */
> +    NULL,                       /* delete_queue */
> +    NULL,                       /* set_queue */

I think set_queue and delete_queue are reversed here.

> +/* Traffic control. */
> +
> +struct tc {
> +    const struct tc_ops *ops;
> +    struct port_array queues;

Should we rename the "port_array" library if we're going to use it for something other than ports?

> +};
> +
> +struct tc_ops {
> ...
> +    /* Constructor, for . */

This comment looks a bit, odd .

> +    int (*tc_install)(struct netdev *netdev, const struct shash *details);
> +    int (*tc_load)(struct netdev *, struct ofpbuf *nlmsg);
> +    void (*tc_destroy)(struct tc *);
> +
> +    /* Non-static member functions. */
> +    int (*qdisc_get)(const struct netdev *, struct shash *details);
> +    int (*qdisc_set)(struct netdev *, const struct shash *details);
> +
> +    int (*class_get)(const struct netdev *, unsigned int queue_id,
> +                     struct shash *details);
> +    int (*class_set)(struct netdev *, unsigned int queue_id,
> +                     const struct shash *details);
> +    int (*class_delete)(struct netdev *, unsigned int queue_id);
> +
> +    int (*class_get_stats)(const struct netdev *netdev, unsigned int queue_id,
> +                           struct netdev_queue_stats *stats);
> +    int (*class_dump_stats)(const struct netdev *netdev,
> +                            const struct ofpbuf *nlmsg,
> +                            netdev_dump_queue_stats_cb *cb, void *aux);
> +};

It may be nice to provide brief explanations of what these methods are supposed to do.

> +static const struct tc_ops tc_ops_htb;
> +static const struct tc_ops tc_ops_default;
> +static const struct tc_ops tc_ops_other;
> +
> +static const struct tc_ops *tcs[] = {
> +    &tc_ops_htb,                /* Hierarchy token bucket (see tc-htb(8)). */
> +    &tc_ops_default,            /* Default qdisc (see tc-pfifo_fast(8)). */

I don't think this is defined in the configuration schema.

> +    &tc_ops_other,              /* Some other qdisc. */
> +    NULL
> +};

The configuration schema indicates that linux-cbq is supported, but I don't see it defined anywhere.

> +static const struct tc_ops *
> +tc_lookup_linux_name(const char *name)
> +{
> +    const struct tc_ops **opsp;
> +
> +    for (opsp = tcs; *opsp != NULL; opsp++) {
> +        const struct tc_ops *ops = *opsp;
> +        if (!strcmp(name, ops->linux_name)) {

I believe "linux_name" may be NULL (like in linux-default), which would cause this to segfault.

> +static int
> +netdev_linux_get_queue(const struct netdev *netdev,
> +                       unsigned int queue_id, struct shash *details)
> +{
> ...
> +    return netdev_dev->tc->ops->class_get(netdev, queue_id, details);

The "class_get" method isn't always defined.  Do we know it will be if we've gotten to this point?

> +static int
> +netdev_linux_set_queue(struct netdev *netdev,
> +                       unsigned int queue_id, const struct shash *details)
> +{
> ...
> +    return netdev_dev->tc->ops->class_set(netdev, queue_id, details);

Same for "class_set"...

> +static int
> +netdev_linux_delete_queue(struct netdev *netdev, unsigned int queue_id)
> +{
> ....
> +    return netdev_dev->tc->ops->class_delete(netdev, queue_id);

...and "class_delete".

> +static int
> +htb_parse_class_details__(struct netdev *netdev,
> +                          const struct shash *details, struct htb_class *hc)
> +{
> ...
> +    hc->min_rate = strtoull(min_rate_s, NULL, 10) / 8;
> +    hc->min_rate = MAX(hc->min_rate, 10*1000 / 8);

What is this 10,000 min rate based on?  Should we have it be a #define?

> ...
> +    netdev_get_mtu(netdev, &mtu);
> +    hc->burst = burst_s ? strtoull(burst_s, NULL, 10) / 8 : 0;
> +    hc->burst = MAX(hc->burst, mtu + 64);

Similarly, what is the 64 based on?

> +static int
> +htb_class_get(const struct netdev *netdev, unsigned int queue_id,
> +              struct shash *details)
> ...
> +    shash_add(details, "burst", xasprintf("%u", hc->burst));

Don't you want to multiply this by eight, as well?

> +static void
> +read_psched(void)
> +{
> +    static const char fn[] = "/proc/net/psched";
> +    unsigned int a, b, c, d;

If you can derive them, I think more meaningful names would be helpful here.  Perhaps something along the lines of: nsec_per_usec, psched_us2ns, const_million, hertz.  I understand that they may not be consistent across kernel versions, though.

> +    ticks_per_s = (double) a * c / b;

On my Ubuntu 10.04 system running in VMware Fusion, I get the following from my psched:

	000003e8 00000040 000f4240 3b9aca00

That means "ticks_per_s" is 15,625,000.  Does that seem correct?

> +    if (c == 1000000) {
> +        hz = d;

The value for "d" on my box is 1,000,000,000, which seems like a high hertz value.

It may be worth sanity checking the numbers we get on our SPARC Linux box to see if there are any architectural effects on the numbers as well.

> +/* Returns the number of ticks that it would take to transmit 'size' bytes at a
> + * rate of 'rate' bytes per second. */
> +static unsigned int
> +tc_bytes_to_ticks(unsigned int rate, unsigned int size)
> +{
> +    if (!hz) {
> +        read_psched();
> +    }
> +    return (ticks_per_s * size) / rate;
> +}

With some of the values I mentioned earlier, it seems like we could run into overflow issues while doing this calculations.

> +/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
> + * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
> + * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
> + * stores NULL into it if it is absent.

It would be useful to know who owns "kind" and "options" when this returns.

> + *
> + * Returns 0 if successful, otherwise a positive errno value. */
> +static int
> +tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
> +               struct nlattr **options)
> +{
> ...
> +}

> +/* Initializes 'rate' properly for a rate of 'bps' bytes per second with an MTU
> + * of 'mtu'. */

That 'bps' should be 'Bps'.

> +/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
> + * attribute of the specified "type".
> + *
> + * See tc_calc_cell_log() above for a description of "rtab"s. */
> +static void
> +tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
> +{
> ...
> +}

> diff --git a/lib/netdev-patch.c b/lib/netdev-patch.c
> index 22353a1..94838f4 100644
> --- a/lib/netdev-patch.c
> +++ b/lib/netdev-patch.c
> 
> +    NULL,                       /* get_queue */
> +    NULL,                       /* set_queue */
> +    NULL,                       /* delete_queue */

I think set_queue and delete_queue are reversed here.

> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
> index 024b1fb..7f1703e 100644
> --- a/lib/netdev-provider.h
> +++ b/lib/netdev-provider.h
> 
> +    /* Queries 'netdev' for its capabilities regarding the specified 'type' of
> +     * QoS.  On success, initializes 'caps' with the QoS capabilities.
> +     *
> +     * Should return EOPNOTSUPP if 'netdev' does not support 'type'.  May be
> +     * NULL if 'netdev' does not support QoS at all. */
> +    int (*get_qos_capabilities)(const struct netdev *netdev,
> +                                const char *type,
> +                                struct netdev_qos_capabilities *details);

There's a mismatch between the description and this prototype in this last argument.  The description and netdev-linux implementation use "caps", but the prototype uses "details".

> +    /* Iterates over all of 'netdev''s queues, calling 'cb' with the queue's
> +     * ID, its configuration, and the 'aux' specified by the caller.  The order
> +     * of iteration is unspecified, but (when successful) each queue is visited
> +     * exactly once.
> +     *
> +     * 'cb' will not modify or free the 'details' argument passed in. */
> +    int (*dump_queues)(const struct netdev *netdev,
> +                       netdev_dump_queues_cb *cb, void *aux);

It might be nice to provide the prototype of "cb" right here..

> +    /* Iterates over all of 'netdev''s queues, calling 'cb' with the queue's
> +     * ID, its statistics, and the 'aux' specified by the caller.  The order of
> +     * iteration is unspecified, but (when successful) each queue must be
> +     * visited exactly once.
> +     *
> +     * 'cb' will not modify or free the statistics passed in. */
> +    int (*dump_queue_stats)(const struct netdev *netdev,
> +                            netdev_dump_queue_stats_cb *cb, void *aux);

Same here.

> diff --git a/lib/netdev.c b/lib/netdev.c
> index a15315a..a2f443f 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> 
> +/* Obtains the number of queues supported by 'netdev' for the specified 'type'
> + * of QoS.  Returns 0 if successful, otherwise a positive errno value.  Stores
> + * the number of queues (zero on failure) in '*n_queuesp'.
> + *
> + * This is just a simple wrapper around netdev_get_qos_capabilities(). */
> +int
> +netdev_get_n_queues(const struct netdev *netdev,
> +                    const char *type, int *n_queuesp)
> +{
> +    struct netdev_qos_capabilities caps;
> +    int retval;
> +
> +    retval = netdev_get_qos_capabilities(netdev, type, &caps);
> +    *n_queuesp = caps.n_queues;
> +    return retval;
> +}

It's probably not terribly important, but caps.n_queues is an unsigned int, but n_queuesp is signed.

> +
> +/* Queries 'netdev' about its currently configured form of QoS.  If successful,
> + * stores the name of the current form of QoS into '*typep', stores any details
> + * of configuration as string key-value pairs in 'details', and returns 0.  On
> + * failure, sets '*typep' to NULL, clears 'details', and returns a positive
> + * errno value.
> + *
> + * A '*typep' of "" indicates that QoS is currently disabled on 'netdev'.
> + *
> + * The caller must initialize 'details' (e.g. with shash_init()) before calling
> + * this function.  The caller must free 'details', including 'data' members,
> + * when it is no longer needed (e.g. with shash_destroy_free_data()).
> + *
> + * The caller must not modify or free '*typep'.
> + *
> + * '*typep' will be one of the types returned by netdev_get_qos_types() for
> + * 'netdev'.  The contents of 'details' should be documented as valid for
> + * '*typep' in the "other_config" column in the "QoS" table in
> + * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)). */
> +int
> +netdev_get_qos(const struct netdev *netdev,
> +               const char **typep, struct shash *details)
> +{
> +    const struct netdev_class *class = netdev_get_dev(netdev)->netdev_class;
> +    int retval;
> +
> +    if (class->get_qos) {
> +        retval = class->get_qos(netdev, typep, details);
> +        if (retval) {
> +            *typep = NULL;
> +            shash_clear_free_data(details);
> +        }
> +        return retval;
> +    } else {
> +        /* 'netdev' doesn't support QoS, so report that QoS is disabled. */
> +        *typep = "";

Based on the description, I would have expected "details" to be cleared.  I'm guessing you're not assuming that it's clean, since you free the data when the "get_qos" method isn't defined.

> +        return 0;
> +    }
> +}


> +/* Attempts to reconfigure QoS on 'netdev', changing the form of QoS to 'type'
> + * with details of configuration from 'details'.  Returns 0 if successful,
> + * otherwise a positive errno value.  On error, the previous QoS configuration
> + * is retained.
> + *
> + * When this function changes the type of QoS (not just 'details'), this also
> + * resets all queue configuration for 'netdev' to their defaults (which depend
> + * on the specific type of QoS).  Otherwise, the queue configuration for
> + * 'netdev' is unchanged.
> + *
> + * 'type' should be "" (to disable QoS) or one of the types returned by
> + * netdev_get_qos_types() for 'netdev'.  The contents of 'details' should be
> + * documented as valid for the given 'type' in the "other_config" column in the
> + * "QoS" table in vswitchd/vswitch.xml (which is built as
> + * ovs-vswitchd.conf.db(8)).
> + *
> + * NULL may be specified for 'details' if there are no configuration
> + * details. */
> +int
> +netdev_set_qos(struct netdev *netdev,
> +               const char *type, const struct shash *details)
> +{

I'm guessing that ownership of "details" is maintained by the caller.  It might be nice to mention that here and in netdev-provider.h.

> +    const struct netdev_class *class = netdev_get_dev(netdev)->netdev_class;
> +
> +    if (!type) {
> +        type = "";
> +    }
> +
> +    if (class->set_qos) {
> +        if (!details) {
> +            static struct shash empty = SHASH_INITIALIZER(&empty);
> +            details = ∅
> +        }
> +        return class->set_qos(netdev, type, details);
> +    } else {
> +        return *type ? EOPNOTSUPP : 0;

Won't "type" always be set?  It's set to "" early on, if the value was NULL.  

> +/* Iterates over all of 'netdev''s queues, calling 'cb' with the queue's ID,
> + * its statistics, and the 'aux' specified by the caller.  The order of
> + * iteration is unspecified, but (when successful) each queue is visited
> + * exactly once.
> + *
> + * Calling this function may be more efficient than calling
> + * netdev_get_queue_stats() for every queue.
> + *
> + * 'cb' must not modify or free the statistics passed in.
> + *
> + * Returns 0 if successful, otherwise a positive errno value.  On error, some
> + * configured queues may not have been included in the iteration. */
> +int
> +netdev_dump_queue_stats(const struct netdev *netdev,
> +                        netdev_dump_queue_stats_cb *cb, void *aux)
> +{
> +    const struct netdev_class *class = netdev_get_dev(netdev)->netdev_class;
> +    return (class->dump_queues

I think this should be "class->dump_queue_stats".

> +            ? class->dump_queue_stats(netdev, cb, aux)
> +            : EOPNOTSUPP);
> +}


> diff --git a/lib/ofp-util.c b/lib/ofp-util.c
> index d773a3f..e990f0f 100644
> 
> -check_action_port(int port, int max_ports)
> +check_output_port(uint16_t port, int max_ports)
> {
>     switch (port) {
>     case OFPP_IN_PORT:
> @@ -503,7 +507,7 @@ check_action_port(int port, int max_ports)
>         return 0;
> 
>     default:
> -        if (port >= 0 && port < max_ports) {
> +        if (port < max_ports) {

Isn't an OpenFlow port of zero still invalid (or at least preferred not to be used)?

> +/* Checks that 'action' is a valid OFPAT_ENQUEUE action, given that the switch
> + * will never have more than 'max_ports' ports.  Returns 0 if 'port' is valid,
> + * otherwise an ofp_mkerr() return code. */
> +static int
> +check_enqueue_action(const union ofp_action *a, unsigned int len,
> +                     int max_ports)
> +{
> ...
> +    oae = (const struct ofp_action_enqueue *) a;
> +    port = ntohs(oae->port);
> +    if (port < max_ports || port == OFPP_IN_PORT) {
> +        return 0;
> +    }

Same here.  I didn't think we'd ever have a port numbered zero.  

> +    VLOG_WARN_RL(&bad_ofmsg_rl, "unknown enqueue port %x", port);
> +    return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_OUT_PORT);
> +}


Is there a reason we don't check to see if the queue id is valid?  The OFPBAC_BAD_QUEUE message is available for raising the issue.

> diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
> index d877d36..b14a591 100644
> --- a/ofproto/ofproto.c
> +++ b/ofproto/ofproto.c
> 
> +static void
> +xlate_enqueue_action(struct action_xlate_ctx *ctx,
> +                     const struct ofp_action_enqueue *oae)
> ...
> +    /* Add ODP actions. */
> +    remove_pop_action(ctx);
> +    odp_actions_add(ctx->out, ODPAT_SET_PRIORITY)->priority.priority
> +        = TC_H_MAKE(1, ntohl(oae->queue_id)); /* XXX */

This is not particularly portable, especially since hardware will use a different mechanism.  I'm guessing that's what the "XXX" is for... 

> +struct queue_stats_cbdata {
> +    struct ofconn *ofconn;
> +    struct ofpbuf *msg;
> +    uint16_t port_no;
> +};
> +
> +static void
> +put_queue_stats(struct queue_stats_cbdata *cbdata,
> +                uint16_t port_no, uint16_t queue_id,
> +                const struct netdev_queue_stats *stats)
> +{
> +    struct ofp_queue_stats *reply;
> +
> +    reply = append_stats_reply(sizeof *reply, cbdata->ofconn, &cbdata->msg);
> +    reply->port_no = htons(port_no);

Since the port number is available in cbdata, is there a reason not to just always set the value there and not pass it in as an additional argument to the function?

> +static void
> +handle_queue_stats_for_port(struct ofport *port, uint16_t port_no,
> +                            uint16_t queue_id,
> +                            struct queue_stats_cbdata *cbdata)
> +{
> +    if (queue_id == OFPQ_ALL) {
> +        cbdata->port_no = port_no;
> +        netdev_dump_queue_stats(port->netdev,
> +                                handle_queue_stats_dump_cb, cbdata);
> +    } else {
> +        struct netdev_queue_stats stats;
> +
> +        netdev_get_queue_stats(port->netdev, queue_id, &stats);

Should we not check if the queue_id is valid and send an error if it's not?

> +static int
> +handle_queue_stats_request(struct ofproto *ofproto, struct ofconn *ofconn,
> +                           const struct ofp_stats_request *osr,
> +                           size_t arg_size)
> ...
> +    if (port_no == OFPP_ALL) {
> +        PORT_ARRAY_FOR_EACH (port, &ofproto->ports, port_no) {
> +            handle_queue_stats_for_port(port, port_no, queue_id, &cbdata);
> +        }
> +    } else if (port_no < ofproto->max_ports) {

Once again, would we ever allow a port zero?  I guess in this case it's not a big deal, since a reference to the port won't be found.

> --- a/ovsdb/ovsdbmonitor/Ui_MainWindow.py
> +++ b/ovsdb/ovsdbmonitor/Ui_MainWindow.py
> @@ -1,15 +1,17 @@
> # -*- coding: utf-8 -*-
> 
> -# Form implementation generated from reading ui file 'MainWindow.ui'
> +# Form implementation generated from reading ui file '../ovsdb/ovsdbmonitor/MainWindow.ui'

I'm going to assume this code is all right, since it was auto-generated.

> diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
> index b43a9bf..a13da6e 100644
> --- a/vswitchd/bridge.c
> +++ b/vswitchd/bridge.c
> 
> +static void
> +iface_update_qos(struct iface *iface, const struct ovsrec_qos *qos)
> +{
> ...
> +        /* Configure queues for 'iface'. */
> +        for (i = 0; i < qos->n_queues; i++) {
> +            const struct ovsrec_queue *queue = qos->value_queues[i];
> +            unsigned int queue_id = qos->key_queues[i];
> +
> +            shash_from_ovs_idl_map(queue->key_other_config,
> +                                   queue->value_other_config,
> +                                   queue->n_other_config, &details);
> +            netdev_set_queue(iface->netdev, queue_id, &details);
> +            shash_destroy(&details);
> +        }
> +
> +        /* Deconfigure queues that were deleted. */
> +        cbdata.netdev = iface->netdev;
> +        cbdata.queue_ids = qos->key_queues;
> +        cbdata.n_queue_ids = qos->n_queues;
> +        netdev_dump_queues(iface->netdev, iface_delete_queues, &cbdata);

Since there are potentially a lot of queues, would it make sense to deconfigure the queues that don't exist before configuring the ones that do?  We're already getting beat up on our memory usage in embedded devices.  ;-)

> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> index 345744e..910959f 100644
> --- a/vswitchd/vswitch.xml
> +++ b/vswitchd/vswitch.xml
> 
>       <column name="ingress_policing_rate">
> -        <p>Maximum rate for data received on this interface, in kbps.  Data
> +        <p>Maximum rate for data received on this interface, in bit/s.  Data

Is there a reason this was changed from kbits to bits?  I think the code is still doing kbits.

> +  <table name="Queue" title="QoS output queue.">
> +    <p>A configuration for a port output queue, used in configuring Quality of
> +      Server (QoS) features.  May be referenced by <ref column="queues"

I believe you meant "Service" not "Server".

Thanks for implementing all of this!  QoS functionality has been sorely needed for a long time.

--Justin






More information about the dev mailing list