[ovs-dev] [PATCH ovs V2 12/21] dpif-netlink: Use netdev flow put api to insert a flow

Wed Jan 11 10:20:07 UTC 2017

On 10/01/2017 23:58, Joe Stringer wrote:
> On 10 January 2017 at 06:36, Paul Blakey <paulb at mellanox.com> wrote:
>>
>>
>> On 06/01/2017 01:28, Joe Stringer wrote:
>>>
>>> On 25 December 2016 at 03:39, Paul Blakey <paulb at mellanox.com> wrote:
>>>>
>>>> Using the new netdev flow API, operate will now try to
>>>> offload flows to the relevant netdev of the input port.
>>>> Support for the other operate methods will come in later patches.
>>>>
>>>> Signed-off-by: Paul Blakey <paulb at mellanox.com>
>>>> Reviewed-by: Roi Dayan <roid at mellanox.com>
>>>> ---
>>>>   lib/dpif-netlink.c | 232
>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>   1 file changed, 228 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>>>> index 3d8940e..717af90 100644
>>>> --- a/lib/dpif-netlink.c
>>>> +++ b/lib/dpif-netlink.c
>>>> @@ -1908,15 +1908,239 @@ dpif_netlink_operate__(struct dpif_netlink
>>>> *dpif,
>>>>       return n_ops;
>>>>   }
>>>>
>>>> +static int
>>>> +parse_key_and_mask_to_match(const struct nlattr *key, size_t key_len,
>>>> +                            const struct nlattr *mask, size_t mask_len,
>>>> +                            struct match *match)
>>>> +{
>>>> +    enum odp_key_fitness fitness;
>>>> +
>>>> +    fitness = odp_flow_key_to_flow(key, key_len, &match->flow);
>>>> +    if (fitness) {
>>>> +        /* This should not happen: it indicates that
>>>> odp_flow_key_from_flow()
>>>> +         * and odp_flow_key_to_flow() disagree on the acceptable form of
>>>> a
>>>> +         * flow.  Log the problem as an error, with enough details to
>>>> enable
>>>> +         * debugging. */
>>>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>>>> +
>>>> +        if (!VLOG_DROP_ERR(&rl)) {
>>>> +            struct ds s;
>>>> +
>>>> +            ds_init(&s);
>>>> +            odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
>>>> +            VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
>>>> +            ds_destroy(&s);
>>>> +        }
>>>> +
>>>> +        return EINVAL;
>>>> +    }
>>>> +
>>>> +    fitness = odp_flow_key_to_mask(mask, mask_len, &match->wc,
>>>> &match->flow);
>>>> +    if (fitness) {
>>>> +        /* This should not happen: it indicates that
>>>> +         * odp_flow_key_from_mask() and odp_flow_key_to_mask()
>>>> +         * disagree on the acceptable form of a mask.  Log the problem
>>>> +         * as an error, with enough details to enable debugging. */
>>>> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>>>> +
>>>> +        if (!VLOG_DROP_ERR(&rl)) {
>>>> +            struct ds s;
>>>> +
>>>> +            VLOG_ERR("internal error parsing flow mask %s (%s)",
>>>> +                     ds_cstr(&s), odp_key_fitness_to_string(fitness));
>>>> +            ds_destroy(&s);
>>>> +        }
>>>> +
>>>> +        return EINVAL;
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static bool
>>>> +parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put)
>>>> +{
>>>> +    struct match match;
>>>> +    odp_port_t in_port;
>>>> +    const struct nlattr *nla;
>>>> +    size_t left;
>>>> +    int outputs = 0;
>>>> +    struct ofpbuf buf;
>>>> +    uint64_t act_stub[1024 / 8];
>>>> +    size_t offset;
>>>> +    struct nlattr *act;
>>>> +    struct netdev *dev;
>>>> +    int err;
>>>> +
>>>> +    /* 0x1234 - fake eth type sent to probe feature */
>>>> +    if (put->flags & DPIF_FP_PROBE || match.flow.dl_type ==
>>>> htons(0x1234)) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    if (parse_key_and_mask_to_match(put->key, put->key_len, put->mask,
>>>> +                                put->mask_len, &match)) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    in_port = match.flow.in_port.odp_port;
>>>> +    ofpbuf_use_stub(&buf, act_stub, sizeof act_stub);
>>>> +    offset = nl_msg_start_nested(&buf, OVS_FLOW_ATTR_ACTIONS);
>>>> +    NL_ATTR_FOR_EACH(nla, left, put->actions, put->actions_len) {
>>>> +        if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
>>>> +            struct netdev *outdev;
>>>> +            int ifindex_out;
>>>> +            const struct netdev_tunnel_config *tnl_cfg;
>>>> +            size_t out_off;
>>>> +            odp_port_t out_port;
>>>> +
>>>> +            outputs++;
>>>> +            if (outputs > 1) {
>>>> +                break;
>>>> +            }
>>>> +
>>>> +            out_port = nl_attr_get_u32(nla);
>>>> +            outdev = netdev_hmap_port_get(out_port,
>>>> dpif->dpif.dpif_class);
>>>> +            tnl_cfg = netdev_get_tunnel_config(outdev);
>>>> +
>>>> +            out_off = nl_msg_start_nested(&buf, OVS_ACTION_ATTR_OUTPUT);
>>>> +            ifindex_out = netdev_get_ifindex(outdev);
>>>> +            nl_msg_put_u32(&buf, OVS_ACTION_ATTR_OUTPUT, ifindex_out);
>>>> +            if (tnl_cfg && tnl_cfg->dst_port != 0) {
>>>> +                nl_msg_put_u32(&buf, OVS_TUNNEL_KEY_ATTR_TP_DST,
>>>> tnl_cfg->dst_port);
>>>> +            }
>>>> +            nl_msg_end_nested(&buf, out_off);
>>>> +
>>>> +            if (outdev) {
>>>> +                netdev_close(outdev);
>>>> +            }
>>>> +        } else {
>>>> +            nl_msg_put_unspec(&buf, nl_attr_type(nla), nl_attr_get(nla),
>>>> +                              nl_attr_get_size(nla));
>>>> +        }
>>>> +    }
>>>> +    nl_msg_end_nested(&buf, offset);
>>>> +
>>>> +    if (outputs > 1) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    act = ofpbuf_at_assert(&buf, offset, sizeof(struct nlattr));
>>>> +    dev = netdev_hmap_port_get(in_port, dpif->dpif.dpif_class);
>>>> +    err = netdev_flow_put(dev, &match, CONST_CAST(struct nlattr *,
>>>> +                                                  nl_attr_get(act)),
>>>> +                          nl_attr_get_size(act), put->stats,
>>>> +                          CONST_CAST(ovs_u128 *, put->ufid));
>>>> +    netdev_close(dev);
>>>> +
>>>> +    if (!err) {
>>>> +        if (put->flags & DPIF_FP_MODIFY) {
>>>> +            struct dpif_op *opp;
>>>> +            struct dpif_op op;
>>>> +
>>>> +            op.type = DPIF_OP_FLOW_DEL;
>>>> +            op.u.flow_del.key = put->key;
>>>> +            op.u.flow_del.key_len = put->key_len;
>>>> +            op.u.flow_del.ufid = put->ufid;
>>>> +            op.u.flow_del.pmd_id = put->pmd_id;
>>>> +            op.u.flow_del.stats = NULL;
>>>> +            op.u.flow_del.terse = false;
>>>> +
>>>> +            opp = &op;
>>>> +            dpif_netlink_operate__(dpif, &opp, 1);
>>>> +        }
>>>> +        VLOG_DBG("added flow");
>>>> +        return true;
>>>> +    }
>>>> +    VLOG_DBG("failed adding flow: %s", ovs_strerror(err));
>>>> +
>>>> +    return false;
>>>> +}
>>>> +
>>>> +static void
>>>> +dbg_print_flow(const struct nlattr *key, size_t key_len,
>>>> +               const struct nlattr *mask, size_t mask_len,
>>>> +               const struct nlattr *actions, size_t actions_len,
>>>> +               const ovs_u128 *ufid,
>>>> +               const char *op)
>>>> +{
>>>> +        struct ds s;
>>>> +
>>>> +        ds_init(&s);
>>>> +        ds_put_cstr(&s, op);
>>>> +        ds_put_cstr(&s, " (");
>>>> +        odp_format_ufid(ufid, &s);
>>>> +        ds_put_cstr(&s, ")");
>>>> +        if (key_len) {
>>>> +            ds_put_cstr(&s, "\nflow (verbose): ");
>>>> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s,
>>>> true);
>>>> +            ds_put_cstr(&s, "\nflow: ");
>>>> +            odp_flow_format(key, key_len, mask, mask_len, NULL, &s,
>>>> false);
>>>> +        }
>>>> +        if (actions_len) {
>>>> +            ds_put_cstr(&s, "\nactions: ");
>>>> +            format_odp_actions(&s, actions, actions_len);
>>>> +        }
>>>> +        VLOG_DBG("\n%s", ds_cstr(&s));
>>>> +        ds_destroy(&s);
>>>> +}
>>>> +
>>>> +static bool
>>>> +try_send_to_netdev(struct dpif_netlink *dpif, struct dpif_op *op)
>>>> +{
>>>> +    switch (op->type) {
>>>> +    case DPIF_OP_FLOW_PUT: {
>>>> +        struct dpif_flow_put *put = &op->u.flow_put;
>>>> +
>>>> +        if (!put->ufid) {
>>>> +            return false;
>>>> +        }
>>>> +        dbg_print_flow(put->key, put->key_len, put->mask, put->mask_len,
>>>> +                       put->actions, put->actions_len, put->ufid,
>>>> "PUT");
>>>> +        return parse_flow_put(dpif, put);
>>>> +    }
>>>> +    case DPIF_OP_FLOW_DEL:
>>>> +    case DPIF_OP_FLOW_GET:
>>>> +    case DPIF_OP_EXECUTE:
>>>> +    default:
>>>> +        break;
>>>> +    }
>>>> +    return false;
>>>> +}
>>>> +
>>>>   static void
>>>>   dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t
>>>> n_ops)
>>>>   {
>>>>       struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>>>> +    struct dpif_op **new_ops;
>>>> +    int n_new_ops = 0;
>>>> +    int i = 0;
>>>> +
>>>> +    if (!netdev_flow_api_enabled) {
>>>> +        new_ops = ops;
>>>> +        n_new_ops = n_ops;
>>>> +    } else {
>>>> +        new_ops = xmalloc((sizeof *new_ops) * n_ops);
>>>> +        n_new_ops = 0;
>>>> +
>>>> +        for (i = 0; i < n_ops; i++) {
>>>> +            if (try_send_to_netdev(dpif, ops[i])) {
>>>> +                ops[i]->error = 0;
>>>
>>> What if the hardware returns EEXIST? Shouldn't we return EEXIST?
>>
>> Right, it should; we'll fix that.
>>>
>>>
>>> What if the hardware reaches some resource constraint? This isn't
>>> required for an initial implementation, but it may be nice to have
>>> some heuristic to try to cut down on the failed syscalls if userspace
>>> has become aware that the hardware is out of resources. (Getting good
>>> visibility on this would also matter if you tried to deploy this).
>>
>> Right. Do you mean that if a certain kind of flow fails (a specific mask
>> type), we shouldn't try again (with the same mask)?
>> Is that done in the kernel?
>
> There are a couple of things. On the side of resource constraints, there
> is currently a ceiling of about 200,000 flows, above which userspace
> will not attempt to install a new flow. This logic is in
> ofproto-dpif-upcall. Depending on how complex your constraints are,
> maybe there is a way to model the hardware resource in the userspace
> so that once the limits are hit, userspace minimizes the number of
> failed syscalls due to hardware constraints. The simplest model would be
> something like this: when TC starts returning ENOSPC, or
> whatever the "out of hardware resource" error codes are, you take
> the current number of hardware flows and choose that as the maximum
> number of hardware flows. Future flow installs to hardware will fail
> out early based on this "n_flows > max_flows" logic. Obviously
> depending on how constrained your hardware is, and what the
> constraints look like, this may be useful or useless. If hardware
> supports 2K flows, then you're more likely to get benefit out of being
> aware of this than if the hardware allows 200K arbitrary flows. Also I
> recognise that hardware may arrange flows differently so two flows may
> consume different amounts of hardware resources.
>
> The second part is when only certain kinds of flow fail. The "probes" in
> ofproto-dpif allow datapath feature detection at runtime, which can
> then be used to change the behaviour. For instance, if there is no
> support in datapath for a particular action, then the OpenFlow layer
> will return errors when a controller attempts to use that action (as
> we can't satisfy the flow mod request). For TC, it may be more like,
> during datapath initialization we detect the supported features so
> that flows may be checked against this feature support before going
> down to the kernel to install the flow (which would fail if, for
> instance, you tried to use one of the newer fields against an older
> kernel that has only the initial flower support).
>

Thanks for the suggestions. We need to look into that, but I think
we can postpone it for the current patch set and do these
optimizations later.

> One thing I'd like to avoid is the case where ovs-vswitchd logs end up
> getting flooded with all of these "hardware flow failed to be
> installed" errors, and you spend extra CPU attempting things that we
> can easily detect and avoid. Something to consider.
>
> While I remember, please check all the VLOG calls and use rate limiters
> for them. (I think I provided this feedback somewhere, but I'm putting it
> here as well just in case I forgot to.)
>