[ovs-dev] [PATCH ovs V2 12/21] dpif-netlink: Use netdev flow put api to insert a flow
Paul Blakey
paulb at mellanox.com
Wed Jan 11 10:20:07 UTC 2017
On 10/01/2017 23:58, Joe Stringer wrote:
> On 10 January 2017 at 06:36, Paul Blakey <paulb at mellanox.com> wrote:
>>
>>
>> On 06/01/2017 01:28, Joe Stringer wrote:
>>>
>>> On 25 December 2016 at 03:39, Paul Blakey <paulb at mellanox.com> wrote:
>>>>
>>>> Using the new netdev flow api operate will now try and
>>>> offload flows to the relevant netdev of the input port.
>>>> Other operate methods flows will come in later patches.
>>>>
>>>> Signed-off-by: Paul Blakey <paulb at mellanox.com>
>>>> Reviewed-by: Roi Dayan <roid at mellanox.com>
>>>> ---
>>>> lib/dpif-netlink.c | 232
>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>> 1 file changed, 228 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>>>> index 3d8940e..717af90 100644
>>>> --- a/lib/dpif-netlink.c
>>>> +++ b/lib/dpif-netlink.c
>>>> @@ -1908,15 +1908,239 @@ dpif_netlink_operate__(struct dpif_netlink
>>>> *dpif,
>>>> return n_ops;
>>>> }
>>>>
>>>> +static int
>>>> +parse_key_and_mask_to_match(const struct nlattr *key, size_t key_len,
>>>> + const struct nlattr *mask, size_t mask_len,
>>>> + struct match *match)
>>>> +{
>>>> + enum odp_key_fitness fitness;
>>>> +
>>>> + fitness = odp_flow_key_to_flow(key, key_len, &match->flow);
>>>> + if (fitness) {
>>>> + /* This should not happen: it indicates that
>>>> odp_flow_key_from_flow()
>>>> + * and odp_flow_key_to_flow() disagree on the acceptable form of
>>>> a
>>>> + * flow. Log the problem as an error, with enough details to
>>>> enable
>>>> + * debugging. */
>>>> + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>>>> +
>>>> + if (!VLOG_DROP_ERR(&rl)) {
>>>> + struct ds s;
>>>> +
>>>> + ds_init(&s);
>>>> + odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
>>>> + VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
>>>> + ds_destroy(&s);
>>>> + }
>>>> +
>>>> + return EINVAL;
>>>> + }
>>>> +
>>>> + fitness = odp_flow_key_to_mask(mask, mask_len, &match->wc,
>>>> &match->flow);
>>>> + if (fitness) {
>>>> + /* This should not happen: it indicates that
>>>> + * odp_flow_key_from_mask() and odp_flow_key_to_mask()
>>>> + * disagree on the acceptable form of a mask. Log the problem
>>>> + * as an error, with enough details to enable debugging. */
>>>> + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>>>> +
>>>> + if (!VLOG_DROP_ERR(&rl)) {
>>>> + struct ds s;
>>>> +
>>>> + VLOG_ERR("internal error parsing flow mask %s (%s)",
>>>> + ds_cstr(&s), odp_key_fitness_to_string(fitness));
>>>> + ds_destroy(&s);
>>>> + }
>>>> +
>>>> + return EINVAL;
>>>> + }
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static bool
>>>> +parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put)
>>>> +{
>>>> + struct match match;
>>>> + odp_port_t in_port;
>>>> + const struct nlattr *nla;
>>>> + size_t left;
>>>> + int outputs = 0;
>>>> + struct ofpbuf buf;
>>>> + uint64_t act_stub[1024 / 8];
>>>> + size_t offset;
>>>> + struct nlattr *act;
>>>> + struct netdev *dev;
>>>> + int err;
>>>> +
>>>> + /* 0x1234 - fake eth type sent to probe feature */
>>>> + if (put->flags & DPIF_FP_PROBE || match.flow.dl_type ==
>>>> htons(0x1234)) {
>>>> + return false;
>>>> + }
>>>> +
>>>> + if (parse_key_and_mask_to_match(put->key, put->key_len, put->mask,
>>>> + put->mask_len, &match)) {
>>>> + return false;
>>>> + }
>>>> +
>>>> + in_port = match.flow.in_port.odp_port;
>>>> + ofpbuf_use_stub(&buf, act_stub, sizeof act_stub);
>>>> + offset = nl_msg_start_nested(&buf, OVS_FLOW_ATTR_ACTIONS);
>>>> + NL_ATTR_FOR_EACH(nla, left, put->actions, put->actions_len) {
>>>> + if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
>>>> + struct netdev *outdev;
>>>> + int ifindex_out;
>>>> + const struct netdev_tunnel_config *tnl_cfg;
>>>> + size_t out_off;
>>>> + odp_port_t out_port;
>>>> +
>>>> + outputs++;
>>>> + if (outputs > 1) {
>>>> + break;
>>>> + }
>>>> +
>>>> + out_port = nl_attr_get_u32(nla);
>>>> + outdev = netdev_hmap_port_get(out_port,
>>>> dpif->dpif.dpif_class);
>>>> + tnl_cfg = netdev_get_tunnel_config(outdev);
>>>> +
>>>> + out_off = nl_msg_start_nested(&buf, OVS_ACTION_ATTR_OUTPUT);
>>>> + ifindex_out = netdev_get_ifindex(outdev);
>>>> + nl_msg_put_u32(&buf, OVS_ACTION_ATTR_OUTPUT, ifindex_out);
>>>> + if (tnl_cfg && tnl_cfg->dst_port != 0) {
>>>> + nl_msg_put_u32(&buf, OVS_TUNNEL_KEY_ATTR_TP_DST,
>>>> tnl_cfg->dst_port);
>>>> + }
>>>> + nl_msg_end_nested(&buf, out_off);
>>>> +
>>>> + if (outdev) {
>>>> + netdev_close(outdev);
>>>> + }
>>>> + } else {
>>>> + nl_msg_put_unspec(&buf, nl_attr_type(nla), nl_attr_get(nla),
>>>> + nl_attr_get_size(nla));
>>>> + }
>>>> + }
>>>> + nl_msg_end_nested(&buf, offset);
>>>> +
>>>> + if (outputs > 1) {
>>>> + return false;
>>>> + }
>>>> +
>>>> + act = ofpbuf_at_assert(&buf, offset, sizeof(struct nlattr));
>>>> + dev = netdev_hmap_port_get(in_port, dpif->dpif.dpif_class);
>>>> + err = netdev_flow_put(dev, &match, CONST_CAST(struct nlattr *,
>>>> + nl_attr_get(act)),
>>>> + nl_attr_get_size(act), put->stats,
>>>> + CONST_CAST(ovs_u128 *, put->ufid));
>>>> + netdev_close(dev);
>>>> +
>>>> + if (!err) {
>>>> + if (put->flags & DPIF_FP_MODIFY) {
>>>> + struct dpif_op *opp;
>>>> + struct dpif_op op;
>>>> +
>>>> + op.type = DPIF_OP_FLOW_DEL;
>>>> + op.u.flow_del.key = put->key;
>>>> + op.u.flow_del.key_len = put->key_len;
>>>> + op.u.flow_del.ufid = put->ufid;
>>>> + op.u.flow_del.pmd_id = put->pmd_id;
>>>> + op.u.flow_del.stats = NULL;
>>>> + op.u.flow_del.terse = false;
>>>> +
>>>> + opp = &op;
>>>> + dpif_netlink_operate__(dpif, &opp, 1);
>>>> + }
>>>> + VLOG_DBG("added flow");
>>>> + return true;
>>>> + }
>>>> + VLOG_DBG("failed adding flow: %s", ovs_strerror(err));
>>>> +
>>>> + return false;
>>>> +}
>>>> +
>>>> +static void
>>>> +dbg_print_flow(const struct nlattr *key, size_t key_len,
>>>> + const struct nlattr *mask, size_t mask_len,
>>>> + const struct nlattr *actions, size_t actions_len,
>>>> + const ovs_u128 *ufid,
>>>> + const char *op)
>>>> +{
>>>> + struct ds s;
>>>> +
>>>> + ds_init(&s);
>>>> + ds_put_cstr(&s, op);
>>>> + ds_put_cstr(&s, " (");
>>>> + odp_format_ufid(ufid, &s);
>>>> + ds_put_cstr(&s, ")");
>>>> + if (key_len) {
>>>> + ds_put_cstr(&s, "\nflow (verbose): ");
>>>> + odp_flow_format(key, key_len, mask, mask_len, NULL, &s,
>>>> true);
>>>> + ds_put_cstr(&s, "\nflow: ");
>>>> + odp_flow_format(key, key_len, mask, mask_len, NULL, &s,
>>>> false);
>>>> + }
>>>> + if (actions_len) {
>>>> + ds_put_cstr(&s, "\nactions: ");
>>>> + format_odp_actions(&s, actions, actions_len);
>>>> + }
>>>> + VLOG_DBG("\n%s", ds_cstr(&s));
>>>> + ds_destroy(&s);
>>>> +}
>>>> +
>>>> +static bool
>>>> +try_send_to_netdev(struct dpif_netlink *dpif, struct dpif_op *op)
>>>> +{
>>>> + switch (op->type) {
>>>> + case DPIF_OP_FLOW_PUT: {
>>>> + struct dpif_flow_put *put = &op->u.flow_put;
>>>> +
>>>> + if (!put->ufid) {
>>>> + return false;
>>>> + }
>>>> + dbg_print_flow(put->key, put->key_len, put->mask, put->mask_len,
>>>> + put->actions, put->actions_len, put->ufid,
>>>> "PUT");
>>>> + return parse_flow_put(dpif, put);
>>>> + }
>>>> + case DPIF_OP_FLOW_DEL:
>>>> + case DPIF_OP_FLOW_GET:
>>>> + case DPIF_OP_EXECUTE:
>>>> + default:
>>>> + break;
>>>> + }
>>>> + return false;
>>>> +}
>>>> +
>>>> static void
>>>> dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t
>>>> n_ops)
>>>> {
>>>> struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>>>> + struct dpif_op **new_ops;
>>>> + int n_new_ops = 0;
>>>> + int i = 0;
>>>> +
>>>> + if (!netdev_flow_api_enabled) {
>>>> + new_ops = ops;
>>>> + n_new_ops = n_ops;
>>>> + } else {
>>>> + new_ops = xmalloc((sizeof *new_ops) * n_ops);
>>>> + n_new_ops = 0;
>>>> +
>>>> + for (i = 0; i < n_ops; i++) {
>>>> + if (try_send_to_netdev(dpif, ops[i])) {
>>>> + ops[i]->error = 0;
>>>
>>> What if the hardware returns EEXIST? Shouldn't we return EEXIST?
>>
>> Right it should, we'll fix that.
>>>
>>>
>>> What if the hardware reaches some resource constraint? This isn't
>>> required for an initial implementation, but it may be nice to have
>>> some heuristic to try to cut down on the failed syscalls if userspace
>>> has become aware that the hardware is out of resources. (Getting good
>>> visibility on this would also matter if you tried to deploy this).
>>
>> Right, do you mean that if certain kinds of flow fail (a specific mask
>> type), don't try again (with the same mask)?
>> Is it done in kernel?
>
> There's a couple of things: On the side of resource constraints, there
> is currently a ceiling of about 200,000 flows, above which userspace
> will not attempt to install a new flow. This logic is in
> ofproto-dpif-upcall. Depending on how complex your constraints are,
> maybe there is a way to model the hardware resource in the userspace
> so that once the limits are hit, userspace minimizes the number of
> failed syscalls due to hardware constraints. Simplest model would be
> something like, when TC starts returning error codes for ENOSPC or
> whatever the "out of hardware resource" error codes are, then you take
> the current number of hardware flows and choose that as the maximum
> number of hardware flows. Future flow installs to hardware will fail
> out early based on this "n_flows > max_flows" logic. Obviously
> depending on how constrained your hardware is, and what the
> constraints look like, this may be useful or useless. If hardware
> supports 2K flows, then you're more likely to get benefit out of being
> aware of this than if the hardware allows 200K arbitrary flows. Also I
> recognise that hardware may arrange flows differently so two flows may
> consume different amounts of hardware resources.
>
> The second part is if it's certain kinds of flow. The "probe"s in
> ofproto-dpif allow datapath feature detection at runtime, which can
> then be used to change the behaviour. For instance, if there is no
> support in datapath for a particular action, then the OpenFlow layer
> will return errors when a controller attempts to use that action (as
> we can't satisfy the flow mod request). For TC, it may be more like,
> during datapath initialization we detect the supported features so
> that flows may be checked against this feature support before going
> down to the kernel to install the flow (which would fail if, for
> instance, you tried to use one of the newer fields against an older
> kernel that has only the initial flower support).
>
Thanks for the suggestions. We need to look into that but I think
we can postpone this for the current patch set, and do these
optimizations later.
> One thing I'm considering to avoid is where ovs-vswitchd logs end up
> getting flooded with all of these "hardware flow failed to be
> installed" errors, and you spend extra CPU attempting things that we
> can easily detect and avoid. Something to consider.
>
> While I remember, please check all the VLOG calls and use ratelimiters
> for them. (I think I provided this feedback somewhere, but I put it
> here as well just in case I forgot to).
>
More information about the dev
mailing list