[ovs-dev] [PATCH 3/4] tc: Add header rewrite using tc pedit action

Tue Aug 15 17:04:04 UTC 2017

On 15 August 2017 at 01:19, Paul Blakey <paulb at mellanox.com> wrote:
>
>
> On 15/08/2017 00:56, Joe Stringer wrote:
>>
>> On 8 August 2017 at 07:21, Roi Dayan <roid at mellanox.com> wrote:
>>>
>>> From: Paul Blakey <paulb at mellanox.com>
>>>
>>> @@ -346,6 +425,96 @@ nl_parse_flower_ip(struct nlattr **attrs, struct
>>> tc_flower *flower) {
>>>       }
>>>   }
>>>
>>> +static const struct nl_policy pedit_policy[] = {
>>> +            [TCA_PEDIT_PARMS_EX] = { .type = NL_A_UNSPEC,
>>> +                                     .min_len = sizeof(struct tc_pedit),
>>> +                                     .optional = false, },
>>> +            [TCA_PEDIT_KEYS_EX]   = { .type = NL_A_NESTED,
>>> +                                      .optional = false, },
>>> +};
>>> +
>>> +static int
>>> +nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower)
>>> +{
>>> +    struct nlattr *pe_attrs[ARRAY_SIZE(pedit_policy)];
>>> +    const struct tc_pedit *pe;
>>> +    const struct tc_pedit_key *keys;
>>> +    const struct nlattr *nla, *keys_ex, *ex_type;
>>> +    const void *keys_attr;
>>> +    char *rewrite_key = (void *) &flower->rewrite.key;
>>> +    char *rewrite_mask = (void *) &flower->rewrite.mask;
>>
>>
>> These are actually 'struct tc_flower_key', shouldn't we use the actual
>> types? (I realise there is pointer arithmetic below, but usually we
>> restrict the usage of void casting and pointer arithmetic as much as
>> possible).
>>
>
> Yes, It's for the pointer arithmetic. (void *) cast was for the
> clangs warning "cast increases required alignment of target type"
> which we couldn't find another way to suppress.
> I don't think alignments an issue here.

Ah. I don't have particularly much experience with pointer alignment.

>
>>> +    size_t keys_ex_size, left;
>>> +    int type, i = 0;
>>> +
>>> +    if (!nl_parse_nested(options, pedit_policy, pe_attrs,
>>> +                         ARRAY_SIZE(pedit_policy))) {
>>> +        VLOG_ERR_RL(&error_rl, "failed to parse pedit action options");
>>> +        return EPROTO;
>>> +    }
>>> +
>>> +    pe = nl_attr_get_unspec(pe_attrs[TCA_PEDIT_PARMS_EX], sizeof *pe);
>>> +    keys = pe->keys;
>>> +    keys_attr = pe_attrs[TCA_PEDIT_KEYS_EX];
>>> +    keys_ex = nl_attr_get(keys_attr);
>>> +    keys_ex_size = nl_attr_get_size(keys_attr);
>>> +
>>> +    NL_ATTR_FOR_EACH(nla, left, keys_ex, keys_ex_size) {
>>> +        if (i >= pe->nkeys) {
>>> +            break;
>>> +        }
>>> +
>>> +        if (nl_attr_type(nla) == TCA_PEDIT_KEY_EX) {
>>> +            ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE);
>>> +            type = nl_attr_get_u16(ex_type);
>>> +
>>> +            for (int j = 0; j < ARRAY_SIZE(flower_pedit_map); j++) {
>>> +                struct flower_key_to_pedit *m = &flower_pedit_map[j];
>>> +                int flower_off = j;
>>> +                int sz = m->size;
>>> +                int mf = m->offset;
>>> +
>>> +                if (!sz || m->htype != type) {
>>> +                   continue;
>>> +                }
>>
>>
>> If flower_pedit_map was just a regular array and the offset was
>> included in 'struct flower_key_to_pedit', then I don't think we need
>> the above check?
>>
>>> +
>>> +                /* check overlap between current pedit key, which is
>>> always
>>> +                 * 4 bytes (range [off, off + 3]), and a map entry in
>>> +                 * flower_pedit_map (range [mf, mf + sz - 1]) */
>>> +                if ((keys->off >= mf && keys->off < mf + sz)
>>> +                    || (keys->off + 3 >= mf && keys->off + 3 < mf + sz))
>>> {
>>> +                    int diff = flower_off + (keys->off - mf);
>>> +                    uint32_t *dst = (void *) (rewrite_key + diff);
>>> +                    uint32_t *dst_m = (void *) (rewrite_mask + diff);
>>> +                    uint32_t mask = ~(keys->mask);
>>> +                    uint32_t zero_bits;
>>> +
>>> +                    if (keys->off < mf) {
>>> +                        zero_bits = 8 * (mf - keys->off);
>>> +                        mask &= UINT32_MAX << zero_bits;
>>> +                    } else if (keys->off + 4 > mf + m->size) {
>>> +                        zero_bits = 8 * (keys->off + 4 - mf - m->size);
>>> +                        mask &= UINT32_MAX >> zero_bits;
>>> +                    }
>>
>>
>> I think this is all trying to avoid having a giant switch case here
>> which would handle all of the possible flower key attributes, similar
>> to somewhere else where this kind of iteration occurs.
>
> Right that was the objective.
>
>  Is there any
>>
>> overlap between this code and calc_offsets()calc_offsets was to calculate
>> the word offsets and masks for the first
>
> and last word to write, here its the reverse so its already in word size
> with correct masks, and easier to write back.

OK.

>>
>> Could we somehow trigger an assert or a strong warning message if the
>> offsets of our flower_pedit_map and the flower key don't align, given
>> that there's a bunch of pointer arithmetic happening here?
>>
>
> Not sure what you mean here, we are aligned to words (4 bytes) and
> "cut" the words if they overflow the target. we also padded flower.rewrite
> key and mask so we won't overflow to flower member we don't mean to write.

What I was trying to figure out is if there is any additional
compile-time check we could have to ensure that we copy exactly the
right bits to exactly the right place. I'm not sure that there is.

>> <snip>
>>
>>> @@ -589,6 +758,10 @@ nl_parse_single_action(struct nlattr *action, struct
>>> tc_flower *flower)
>>>           nl_parse_act_vlan(act_options, flower);
>>>       } else if (!strcmp(act_kind, "tunnel_key")) {
>>>           nl_parse_act_tunnel_key(act_options, flower);
>>> +    } else if (!strcmp(act_kind, "pedit")) {
>>> +        nl_parse_act_pedit(act_options, flower);
>>> +    } else if (!strcmp(act_kind, "csum")) {
>>> +        /* not doing anything for now */
>>
>>
>> Should we log a message that we're not handling csums?
>>
>
> We aren't handling them because we don't need to, OVS has an implicit
> csum recalculation of rewritten packet headers.

OK.

>> <snip>
>>
>>> +static int
>>> +nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request,
>>> +                                 struct tc_flower *flower)
>>> +{
>>> +    struct {
>>> +        struct tc_pedit sel;
>>> +        struct tc_pedit_key keys[MAX_PEDIT_OFFSETS];
>>> +        struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS];
>>> +    } sel = {
>>> +        .sel = {
>>> +            .nkeys = 0
>>> +        }
>>> +    };
>>> +    int i, j;
>>> +
>>> +    for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) {
>>> +        struct flower_key_to_pedit *m = &flower_pedit_map[i];
>>> +        struct tc_pedit_key *pedit_key = NULL;
>>> +        struct tc_pedit_key_ex *pedit_key_ex = NULL;
>>> +        uint32_t *mask, *data, first_word_mask, last_word_mask;
>>> +        int cnt = 0, cur_offset = 0;
>>> +
>>> +        if (!m->size) {
>>> +            continue;
>>> +        }
>>> +
>>> +        calc_offsets(flower, m, i, &cur_offset, &cnt, &last_word_mask,
>>> +                     &first_word_mask, &mask, &data);
>>> +
>>> +        for (j = 0; j < cnt; j++,  mask++, data++, cur_offset += 4) {
>>> +            uint32_t mask_word = *mask;
>>> +
>>> +            if (j == 0) {
>>> +                mask_word &= first_word_mask;
>>> +            }
>>> +            if (j == cnt - 1) {
>>> +                mask_word &= last_word_mask;
>>> +            }
>>> +            if (!mask_word) {
>>> +                continue;
>>> +            }
>>> +            if (sel.sel.nkeys == MAX_PEDIT_OFFSETS) {
>>> +                VLOG_ERR_RL(&error_rl, "reached too many pedit offsets:
>>> %d",
>>> +                            MAX_PEDIT_OFFSETS);
>>> +                return EOPNOTSUPP;
>>> +            }
>>
>>
>> Just wondering if this should be err or warn, since presumably the
>> result is that such flows avoid TC and should end up being correctly
>> handled by the openvswitch kernel datapath?
>
>
> Right, we'll change that.

Thanks.

>>> @@ -1062,7 +1422,7 @@ nl_msg_put_flower_options(struct ofpbuf *request,
>>> struct tc_flower *flower)
>>>           nl_msg_put_flower_tunnel(request, flower);
>>>       }
>>>
>>> -    nl_msg_put_flower_acts(request, flower);
>>> +    return 0;
>>>   }
>>
>>
>> The above snippet seems like it could be logically separate and kept
>> as a different patch, as it shouldn't affect behaviour at all but it
>> does actually introduce additional error checking. This could assist
>> git bisect if necessary.
>
>
> We'll do that, thanks.

OK.

>>
>>>
>>>   int
>>> @@ -1085,11 +1445,17 @@ tc_replace_flower(int ifindex, uint16_t prio,
>>> uint32_t handle,
>>>       nl_msg_put_string(&request, TCA_KIND, "flower");
>>>       basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
>>>       {
>>> -        nl_msg_put_flower_options(&request, flower);
>>> +        error = nl_msg_put_flower_options(&request, flower);
>>> +
>>> +        if (error) {
>>> +            ofpbuf_uninit(&request);
>>> +            return error;
>>> +        }
>>>       }
>>>       nl_msg_end_nested(&request, basic_offset);
>>>
>>>       error = tc_transact(&request, &reply);
>>> +
>>
>>
>> Random extra newline?
>
>
> Will be removed, thanks.
>
>
>>
>>>       if (!error) {
>>>           struct tcmsg *tc =
>>>               ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
>>> diff --git a/lib/tc.h b/lib/tc.h
>>> index 5f363d0..2269a22 100644
>>> --- a/lib/tc.h
>>> +++ b/lib/tc.h
>>> @@ -93,6 +93,7 @@ struct tc_flower_key {
>>>       struct {
>>>           ovs_be32 ipv4_src;
>>>           ovs_be32 ipv4_dst;
>>> +        uint8_t rewrite_ttl;
>>>       } ipv4;
>>>       struct {
>>>           struct in6_addr ipv6_src;
>>> @@ -117,6 +118,17 @@ struct tc_flower {
>>>       uint64_t lastused;
>>>
>>>       struct {
>>> +        bool rewrite;
>>> +        uint8_t pad1[3];
>>> +        struct tc_flower_key key;
>>> +        uint8_t pad2[3];
>>> +        struct tc_flower_key mask;
>>> +        uint8_t pad3[3];
>>> +    } rewrite;

Now that I get why the pads are here.. ;)

Is there an existing macro we can use to ensure that these pad out to
32-bit boundaries?

>>> +
>>> +    uint32_t csum_update_flags;
>>> +
>>> +    struct {
>>>           bool set;
>>>           ovs_be64 id;
>>>           ovs_be16 tp_src;
>>
>>
>> This is maybe an aside, but it seems like 'rewrite_ttl' already exists
>> in 'struct tc_flower_key'. Should it also be in 'struct tc_flower'
>> like this?
>>
>
> On latest master, we have ip_ttl in struct tc_flower, which is for matching
> on ipv6/ipv4 ttl/tos.
> This one is for rewriting of ttl, we need to distinguish between
> ipv4 ttl and ipv6 ttl to translate them to different action pedit
> offsets/layers, we didn't want to add ip protocol dependencies in the map.