[ovs-dev] [ovs-dev v5] ipf: add ipf context

Mike Pattrick mkp at redhat.com
Mon Nov 29 21:08:14 UTC 2021


Looks good to me!

Acked-by: Mike Pattrick <mkp at redhat.com>


On Sat, 2021-11-27 at 03:39 +0000, Peng He wrote:
> From: Peng He <xnhp0320 at gmail.com>
> 
> ipf_postprocess will emit packets into the datapath pipeline ignoring
> the conntrack context, this might casuse weird issues when a packet
> batch has less space to hold all the fragments belonging to single
> packet.
> 
> Given the below ruleest and consider sending a 64K ICMP packet which
> is splitted into 64 fragments.
> 
> priority=1,action=drop
> priority=10,arp,action=normal
> priority=100,in_port=1,ct_state=-trk,icmp,action=ct(zone=9,table=0)
> priority=100,in_port=1,ct_state=+new+trk,icmp,action=ct(zone=9,commit),2
> priority=100,in_port=1,ct_state=-new+est+trk,icmp,action=2
> priority=100,in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9)
> priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1
> 
> Batch 1:
> the first 32 packets will be buffered in the ipf preprocessing, nothing
> more proceeds.
> 
> Batch 2:
> the second 32 packets succeed the fragment reassembly and goes to ct
> and ipf_post will emits the first 32 packets due to the limit of batch
> size.
> 
> the first 32 packets goes to the datapath again due to the
> recirculation, and again buffered at ipf preprocessing before ct commit,
> then the ovs tries to call ct commit as well as ipf_postprocessing which emits
> the last 32 packets, in this case the last 32 packets will follow
> the current actions which will be sent to port 2 directly without
> recirculation and going to ipf preprocssing and ct commit again.
> 
> This will cause the first 32 packets never get the chance to
> reassemble and evevntually this large ICMP packets fail to transmit.
> 
> this patch fixes this issue by adding firstly ipf context to avoid
> ipf_postprocessing emits packets in the wrong context. Then by
> re-executing the action list again to emit the last 32 packets
> in the right context to correctly transmitting multiple fragments.
> 
> This might be one of the root cause why the OVS log warns:
> 
> "
> skipping output to input port on bridge br-int while processing
> "
> 
> As a pkt might enter into different ipf context.
> 
> Signed-off-by: Peng He <hepeng.0320 at bytedance.com>
> Co-authored-by: Mike Pattrick <mkp at redhat.com>
> ---
>  lib/conntrack.c         |  9 ++++---
>  lib/conntrack.h         | 15 ++++++------
>  lib/dpif-netdev.c       | 52 +++++++++++++++++++++++++++++++++++++----
>  lib/ipf.c               | 40 ++++++++++++++++++++++++-------
>  lib/ipf.h               | 11 +++++++--
>  tests/system-traffic.at | 33 ++++++++++++++++++++++++++
>  tests/test-conntrack.c  |  6 ++---
>  7 files changed, 136 insertions(+), 30 deletions(-)
> 
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 33a1a9295..72999f1ae 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -1434,14 +1434,14 @@ process_one(struct conntrack *ct, struct dp_packet *pkt,
>   * connection tables.  'setmark', if not NULL, should point to a two
>   * elements array containing a value and a mask to set the connection mark.
>   * 'setlabel' behaves similarly for the connection label.*/
> -int
> +bool
>  conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
>                    ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
>                    const uint32_t *setmark,
>                    const struct ovs_key_ct_labels *setlabel,
>                    ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper,
>                    const struct nat_action_info_t *nat_action_info,
> -                  long long now, uint32_t tp_id)
> +                  long long now, uint32_t tp_id, struct ipf_ctx *ipf_ctx)
>  {
>      ipf_preprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone,
>                               ct->hash_basis);
> @@ -1468,9 +1468,8 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
>          }
>      }
>  
> -    ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type);
> -
> -    return 0;
> +    return ipf_postprocess_conntrack(ct->ipf, pkt_batch, ipf_ctx, \
> +                                     now, dl_type);
>  }
>  
>  void
> diff --git a/lib/conntrack.h b/lib/conntrack.h
> index 9553b188a..211efad3f 100644
> --- a/lib/conntrack.h
> +++ b/lib/conntrack.h
> @@ -64,6 +64,7 @@
>  struct dp_packet_batch;
>  
>  struct conntrack;
> +struct ipf_ctx;
>  
>  union ct_addr {
>      ovs_be32 ipv4;
> @@ -88,13 +89,13 @@ struct nat_action_info_t {
>  struct conntrack *conntrack_init(void);
>  void conntrack_destroy(struct conntrack *);
>  
> -int conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
> -                      ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
> -                      const uint32_t *setmark,
> -                      const struct ovs_key_ct_labels *setlabel,
> -                      ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper,
> -                      const struct nat_action_info_t *nat_action_info,
> -                      long long now, uint32_t tp_id);
> +bool conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
> +                       ovs_be16 dl_type, bool force, bool commit,
> +                       uint16_t zone, const uint32_t *setmark,
> +                       const struct ovs_key_ct_labels *setlabel,
> +                       ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper,
> +                       const struct nat_action_info_t *nat_action_info,
> +                       long long now, uint32_t tp_id, struct ipf_ctx *ipf_ctx);
>  void conntrack_clear(struct dp_packet *packet);
>  
>  struct conntrack_dump {
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 98453a206..af928a6fa 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -7756,6 +7756,7 @@ dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
>  struct dp_netdev_execute_aux {
>      struct dp_netdev_pmd_thread *pmd;
>      const struct flow *flow;
> +    const struct nlattr *redo_actions;
>  };
>  
>  static void
> @@ -8285,10 +8286,23 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
>              VLOG_WARN_RL(&rl, "NAT specified without commit.");
>          }
>  
> -        conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
> -                          commit, zone, setmark, setlabel, aux->flow->tp_src,
> -                          aux->flow->tp_dst, helper, nat_action_info_ref,
> -                          pmd->ctx.now / 1000, tp_id);
> +        bool more_pkts;
> +        struct ipf_ctx ipf_ctx = { aux->flow->recirc_id,
> +                                   aux->flow->in_port,
> +                                   zone };
> +        more_pkts = conntrack_execute(dp->conntrack, packets_,
> +                                      aux->flow->dl_type,
> +                                      force, commit, zone,
> +                                      setmark, setlabel,
> +                                      aux->flow->tp_src,
> +                                      aux->flow->tp_dst,
> +                                      helper, nat_action_info_ref,
> +                                      pmd->ctx.now / 1000, tp_id, &ipf_ctx);
> +        if (more_pkts) {
> +            aux->redo_actions = a;
> +        } else {
> +            aux->redo_actions = NULL;
> +        }
>          break;
>      }
>  
> @@ -8322,16 +8336,44 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
>      dp_packet_delete_batch(packets_, should_steal);
>  }
>  
> +static size_t
> +dp_netdev_find_actions_left(const struct nlattr *actions,
> +                            size_t actions_len,
> +                            const struct nlattr *target)
> +{
> +    const struct nlattr *a;
> +    size_t left;
> +    NL_ATTR_FOR_EACH_UNSAFE (a, left, actions, actions_len) {
> +        if (a == target) {
> +            break;
> +        }
> +    }
> +    return left;
> +}
> +
>  static void
>  dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
>                            struct dp_packet_batch *packets,
>                            bool should_steal, const struct flow *flow,
>                            const struct nlattr *actions, size_t actions_len)
>  {
> -    struct dp_netdev_execute_aux aux = { pmd, flow };
> +    int max_reexec = 2;
> +    struct dp_netdev_execute_aux aux = { pmd, flow, NULL };
>  
>      odp_execute_actions(&aux, packets, should_steal, actions,
>                          actions_len, dp_execute_cb);
> +
> +    while (OVS_UNLIKELY(aux.redo_actions && max_reexec --)) {
> +        struct dp_packet_batch batch;
> +        dp_packet_batch_init(&batch);
> +        size_t redo_actions_len = \
> +                                  dp_netdev_find_actions_left(actions,
> +                                          actions_len,
> +                                          aux.redo_actions);
> +
> +        odp_execute_actions(&aux, &batch, should_steal,
> +                aux.redo_actions, redo_actions_len, dp_execute_cb);
> +    }
>  }
>  
>  struct dp_netdev_ct_dump {
> diff --git a/lib/ipf.c b/lib/ipf.c
> index 507db2aea..1fd3d8d30 100644
> --- a/lib/ipf.c
> +++ b/lib/ipf.c
> @@ -1046,30 +1046,52 @@ ipf_send_frags_in_list(struct ipf *ipf, struct ipf_list *ipf_list,
>      OVS_NOT_REACHED();
>  }
>  
> +static bool
> +ipf_ctx_eq(struct ipf_list *ipf_list, struct ipf_ctx *ctx)
> +{
> +    struct dp_packet *pkt =
> +        ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
> +
> +    if (pkt->md.recirc_id != ctx->recirc_id ||
> +            pkt->md.in_port.odp_port != ctx->in_port.odp_port ||
> +            pkt->md.ct_zone != ctx->zone) {
> +        return false;
> +    }
> +    return true;
> +}
> +
>  /* Adds fragments associated with a completed fragment list to a packet batch
>   * to be processed by the calling application, typically conntrack. Also
>   * cleans up the list context when it is empty.*/
> -static void
> +static bool
>  ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch *pb,
> -                         long long now, bool v6)
> +                         struct ipf_ctx *ctx, long long now, bool v6)
>  {
>      if (ovs_list_is_empty(&ipf->frag_complete_list)) {
> -        return;
> +        return false;
>      }
>  
> +    bool more_pkts = false;
>      ovs_mutex_lock(&ipf->ipf_lock);
>      struct ipf_list *ipf_list, *next;
>  
>      LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_complete_list) {
> +
> +        if (ctx && !ipf_ctx_eq(ipf_list, ctx)) {
> +            continue;
> +        }
> +
>          if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_COMPLETED_LIST,
>                                     v6, now)) {
>              ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
>          } else {
> +            more_pkts = true;
>              break;
>          }
>      }
>  
>      ovs_mutex_unlock(&ipf->ipf_lock);
> +    return more_pkts;
>  }
>  
>  /* Conservatively adds fragments associated with a expired fragment list to
> @@ -1225,8 +1247,8 @@ ipf_post_execute_reass_pkts(struct ipf *ipf,
>   * be added to the batch to be sent through conntrack. */
>  void
>  ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
> -                         long long now, ovs_be16 dl_type, uint16_t zone,
> -                         uint32_t hash_basis)
> +                         long long now, ovs_be16 dl_type,
> +                         uint16_t zone, uint32_t hash_basis)
>  {
>      if (ipf_get_enabled(ipf)) {
>          ipf_extract_frags_from_batch(ipf, pb, dl_type, zone, now, hash_basis);
> @@ -1241,16 +1263,18 @@ ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
>   * through conntrack and adds these fragments to any batches seen.  Expired
>   * fragments are marked as invalid and also added to the batches seen
>   * with low priority.  Reassembled packets are freed. */
> -void
> +bool
>  ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
> -                          long long now, ovs_be16 dl_type)
> +                          struct ipf_ctx *ctx, long long now, ovs_be16 dl_type)
>  {
> +    bool more_pkts = false;
>      if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
>          bool v6 = dl_type == htons(ETH_TYPE_IPV6);
>          ipf_post_execute_reass_pkts(ipf, pb, v6);
> -        ipf_send_completed_frags(ipf, pb, now, v6);
> +        more_pkts = ipf_send_completed_frags(ipf, pb, ctx, now, v6);
>          ipf_send_expired_frags(ipf, pb, now, v6);
>      }
> +    return more_pkts;
>  }
>  
>  static void *
> diff --git a/lib/ipf.h b/lib/ipf.h
> index 6ac91b270..7bbf453af 100644
> --- a/lib/ipf.h
> +++ b/lib/ipf.h
> @@ -40,14 +40,21 @@ struct ipf_status {
>     unsigned int nfrag_max;
>  };
>  
> +struct ipf_ctx {
> +    uint32_t recirc_id;
> +    union flow_in_port in_port; /* Input port. */
> +    uint16_t zone;
> +};
> +
>  struct ipf *ipf_init(void);
>  void ipf_destroy(struct ipf *ipf);
>  void ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
>                                long long now, ovs_be16 dl_type, uint16_t zone,
>                                uint32_t hash_basis);
>  
> -void ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
> -                               long long now, ovs_be16 dl_type);
> +bool ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
> +                               struct ipf_ctx *ctx, long long now,
> +                               ovs_be16 dl_type);
>  
>  int ipf_set_enabled(struct ipf *ipf, bool v6, bool enable);
>  int ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value);
> diff --git a/tests/system-traffic.at b/tests/system-traffic.at
> index a69896d33..a09318882 100644
> --- a/tests/system-traffic.at
> +++ b/tests/system-traffic.at
> @@ -2741,6 +2741,39 @@ DPCTL_CHECK_FRAGMENTATION_PASS()
>  OVS_TRAFFIC_VSWITCHD_STOP
>  AT_CLEANUP
>  
> +AT_SETUP([conntrack - IPv4 large fragmentation])
> +CHECK_CONNTRACK()
> +OVS_TRAFFIC_VSWITCHD_START()
> +
> +ADD_NAMESPACES(at_ns0, at_ns1)
> +
> +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
> +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
> +
> +dnl Sending ping through conntrack
> +AT_DATA([flows.txt], [dnl
> +priority=1,action=drop
> +priority=10,arp,action=normal
> +priority=100,in_port=1,ct_state=-trk,icmp,action=ct(zone=9,table=0)
> +priority=100,in_port=1,ct_state=+new+trk,icmp,action=ct(zone=9,commit),2
> +priority=100,in_port=1,ct_state=-new+est+trk,icmp,action=2
> +priority=100,in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9)
> +priority=100,in_port=2,ct_state=+trk+est-new,icmp,action=1
> +])
> +
> +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt])
> +
> +dnl Modify userspace conntrack fragmentation handling.
> +DPCTL_MODIFY_FRAGMENTATION()
> +
> +dnl Ipv4 larger fragmentation connectivity check.
> +NS_CHECK_EXEC([at_ns0], [ping -s 65000 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl
> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> +])
> +
> +OVS_TRAFFIC_VSWITCHD_STOP
> +AT_CLEANUP
> +
>  AT_SETUP([conntrack - IPv4 fragmentation expiry])
>  CHECK_CONNTRACK()
>  OVS_TRAFFIC_VSWITCHD_START()
> diff --git a/tests/test-conntrack.c b/tests/test-conntrack.c
> index 24c93e4a4..f90b7aa78 100644
> --- a/tests/test-conntrack.c
> +++ b/tests/test-conntrack.c
> @@ -91,7 +91,7 @@ ct_thread_main(void *aux_)
>      ovs_barrier_block(&barrier);
>      for (i = 0; i < n_pkts; i += batch_size) {
>          conntrack_execute(ct, pkt_batch, dl_type, false, true, 0, NULL, NULL,
> -                          0, 0, NULL, NULL, now, 0);
> +                          0, 0, NULL, NULL, now, 0, NULL);
>          DP_PACKET_BATCH_FOR_EACH (j, pkt, pkt_batch) {
>              pkt_metadata_init_conn(&pkt->md);
>          }
> @@ -178,7 +178,7 @@ pcap_batch_execute_conntrack(struct conntrack *ct_,
>  
>          if (flow.dl_type != dl_type) {
>              conntrack_execute(ct_, &new_batch, dl_type, false, true, 0,
> -                              NULL, NULL, 0, 0, NULL, NULL, now, 0);
> +                              NULL, NULL, 0, 0, NULL, NULL, now, 0, NULL);
>              dp_packet_batch_init(&new_batch);
>          }
>          dp_packet_batch_add(&new_batch, packet);
> @@ -186,7 +186,7 @@ pcap_batch_execute_conntrack(struct conntrack *ct_,
>  
>      if (!dp_packet_batch_is_empty(&new_batch)) {
>          conntrack_execute(ct_, &new_batch, dl_type, false, true, 0, NULL, NULL,
> -                          0, 0, NULL, NULL, now, 0);
> +                          0, 0, NULL, NULL, now, 0, NULL);
>      }
>  
>  }



More information about the dev mailing list