[ovs-dev] [PATCH 8/8] dpif-netlink: Allow MRU packet attribute.

Joe Stringer joestringer at nicira.com
Thu Sep 10 02:00:22 UTC 2015


From: Andy Zhou <azhou at nicira.com>

User space now may receive re-assembled IP fragments. The user space
netlink handler can now accept packets with the new OVS_PACKET_ATTR_MRU
attribute. This allows the kernel to assemble fragmented packets for the
duration of OpenFlow processing, then re-fragment at output time. Most
notably this occurs for packets that are sent through the connection
tracker.

Signed-off-by: Andy Zhou <azhou at nicira.com>
---
 datapath/linux/compat/include/linux/openvswitch.h |   5 +
 lib/dpif-netlink.c                                |   5 +
 lib/dpif.c                                        |   2 +
 lib/dpif.h                                        |   3 +
 ofproto/ofproto-dpif-upcall.c                     |  17 +-
 ofproto/ofproto-dpif.c                            |   3 +
 tests/system-traffic.at                           | 221 ++++++++++++++++++++++
 7 files changed, 254 insertions(+), 2 deletions(-)

diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h
index bbc6ca4..1997175 100644
--- a/datapath/linux/compat/include/linux/openvswitch.h
+++ b/datapath/linux/compat/include/linux/openvswitch.h
@@ -187,6 +187,10 @@ enum ovs_packet_cmd {
  * %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the
  * output port is actually a tunnel port. Contains the output tunnel key
  * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes.
+ * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and
+ * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment
+ * size.
+ *
  * These attributes follow the &struct ovs_header within the Generic Netlink
  * payload for %OVS_PACKET_* commands.
  */
@@ -202,6 +206,7 @@ enum ovs_packet_attr {
 	OVS_PACKET_ATTR_UNUSED2,
 	OVS_PACKET_ATTR_PROBE,      /* Packet operation is a feature probe,
 				       error logging should be suppressed. */
+	OVS_PACKET_ATTR_MRU,	    /* Maximum received IP fragment size. */
 	__OVS_PACKET_ATTR_MAX
 };
 
diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index ffeb124..007f4ec 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -1547,6 +1547,9 @@ dpif_netlink_encode_execute(int dp_ifindex, const struct dpif_execute *d_exec,
     if (d_exec->probe) {
         nl_msg_put_flag(buf, OVS_PACKET_ATTR_PROBE);
     }
+    if (d_exec->mru) {
+        nl_msg_put_u16(buf, OVS_PACKET_ATTR_MRU, d_exec->mru);
+    }
 }
 
 /* Executes, against 'dpif', up to the first 'n_ops' operations in 'ops'.
@@ -1965,6 +1968,7 @@ parse_odp_packet(const struct dpif_netlink *dpif, struct ofpbuf *buf,
         [OVS_PACKET_ATTR_USERDATA] = { .type = NL_A_UNSPEC, .optional = true },
         [OVS_PACKET_ATTR_EGRESS_TUN_KEY] = { .type = NL_A_NESTED, .optional = true },
         [OVS_PACKET_ATTR_ACTIONS] = { .type = NL_A_NESTED, .optional = true },
+        [OVS_PACKET_ATTR_MRU] = { .type = NL_A_BE16, .optional = true }
     };
 
     struct ovs_header *ovs_header;
@@ -2002,6 +2006,7 @@ parse_odp_packet(const struct dpif_netlink *dpif, struct ofpbuf *buf,
     upcall->userdata = a[OVS_PACKET_ATTR_USERDATA];
     upcall->out_tun_key = a[OVS_PACKET_ATTR_EGRESS_TUN_KEY];
     upcall->actions = a[OVS_PACKET_ATTR_ACTIONS];
+    upcall->mru = a[OVS_PACKET_ATTR_MRU];
 
     /* Allow overwriting the netlink attribute header without reallocating. */
     dp_packet_use_stub(&upcall->packet,
diff --git a/lib/dpif.c b/lib/dpif.c
index c03aa1d..f56d2f2 100644
--- a/lib/dpif.c
+++ b/lib/dpif.c
@@ -1126,6 +1126,7 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet **packets, int cnt,
         execute.packet = packet;
         execute.needs_help = false;
         execute.probe = false;
+        execute.mru = 0;
         aux->error = dpif_execute(aux->dpif, &execute);
         log_execute_message(aux->dpif, &execute, true, aux->error);
 
@@ -1693,6 +1694,7 @@ log_execute_message(struct dpif *dpif, const struct dpif_execute *execute,
             ds_put_format(&ds, " failed (%s)", ovs_strerror(error));
         }
         ds_put_format(&ds, " on packet %s", packet);
+        ds_put_format(&ds, " mtu %d", execute->mru);
         vlog(THIS_MODULE, error ? VLL_WARN : VLL_DBG, "%s", ds_cstr(&ds));
         ds_destroy(&ds);
         free(packet);
diff --git a/lib/dpif.h b/lib/dpif.h
index 1b8a7b9..76317e4 100644
--- a/lib/dpif.h
+++ b/lib/dpif.h
@@ -697,6 +697,8 @@ struct dpif_execute {
     size_t actions_len;             /* Length of 'actions' in bytes. */
     bool needs_help;
     bool probe;                     /* Suppress error messages. */
+    unsigned int mru;               /* packet's Maximum receive unit.
+                                       0 if not a fragmented packet */
 
     /* Input, but possibly modified as a side effect of execution. */
     struct dp_packet *packet;          /* Packet to execute. */
@@ -780,6 +782,7 @@ struct dpif_upcall {
     struct nlattr *key;         /* Flow key. */
     size_t key_len;             /* Length of 'key' in bytes. */
     ovs_u128 ufid;              /* Unique flow identifier for 'key'. */
+    struct nlattr *mru;         /* Maximum receive unit. */
 
     /* DPIF_UC_ACTION only. */
     struct nlattr *userdata;    /* Argument to OVS_ACTION_ATTR_USERSPACE. */
diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
index 8a43bbf..badb4d9 100644
--- a/ofproto/ofproto-dpif-upcall.c
+++ b/ofproto/ofproto-dpif-upcall.c
@@ -183,6 +183,8 @@ struct upcall {
     unsigned pmd_id;               /* Datapath poll mode driver id. */
     const struct dp_packet *packet;   /* Packet associated with this upcall. */
     ofp_port_t in_port;            /* OpenFlow in port, or OFPP_NONE. */
+    unsigned int  mru;             /* If !0, Maximum receive unit of
+                                      fragmented IP packet */
 
     enum dpif_upcall_type type;    /* Datapath type of the upcall. */
     const struct nlattr *userdata; /* Userdata for DPIF_UC_ACTION Upcalls. */
@@ -329,6 +331,7 @@ static enum upcall_type classify_upcall(enum dpif_upcall_type type,
 static int upcall_receive(struct upcall *, const struct dpif_backer *,
                           const struct dp_packet *packet, enum dpif_upcall_type,
                           const struct nlattr *userdata, const struct flow *,
+                          const unsigned int mru,
                           const ovs_u128 *ufid, const unsigned pmd_id);
 static void upcall_uninit(struct upcall *);
 
@@ -716,6 +719,7 @@ recv_upcalls(struct handler *handler)
         struct dpif_upcall *dupcall = &dupcalls[n_upcalls];
         struct upcall *upcall = &upcalls[n_upcalls];
         struct flow *flow = &flows[n_upcalls];
+        unsigned int mru;
         int error;
 
         ofpbuf_use_stub(recv_buf, recv_stubs[n_upcalls],
@@ -730,8 +734,14 @@ recv_upcalls(struct handler *handler)
             goto free_dupcall;
         }
 
+        if (dupcall->mru) {
+            mru = nl_attr_get_u16(dupcall->mru);
+        } else {
+            mru = 0;
+        }
+
         error = upcall_receive(upcall, udpif->backer, &dupcall->packet,
-                               dupcall->type, dupcall->userdata, flow,
+                               dupcall->type, dupcall->userdata, flow, mru,
                                &dupcall->ufid, PMD_ID_NULL);
         if (error) {
             if (error == ENODEV) {
@@ -977,6 +987,7 @@ static int
 upcall_receive(struct upcall *upcall, const struct dpif_backer *backer,
                const struct dp_packet *packet, enum dpif_upcall_type type,
                const struct nlattr *userdata, const struct flow *flow,
+               const unsigned int mru,
                const ovs_u128 *ufid, const unsigned pmd_id)
 {
     int error;
@@ -1006,6 +1017,7 @@ upcall_receive(struct upcall *upcall, const struct dpif_backer *backer,
     upcall->ukey = NULL;
     upcall->key = NULL;
     upcall->key_len = 0;
+    upcall->mru = mru;
 
     upcall->out_tun_key = NULL;
     upcall->actions = NULL;
@@ -1133,7 +1145,7 @@ upcall_cb(const struct dp_packet *packet, const struct flow *flow, ovs_u128 *ufi
     atomic_read_relaxed(&udpif->flow_limit, &flow_limit);
 
     error = upcall_receive(&upcall, udpif->backer, packet, type, userdata,
-                           flow, ufid, pmd_id);
+                           flow, 0, ufid, pmd_id);
     if (error) {
         return error;
     }
@@ -1355,6 +1367,7 @@ handle_upcalls(struct udpif *udpif, struct upcall *upcalls,
             op->dop.u.execute.actions_len = upcall->odp_actions.size;
             op->dop.u.execute.needs_help = (upcall->xout.slow & SLOW_ACTION) != 0;
             op->dop.u.execute.probe = false;
+            op->dop.u.execute.mru = upcall->mru;
         }
     }
 
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index ddf177a..b210881 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -1116,6 +1116,7 @@ check_variable_length_userdata(struct dpif_backer *backer)
     execute.packet = &packet;
     execute.needs_help = false;
     execute.probe = true;
+    execute.mru = 0;
 
     error = dpif_execute(backer->dpif, &execute);
 
@@ -1214,6 +1215,7 @@ check_masked_set_action(struct dpif_backer *backer)
     execute.packet = &packet;
     execute.needs_help = false;
     execute.probe = true;
+    execute.mru = 0;
 
     error = dpif_execute(backer->dpif, &execute);
 
@@ -3722,6 +3724,7 @@ ofproto_dpif_execute_actions(struct ofproto_dpif *ofproto,
     execute.packet = packet;
     execute.needs_help = (xout.slow & SLOW_ACTION) != 0;
     execute.probe = false;
+    execute.mru = 0;
 
     /* Fix up in_port. */
     in_port = flow->in_port.ofp_port;
diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index cdd0315..4074eaa 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -923,3 +923,224 @@ TIME_WAIT src=10.1.1.1 dst=10.1.1.2 sport=<cleared> dport=<cleared> src=10.1.1.2
 
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
+
+AT_SETUP([fragmentation - ipv4 icmp ])
+CHECK_CONNTRACK()
+OVS_TRAFFIC_VSWITCHD_START(
+   [set-fail-mode br0 secure -- ])
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
+ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
+
+dnl Sending ping through conntrack
+AT_DATA([flows.txt], [dnl
+priority=1,action=drop
+priority=10,arp,action=normal
+in_port=1,icmp,action=ct(commit,zone=9),2
+in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9)
+in_port=2,ct_state=+trk+est-new,icmp,action=1
+])
+
+AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
+
+dnl Basic connectivity check.
+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Ipv4 fragmentation connectivity check.
+NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Ipv4 larger fragmentation connectivity check.
+NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
+
+AT_SETUP([fragmentation - ipv4 icmp + vlan])
+CHECK_CONNTRACK()
+OVS_TRAFFIC_VSWITCHD_START(
+   [set-fail-mode br0 secure -- ])
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
+ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
+ADD_VLAN(p0, at_ns0, 100, "10.2.2.1/24")
+ADD_VLAN(p1, at_ns1, 100, "10.2.2.2/24")
+
+dnl Sending ping through conntrack
+AT_DATA([flows.txt], [dnl
+priority=1,action=drop
+priority=10,arp,action=normal
+in_port=1,icmp,action=ct(commit,zone=9),2
+in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9)
+in_port=2,ct_state=+trk+est-new,icmp,action=1
+])
+
+AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
+
+dnl Basic connectivity check.
+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Ipv4 fragmentation connectivity check.
+NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Ipv4 larger fragmentation connectivity check.
+NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
+
+AT_SETUP([fragmentation - ipv6 icmp ])
+CHECK_CONNTRACK()
+OVS_TRAFFIC_VSWITCHD_START(
+   [set-fail-mode br0 secure -- ])
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+ADD_VETH(p0, at_ns0, br0, "fc00::1/96")
+ADD_VETH(p1, at_ns1, br0, "fc00::2/96")
+
+dnl Sending ping through conntrack
+AT_DATA([flows.txt], [dnl
+priority=1,action=drop
+priority=10,in_port=1,ipv6,action=ct(commit,zone=9),2
+priority=10,in_port=2,ct_state=-trk,ipv6,action=ct(table=0,zone=9)
+priority=10,in_port=2,ct_state=+trk+est-new,ipv6,action=1
+icmp6,icmp_type=135,action=normal
+icmp6,icmp_type=136,action=normal
+])
+
+AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
+
+dnl Without this sleep, we get occasional failures due to the following error:
+dnl "connect: Cannot assign requested address"
+sleep 2;
+
+dnl Basic connectivity check.
+NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Ipv4 fragmentation connectivity check.
+NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Ipv4 larger fragmentation connectivity check.
+NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
+
+AT_SETUP([fragmentation - ipv6 icmp + vlan])
+CHECK_CONNTRACK()
+OVS_TRAFFIC_VSWITCHD_START(
+   [set-fail-mode br0 secure -- ])
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+ADD_VETH(p0, at_ns0, br0, "fc00::1/96")
+ADD_VETH(p1, at_ns1, br0, "fc00::2/96")
+
+ADD_VLAN(p0, at_ns0, 100, "fc00:1::3/96")
+ADD_VLAN(p1, at_ns1, 100, "fc00:1::4/96")
+
+dnl Sending ping through conntrack
+AT_DATA([flows.txt], [dnl
+priority=1,action=drop
+priority=10,in_port=1,ipv6,action=ct(commit,zone=9),2
+priority=10,in_port=2,ct_state=-trk,ipv6,action=ct(table=0,zone=9)
+priority=10,in_port=2,ct_state=+trk+est-new,ipv6,action=1
+icmp6,icmp_type=135,action=normal
+icmp6,icmp_type=136,action=normal
+])
+
+AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
+
+dnl Without this sleep, we get occasional failures due to the following error:
+dnl "connect: Cannot assign requested address"
+sleep 2;
+
+dnl Basic connectivity check.
+NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Ipv4 fragmentation connectivity check.
+NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Ipv4 larger fragmentation connectivity check.
+NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
+
+AT_SETUP([fragmentation over vxlan tunnel])
+AT_SKIP_IF([! ip link help 2>&1 | grep vxlan >/dev/null])
+CHECK_CONNTRACK()
+
+OVS_TRAFFIC_VSWITCHD_START(
+   [set-fail-mode br0 standalone --])
+ADD_BR([br-underlay], [set-fail-mode br-underlay standalone])
+ADD_NAMESPACES(at_ns0)
+
+dnl Sending ping through conntrack
+AT_DATA([flows.txt], [dnl
+priority=1,action=drop
+priority=10,arp,action=normal
+in_port=1,icmp,action=ct(commit,zone=9),LOCAL
+in_port=LOCAL,ct_state=-trk,icmp,action=ct(table=0,zone=9)
+in_port=LOCAL,ct_state=+trk+est-new,icmp,action=1
+])
+
+AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
+
+dnl Set up underlay link from host into the namespace using veth pair.
+ADD_VETH(p0, at_ns0, br-underlay, "172.31.1.1/24")
+AT_CHECK([ip addr add dev br-underlay "172.31.1.100/24"])
+AT_CHECK([ip link set dev br-underlay up])
+
+dnl Set up tunnel endpoints on OVS outside the namespace and with a native
+dnl linux device inside the namespace.
+ADD_OVS_TUNNEL([vxlan], [br0], [at_ns0], [172.31.1.1], [10.1.1.100/24])
+ADD_NATIVE_TUNNEL([vxlan], [at_vxlan1], [at_ns0], [172.31.1.100], [10.1.1.1/24],
+                  [id 0 dstport 4789])
+
+dnl First, check the underlay
+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Okay, now check the overlay with different packet sizes
+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
-- 
2.1.4




More information about the dev mailing list