[ovs-dev] [netlink v5 54/61] datapath: Convert ODP_DP_* commands to use AF_NETLINK socket layer.

Ben Pfaff blp at nicira.com
Thu Jan 27 00:23:37 UTC 2011


This commit calls genl_lock(), which wasn't exported before Linux
2.6.35, and thus doesn't support earlier kernels.  That problem will
be fixed once the whole userspace interface transitions to Generic
Netlink a few commits from now.

Signed-off-by: Ben Pfaff <blp at nicira.com>
---
 datapath/datapath.c                                |  303 +++++++++-----------
 datapath/linux-2.6/compat-2.6/genetlink.inc        |   59 ++++
 .../linux-2.6/compat-2.6/include/linux/rtnetlink.h |    2 +-
 .../linux-2.6/compat-2.6/include/net/genetlink.h   |   43 +++-
 .../linux-2.6/compat-2.6/include/net/netlink.h     |   38 +++
 include/openvswitch/datapath-protocol.h            |   69 +++--
 lib/dpif-linux.c                                   |  184 ++++++------
 7 files changed, 417 insertions(+), 281 deletions(-)

diff --git a/datapath/datapath.c b/datapath/datapath.c
index db20cf9..f42ead1 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -1156,27 +1156,38 @@ exit:
 }
 
 static const struct nla_policy datapath_policy[ODP_DP_ATTR_MAX + 1] = {
+#ifdef HAVE_NLA_NUL_STRING
 	[ODP_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
+#endif
 	[ODP_DP_ATTR_IPV4_FRAGS] = { .type = NLA_U32 },
 	[ODP_DP_ATTR_SAMPLING] = { .type = NLA_U32 },
 };
 
-/* Called with genl_mutex. */
-static int copy_datapath_to_user(void __user *dst, struct datapath *dp, uint32_t total_len)
+static struct genl_family dp_datapath_genl_family = {
+	.id = GENL_ID_GENERATE,
+	.hdrsize = sizeof(struct odp_header),
+	.name = ODP_DATAPATH_FAMILY,
+	.version = 1,
+	.maxattr = ODP_DP_ATTR_MAX
+};
+
+static struct genl_multicast_group dp_datapath_multicast_group = {
+	.name = ODP_DATAPATH_MCGROUP
+};
+
+static int odp_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
+				u32 pid, u32 seq, u32 flags, u8 cmd)
 {
-	struct odp_datapath *odp_datapath;
-	struct sk_buff *skb;
+	struct odp_header *odp_header;
 	struct nlattr *nla;
 	int err;
 
-	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
-	err = -ENOMEM;
-	if (!skb)
-		goto exit;
+	odp_header = genlmsg_put(skb, pid, seq, &dp_datapath_genl_family,
+				   flags, cmd);
+	if (!odp_header)
+		goto error;
 
-	odp_datapath = (struct odp_datapath*)__skb_put(skb, sizeof(struct odp_datapath));
-	odp_datapath->dp_idx = dp->dp_idx;
-	odp_datapath->total_len = total_len;
+	odp_header->dp_idx = dp->dp_idx;
 
 	rcu_read_lock();
 	err = nla_put_string(skb, ODP_DP_ATTR_NAME, dp_name(dp));
@@ -1203,77 +1214,49 @@ static int copy_datapath_to_user(void __user *dst, struct datapath *dp, uint32_t
 	NLA_PUT_U32(skb, ODP_PACKET_CMD_SAMPLE, packet_mc_group(dp, ODP_PACKET_CMD_SAMPLE));
 	nla_nest_end(skb, nla);
 
-	if (skb->len > total_len)
-		goto nla_put_failure;
-
-	odp_datapath->len = skb->len;
-	err = copy_to_user(dst, skb->data, skb->len) ? -EFAULT : 0;
-	goto exit_free_skb;
+	return genlmsg_end(skb, odp_header);
 
 nla_put_failure:
-	err = -EMSGSIZE;
-exit_free_skb:
-	kfree_skb(skb);
-exit:
-	return err;
+	genlmsg_cancel(skb, odp_header);
+error:
+	return -EMSGSIZE;
 }
 
-/* Called with genl_mutex. */
-static struct sk_buff *copy_datapath_from_user(struct odp_datapath __user *uodp_datapath, struct nlattr *a[ODP_DP_ATTR_MAX + 1])
+static struct sk_buff *odp_dp_cmd_build_info(struct datapath *dp, u32 pid,
+					     u32 seq, u8 cmd)
 {
-	struct odp_datapath *odp_datapath;
 	struct sk_buff *skb;
-	u32 len;
-	int err;
-
-	if (get_user(len, &uodp_datapath->len))
-		return ERR_PTR(-EFAULT);
-	if (len < sizeof(struct odp_datapath))
-		return ERR_PTR(-EINVAL);
+	int retval;
 
-	skb = alloc_skb(len, GFP_KERNEL);
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!skb)
 		return ERR_PTR(-ENOMEM);
 
-	err = -EFAULT;
-	if (copy_from_user(__skb_put(skb, len), uodp_datapath, len))
-		goto error_free_skb;
-
-	odp_datapath = (struct odp_datapath *)skb->data;
-	err = -EINVAL;
-	if (odp_datapath->len != len)
-		goto error_free_skb;
-
-	err = nla_parse(a, ODP_DP_ATTR_MAX,
-			(struct nlattr *)(skb->data + sizeof(struct odp_datapath)),
-			skb->len - sizeof(struct odp_datapath), datapath_policy);
-	if (err)
-		goto error_free_skb;
+	retval = odp_dp_cmd_fill_info(dp, skb, pid, seq, 0, cmd);
+	if (retval < 0) {
+		kfree_skb(skb);
+		return ERR_PTR(retval);
+	}
+	return skb;
+}
 
+static int odp_dp_cmd_validate(struct nlattr *a[ODP_DP_ATTR_MAX + 1])
+{
 	if (a[ODP_DP_ATTR_IPV4_FRAGS]) {
 		u32 frags = nla_get_u32(a[ODP_DP_ATTR_IPV4_FRAGS]);
 
-		err = -EINVAL;
 		if (frags != ODP_DP_FRAG_ZERO && frags != ODP_DP_FRAG_DROP)
-			goto error_free_skb;
+			return -EINVAL;
 	}
 
-	err = VERIFY_NUL_STRING(a[ODP_DP_ATTR_NAME], IFNAMSIZ - 1);
-	if (err)
-		goto error_free_skb;
-
-	return skb;
-
-error_free_skb:
-	kfree_skb(skb);
-	return ERR_PTR(err);
+	return VERIFY_NUL_STRING(a[ODP_DP_ATTR_NAME], IFNAMSIZ - 1);
 }
 
 /* Called with genl_mutex and optionally with RTNL lock also. */
-static struct datapath *lookup_datapath(struct odp_datapath *odp_datapath, struct nlattr *a[ODP_DP_ATTR_MAX + 1])
+static struct datapath *lookup_datapath(struct odp_header *odp_header, struct nlattr *a[ODP_DP_ATTR_MAX + 1])
 {
 	if (!a[ODP_DP_ATTR_NAME]) {
-		struct datapath *dp = get_dp(odp_datapath->dp_idx);
+		struct datapath *dp = get_dp(odp_header->dp_idx);
 		if (!dp)
 			return ERR_PTR(-ENODEV);
 		return dp;
@@ -1301,33 +1284,31 @@ static void change_datapath(struct datapath *dp, struct nlattr *a[ODP_DP_ATTR_MA
 		dp->sflow_probability = nla_get_u32(a[ODP_DP_ATTR_SAMPLING]);
 }
 
-static int new_datapath(struct odp_datapath __user *uodp_datapath)
+static int odp_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
 {
-	struct nlattr *a[ODP_DP_ATTR_MAX + 1];
-	struct odp_datapath *odp_datapath;
+	struct nlattr **a = info->attrs;
+	struct odp_header *odp_header = info->userhdr;
 	struct vport_parms parms;
-	struct sk_buff *skb;
+	struct sk_buff *reply;
 	struct datapath *dp;
 	struct vport *vport;
 	int dp_idx;
 	int err;
 
-	skb = copy_datapath_from_user(uodp_datapath, a);
-	err = PTR_ERR(skb);
-	if (IS_ERR(skb))
-		goto err;
-	odp_datapath = (struct odp_datapath *)skb->data;
-
 	err = -EINVAL;
 	if (!a[ODP_DP_ATTR_NAME])
-		goto err_free_skb;
+		goto err;
+
+	err = odp_dp_cmd_validate(a);
+	if (err)
+		goto err;
 
 	rtnl_lock();
 	err = -ENODEV;
 	if (!try_module_get(THIS_MODULE))
 		goto err_unlock_rtnl;
 
-	dp_idx = odp_datapath->dp_idx;
+	dp_idx = odp_header->dp_idx;
 	if (dp_idx < 0) {
 		err = -EFBIG;
 		for (dp_idx = 0; dp_idx < ARRAY_SIZE(dps); dp_idx++) {
@@ -1385,11 +1366,18 @@ static int new_datapath(struct odp_datapath __user *uodp_datapath)
 
 	change_datapath(dp, a);
 
+	reply = odp_dp_cmd_build_info(dp, info->snd_pid, info->snd_seq, ODP_DP_CMD_NEW);
+	err = PTR_ERR(reply);
+	if (IS_ERR(reply))
+		goto err_destroy_local_port;
+
 	rcu_assign_pointer(dps[dp_idx], dp);
 	dp_sysfs_add_dp(dp);
 
 	rtnl_unlock();
 
+	genl_notify(reply, genl_info_net(info), info->snd_pid,
+		    dp_datapath_multicast_group.id, info->nlhdr, GFP_KERNEL);
 	return 0;
 
 err_destroy_local_port:
@@ -1402,30 +1390,31 @@ err_put_module:
 	module_put(THIS_MODULE);
 err_unlock_rtnl:
 	rtnl_unlock();
-err_free_skb:
-	kfree_skb(skb);
 err:
 	return err;
 }
 
-static int del_datapath(struct odp_datapath __user *uodp_datapath)
+static int odp_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
 {
-	struct nlattr *a[ODP_DP_ATTR_MAX + 1];
 	struct vport *vport, *next_vport;
+	struct sk_buff *reply;
 	struct datapath *dp;
-	struct sk_buff *skb;
 	int err;
 
-	skb = copy_datapath_from_user(uodp_datapath, a);
-	err = PTR_ERR(skb);
-	if (IS_ERR(skb))
+	err = odp_dp_cmd_validate(info->attrs);
+	if (err)
 		goto exit;
 
 	rtnl_lock();
-	dp = lookup_datapath((struct odp_datapath *)skb->data, a);
+	dp = lookup_datapath(info->userhdr, info->attrs);
 	err = PTR_ERR(dp);
 	if (IS_ERR(dp))
-		goto exit_free;
+		goto exit_unlock;
+
+	reply = odp_dp_cmd_build_info(dp, info->snd_pid, info->snd_seq, ODP_DP_CMD_DEL);
+	err = PTR_ERR(reply);
+	if (IS_ERR(reply))
+		goto exit_unlock;
 
 	list_for_each_entry_safe (vport, next_vport, &dp->port_list, node)
 		if (vport->port_no != ODPP_LOCAL)
@@ -1438,96 +1427,108 @@ static int del_datapath(struct odp_datapath __user *uodp_datapath)
 	call_rcu(&dp->rcu, destroy_dp_rcu);
 	module_put(THIS_MODULE);
 
+	genl_notify(reply, genl_info_net(info), info->snd_pid,
+		    dp_datapath_multicast_group.id, info->nlhdr, GFP_KERNEL);
 	err = 0;
 
-exit_free:
-	kfree_skb(skb);
+exit_unlock:
 	rtnl_unlock();
 exit:
 	return err;
 }
 
-static int set_datapath(struct odp_datapath __user *uodp_datapath)
+static int odp_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
 {
-	struct nlattr *a[ODP_DP_ATTR_MAX + 1];
+	struct sk_buff *reply;
 	struct datapath *dp;
-	struct sk_buff *skb;
 	int err;
 
-	skb = copy_datapath_from_user(uodp_datapath, a);
-	err = PTR_ERR(skb);
-	if (IS_ERR(skb))
-		goto exit;
+	err = odp_dp_cmd_validate(info->attrs);
+	if (err)
+		return err;
 
-	dp = lookup_datapath((struct odp_datapath *)skb->data, a);
-	err = PTR_ERR(dp);
+	dp = lookup_datapath(info->userhdr, info->attrs);
 	if (IS_ERR(dp))
-		goto exit_free;
+		return PTR_ERR(dp);
 
-	change_datapath(dp, a);
-	err = 0;
+	change_datapath(dp, info->attrs);
 
-exit_free:
-	kfree_skb(skb);
-exit:
-	return err;
+	reply = odp_dp_cmd_build_info(dp, info->snd_pid, info->snd_seq, ODP_DP_CMD_NEW);
+	if (IS_ERR(reply)) {
+		err = PTR_ERR(reply);
+		netlink_set_err(INIT_NET_GENL_SOCK, 0,
+				dp_datapath_multicast_group.id, err);
+		return 0;
+	}
+
+	genl_notify(reply, genl_info_net(info), info->snd_pid,
+		    dp_datapath_multicast_group.id, info->nlhdr, GFP_KERNEL);
+	return 0;
 }
 
-static int get_datapath(struct odp_datapath __user *uodp_datapath)
+static int odp_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
 {
-	struct nlattr *a[ODP_DP_ATTR_MAX + 1];
-	struct odp_datapath *odp_datapath;
+	struct sk_buff *reply;
 	struct datapath *dp;
-	struct sk_buff *skb;
 	int err;
 
-	skb = copy_datapath_from_user(uodp_datapath, a);
-	err = PTR_ERR(skb);
-	if (IS_ERR(skb))
-		goto exit;
-	odp_datapath = (struct odp_datapath *)skb->data;
-
-	dp = lookup_datapath(odp_datapath, a);
+	err = odp_dp_cmd_validate(info->attrs);
+	if (err)
+		return err;
 
-	err = PTR_ERR(dp);
+	dp = lookup_datapath(info->userhdr, info->attrs);
 	if (IS_ERR(dp))
-		goto exit_free;
+		return PTR_ERR(dp);
 
-	err = copy_datapath_to_user(uodp_datapath, dp, odp_datapath->total_len);
-exit_free:
-	kfree_skb(skb);
-exit:
-	return err;
+	reply = odp_dp_cmd_build_info(dp, info->snd_pid, info->snd_seq, ODP_DP_CMD_NEW);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	return genlmsg_reply(reply, info);
 }
 
-static int dump_datapath(struct odp_datapath __user *uodp_datapath)
+static int odp_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct nlattr *a[ODP_DP_ATTR_MAX + 1];
-	struct odp_datapath *odp_datapath;
-	struct sk_buff *skb;
 	u32 dp_idx;
-	int err;
 
-	skb = copy_datapath_from_user(uodp_datapath, a);
-	err = PTR_ERR(skb);
-	if (IS_ERR(skb))
-		goto exit;
-	odp_datapath = (struct odp_datapath *)skb->data;
-
-	err = -ENODEV;
-	for (dp_idx = odp_datapath->dp_idx; dp_idx < ARRAY_SIZE(dps); dp_idx++) {
+	for (dp_idx = cb->args[0]; dp_idx < ARRAY_SIZE(dps); dp_idx++) {
 		struct datapath *dp = get_dp(dp_idx);
 		if (!dp)
 			continue;
-
-		err = copy_datapath_to_user(uodp_datapath, dp, odp_datapath->total_len);
-		break;
+		if (odp_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).pid,
+					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
+					 ODP_DP_CMD_NEW) < 0)
+			break;
 	}
-	kfree_skb(skb);
-exit:
-	return err;
+
+	cb->args[0] = dp_idx;
+	return skb->len;
 }
 
+static struct genl_ops dp_datapath_genl_ops[] = {
+	{ .cmd = ODP_DP_CMD_NEW,
+	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .policy = datapath_policy,
+	  .doit = odp_dp_cmd_new
+	},
+	{ .cmd = ODP_DP_CMD_DEL,
+	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .policy = datapath_policy,
+	  .doit = odp_dp_cmd_del
+	},
+	{ .cmd = ODP_DP_CMD_GET,
+	  .flags = 0,		    /* OK for unprivileged users. */
+	  .policy = datapath_policy,
+	  .doit = odp_dp_cmd_get,
+	  .dumpit = odp_dp_cmd_dump
+	},
+	{ .cmd = ODP_DP_CMD_SET,
+	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .policy = datapath_policy,
+	  .doit = odp_dp_cmd_set,
+	},
+};
+
 static const struct nla_policy vport_policy[ODP_VPORT_ATTR_MAX + 1] = {
 	[ODP_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
 	[ODP_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
@@ -1926,26 +1927,6 @@ static long openvswitch_ioctl(struct file *f, unsigned int cmd,
 
 	genl_lock();
 	switch (cmd) {
-	case ODP_DP_NEW:
-		err = new_datapath((struct odp_datapath __user *)argp);
-		goto exit;
-
-	case ODP_DP_GET:
-		err = get_datapath((struct odp_datapath __user *)argp);
-		goto exit;
-
-	case ODP_DP_DEL:
-		err = del_datapath((struct odp_datapath __user *)argp);
-		goto exit;
-
-	case ODP_DP_SET:
-		err = set_datapath((struct odp_datapath __user *)argp);
-		goto exit;
-
-	case ODP_DP_DUMP:
-		err = dump_datapath((struct odp_datapath __user *)argp);
-		goto exit;
-
 	case ODP_VPORT_NEW:
 		err = attach_vport((struct odp_vport __user *)argp);
 		goto exit;
@@ -2001,11 +1982,6 @@ static long openvswitch_compat_ioctl(struct file *f, unsigned int cmd, unsigned
 		/* Ioctls that don't need any translation at all. */
 		return openvswitch_ioctl(f, cmd, argp);
 
-	case ODP_DP_NEW:
-	case ODP_DP_GET:
-	case ODP_DP_DEL:
-	case ODP_DP_SET:
-	case ODP_DP_DUMP:
 	case ODP_VPORT_NEW:
 	case ODP_VPORT_DEL:
 	case ODP_VPORT_GET:
@@ -2043,6 +2019,9 @@ struct genl_family_and_ops {
 };
 
 static const struct genl_family_and_ops dp_genl_families[] = {
+	{ &dp_datapath_genl_family,
+	  dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops),
+	  &dp_datapath_multicast_group },
 	{ &dp_packet_genl_family,
 	  dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops),
 	  NULL },
diff --git a/datapath/linux-2.6/compat-2.6/genetlink.inc b/datapath/linux-2.6/compat-2.6/genetlink.inc
index 63760bb..080d3c9 100644
--- a/datapath/linux-2.6/compat-2.6/genetlink.inc
+++ b/datapath/linux-2.6/compat-2.6/genetlink.inc
@@ -71,3 +71,62 @@ err_out:
 	return err;
 }
 #endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+/**
+ * nlmsg_notify - send a notification netlink message
+ * @sk: netlink socket to use
+ * @skb: notification message
+ * @pid: destination netlink pid for reports or 0
+ * @group: destination multicast group or 0
+ * @report: 1 to report back, 0 to disable
+ * @flags: allocation flags
+ */
+int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid,
+		 unsigned int group, int report, gfp_t flags)
+{
+	int err = 0;
+
+	if (group) {
+		int exclude_pid = 0;
+
+		if (report) {
+			atomic_inc(&skb->users);
+			exclude_pid = pid;
+		}
+
+		/* errors reported via destination sk->sk_err, but propagate
+		 * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
+		err = nlmsg_multicast(sk, skb, exclude_pid, group, flags);
+	}
+
+	if (report) {
+		int err2;
+
+		err2 = nlmsg_unicast(sk, skb, pid);
+		if (!err || err == -ESRCH)
+			err = err2;
+	}
+
+	return err;
+}
+#endif
+
+/* This is analogous to rtnl_notify() but uses genl_sock instead of rtnl.
+ *
+ * This is not (yet) in any upstream kernel. */
+void genl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
+		 struct nlmsghdr *nlh, gfp_t flags)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32)
+	struct sock *sk = net->genl_sock;
+#else
+	struct sock *sk = genl_sock;
+#endif
+	int report = 0;
+
+	if (nlh)
+		report = nlmsg_report(nlh);
+
+	nlmsg_notify(sk, skb, pid, group, report, flags);
+}
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/rtnetlink.h b/datapath/linux-2.6/compat-2.6/include/linux/rtnetlink.h
index 59be83f..0a02149 100644
--- a/datapath/linux-2.6/compat-2.6/include/linux/rtnetlink.h
+++ b/datapath/linux-2.6/compat-2.6/include/linux/rtnetlink.h
@@ -12,7 +12,7 @@ static inline void rtnl_notify(struct sk_buff *skb, u32 pid, u32 group,
 	BUG_ON(nlh != NULL);		/* not implemented */
 	if (group) {
 		/* errors reported via destination sk->sk_err */
-		nlmsg_multicast(rtnl, skb, 0, group);
+		nlmsg_multicast(rtnl, skb, 0, group, flags);
 	}
 }
 
diff --git a/datapath/linux-2.6/compat-2.6/include/net/genetlink.h b/datapath/linux-2.6/compat-2.6/include/net/genetlink.h
index f5bff63..5c71c45 100644
--- a/datapath/linux-2.6/compat-2.6/include/net/genetlink.h
+++ b/datapath/linux-2.6/compat-2.6/include/net/genetlink.h
@@ -1,12 +1,44 @@
 #ifndef __NET_GENERIC_NETLINK_WRAPPER_H
 #define __NET_GENERIC_NETLINK_WRAPPER_H 1
 
-
+#include <linux/version.h>
 #include <linux/netlink.h>
+
+/* Very special super-nasty workaround here:
+ *
+ * Before 2.6.19, nlmsg_multicast() lacked a 'flags' parameter.  We work
+ * around that in our <net/netlink.h> replacement, so that nlmsg_multicast
+ * is a macro that expands to rpl_nlmsg_multicast, which in turn has the
+ * 'flags' parameter.
+ *
+ * However, also before 2.6.19, <net/genetlink.h> contains an inline definition
+ * of genlmsg_multicast() that, of course, calls it without the 'flags'
+ * parameter.  This causes a build failure.
+ *
+ * This works around the problem by temporarily renaming both nlmsg_multicast
+ * and genlmsg_multicast with a "busted_" prefix.  (Nothing actually defines
+ * busted_nlmsg_multicast(), so if anything actually tries to call it, then
+ * we'll get a link error.)
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+#undef nlmsg_multicast
+#define nlmsg_multicast busted_nlmsg_multicast
+#define genlmsg_multicast busted_genlmsg_multicast
+extern int busted_nlmsg_multicast(struct sock *sk, struct sk_buff *skb,
+				  u32 pid, unsigned int group);
+#endif	/* linux kernel < v2.6.19 */
+
 #include_next <net/genetlink.h>
+
+/* Drop the "busted_" prefix described above. */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+#undef nlmsg_multicast
+#undef genlmsg_multicast
+#define nlmsg_multicast rpl_nlmsg_multicast
+#endif	/* linux kernel < v2.6.19 */
+
 #include <net/net_namespace.h>
 
-#include <linux/version.h>
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
 
 #include <linux/genetlink.h>
@@ -120,4 +152,11 @@ int genl_register_family_with_ops(struct genl_family *family,
 	struct genl_ops *ops, size_t n_ops);
 #endif
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+#define genl_notify(skb, net, pid, group, nlh, flags) \
+	genl_notify(skb, pid, group, nlh, flags)
+#endif
+extern void genl_notify(struct sk_buff *skb, struct net *net, u32 pid,
+			u32 group, struct nlmsghdr *nlh, gfp_t flags);
+
 #endif /* genetlink.h */
diff --git a/datapath/linux-2.6/compat-2.6/include/net/netlink.h b/datapath/linux-2.6/compat-2.6/include/net/netlink.h
index 52238d8..f4fb843 100644
--- a/datapath/linux-2.6/compat-2.6/include/net/netlink.h
+++ b/datapath/linux-2.6/compat-2.6/include/net/netlink.h
@@ -128,4 +128,42 @@ static inline struct nlattr *nla_find_nested(struct nlattr *nla, int attrtype)
 }
 #endif
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+/**
+ * nlmsg_report - need to report back to application?
+ * @nlh: netlink message header
+ *
+ * Returns 1 if a report back to the application is requested.
+ */
+static inline int nlmsg_report(const struct nlmsghdr *nlh)
+{
+	return !!(nlh->nlmsg_flags & NLM_F_ECHO);
+}
+
+extern int		nlmsg_notify(struct sock *sk, struct sk_buff *skb,
+				     u32 pid, unsigned int group, int report,
+				     gfp_t flags);
+#endif	/* linux kernel < 2.6.19 */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+/* Before 2.6.19 the 'flags' parameter was missing, so replace it.  We have to
+ * #include <net/genetlink.h> first because the 2.6.18 version of that header
+ * has an inline call to nlmsg_multicast() without, of course, any 'flags'
+ * argument. */
+#define nlmsg_multicast rpl_nlmsg_multicast
+static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb,
+				  u32 pid, unsigned int group, gfp_t flags)
+{
+	int err;
+
+	NETLINK_CB(skb).dst_group = group;
+
+	err = netlink_broadcast(sk, skb, pid, group, flags);
+	if (err > 0)
+		err = 0;
+
+	return err;
+}
+#endif	/* linux kernel < 2.6.19 */
+
 #endif /* net/netlink.h */
diff --git a/include/openvswitch/datapath-protocol.h b/include/openvswitch/datapath-protocol.h
index e903f08..39a3365 100644
--- a/include/openvswitch/datapath-protocol.h
+++ b/include/openvswitch/datapath-protocol.h
@@ -70,12 +70,6 @@
 #include <linux/if_link.h>
 #include <linux/netlink.h>
 
-#define ODP_DP_NEW              _IOWR('O', 0, struct odp_datapath)
-#define ODP_DP_DEL              _IOR('O', 1, struct odp_datapath)
-#define ODP_DP_GET              _IOWR('O', 2, struct odp_datapath)
-#define ODP_DP_SET		_IOWR('O', 3, struct odp_datapath)
-#define ODP_DP_DUMP		_IOWR('O', 4, struct odp_datapath)
-
 #define ODP_VPORT_NEW           _IOR('O', 7, struct odp_vport)
 #define ODP_VPORT_DEL           _IOR('O', 8, struct odp_vport)
 #define ODP_VPORT_GET           _IOWR('O', 9, struct odp_vport)
@@ -88,6 +82,19 @@
 #define ODP_FLOW_SET            _IOWR('O', 16, struct odp_flow)
 #define ODP_FLOW_DUMP           _IOWR('O', 17, struct odp_flow)
 #define ODP_FLOW_FLUSH          _IO('O', 19)
+
+/* Datapaths. */
+
+#define ODP_DATAPATH_FAMILY  "odp_datapath"
+#define ODP_DATAPATH_MCGROUP "odp_datapath"
+
+enum odp_datapath_cmd {
+	ODP_DP_CMD_UNSPEC,
+	ODP_DP_CMD_NEW,
+	ODP_DP_CMD_DEL,
+	ODP_DP_CMD_GET,
+	ODP_DP_CMD_SET
+};
 
 /**
  * struct odp_header - header for ODP Generic Netlink messages.
@@ -101,22 +108,30 @@ struct odp_header {
 };
 
 /**
- * struct odp_datapath - header with basic information about a datapath.
- * @dp_idx: Datapath index (-1 to make a request not specific to a datapath).
- * @len: Length of this structure plus the Netlink attributes following it.
- * @total_len: Total space available for kernel reply to request.
+ * enum odp_datapath_attr - attributes for %ODP_DP_* commands.
+ * @ODP_DP_ATTR_NAME: Name of the network device that serves as the "local
+ * port".  This is the name of the network device whose dp_idx is given in the
+ * &struct odp_header.  Always present in notifications.  Required in
+ * %ODP_DP_CMD_NEW requests.  May be used as an alternative to specifying
+ * dp_idx on other requests (with a dp_idx of %UINT32_MAX).
+ * @ODP_DP_ATTR_STATS: Statistics about packets that have passed through the
+ * datapath.  Always present in notifications.
+ * @ODP_DP_ATTR_IPV4_FRAGS: One of %ODP_DP_FRAG_*.  Always present in
+ * notifications.  May be included in %ODP_DP_CMD_NEW or %ODP_DP_CMD_SET
+ * requests to change the fragment handling policy.
+ * @ODP_DP_ATTR_SAMPLING: 32-bit fraction of packets to sample with
+ * %ODP_PACKET_CMD_SAMPLE.  A value of 0 samples no packets, a value of
+ * %UINT32_MAX samples all packets, and intermediate values sample intermediate
+ * fractions of packets.
+ * @ODP_DP_ATTR_MCGROUPS: Nested attributes with multicast groups.  Each nested
+ * attribute has a %ODP_PACKET_CMD_* type with a 32-bit value giving the
+ * Generic Netlink multicast group number used for sending this datapath's
+ * messages with that command type up to userspace.
  *
- * Followed by &struct nlattr attributes, whose types are drawn from
- * %ODP_DP_ATTR_*, up to a length of @len bytes including the &struct
- * odp_datapath header.
+ * These attributes follow the &struct odp_header within the Generic Netlink
+ * payload for %ODP_DP_* commands.
  */
-struct odp_datapath {
-	int32_t dp_idx;
-	uint32_t len;
-	uint32_t total_len;
-};
-
-enum odp_datapath_type {
+enum odp_datapath_attr {
 	ODP_DP_ATTR_UNSPEC,
 	ODP_DP_ATTR_NAME,       /* name of dp_ifidx netdev */
 	ODP_DP_ATTR_STATS,      /* struct odp_stats */
@@ -128,7 +143,13 @@ enum odp_datapath_type {
 
 #define ODP_DP_ATTR_MAX (__ODP_DP_ATTR_MAX - 1)
 
-/* Values for ODP_DP_ATTR_IPV4_FRAGS. */
+/**
+ * enum odp_frag_handling - policy for handling received IPv4 fragments.
+ * @ODP_DP_FRAG_ZERO: Treat IP fragments as having IP protocol 0 and
+ * transport ports of zero.
+ * @ODP_DP_FRAG_DROP: Drop IP fragments.  Do not pass them through the flow
+ * table or up to userspace.
+ */
 enum odp_frag_handling {
 	ODP_DP_FRAG_UNSPEC,
 	ODP_DP_FRAG_ZERO,	/* Treat IP fragments as transport port 0. */
@@ -179,12 +200,6 @@ enum odp_packet_cmd {
  *
  * These attributes follow the &struct odp_header within the Generic Netlink
  * payload for %ODP_PACKET_* commands.
- *
- * The %ODP_PACKET_ATTR_TYPE, %ODP_PACKET_ATTR_PACKET and %ODP_PACKET_ATTR_KEY
- * attributes are present for all notifications.  For %ODP_PACKET_CMD_ACTION,
- * the %ODP_PACKET_ATTR_USERDATA attribute is included if it would be nonzero.
- * For %ODP_PACKET_CMD_SAMPLE, the %ODP_PACKET_ATTR_SAMPLE_POOL and
- * %ODP_PACKET_ATTR_ACTIONS attributes are included.
  */
 enum odp_packet_attr {
 	ODP_PACKET_ATTR_UNSPEC,
diff --git a/lib/dpif-linux.c b/lib/dpif-linux.c
index e5e66ff..5fb7035 100644
--- a/lib/dpif-linux.c
+++ b/lib/dpif-linux.c
@@ -55,10 +55,10 @@
 VLOG_DEFINE_THIS_MODULE(dpif_linux);
 
 struct dpif_linux_dp {
-    /* ioctl command argument. */
-    int cmd;
+    /* Generic Netlink header. */
+    uint8_t cmd;
 
-    /* struct odp_datapath header. */
+    /* struct odp_header. */
     uint32_t dp_idx;
 
     /* Attributes. */
@@ -70,6 +70,9 @@ struct dpif_linux_dp {
 };
 
 static void dpif_linux_dp_init(struct dpif_linux_dp *);
+static int dpif_linux_dp_from_ofpbuf(struct dpif_linux_dp *,
+                                     const struct ofpbuf *);
+static void dpif_linux_dp_dump_start(struct nl_dump *);
 static int dpif_linux_dp_transact(const struct dpif_linux_dp *request,
                                   struct dpif_linux_dp *reply,
                                   struct ofpbuf **bufp);
@@ -131,6 +134,7 @@ struct dpif_linux {
 static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(9999, 5);
 
 /* Generic Netlink family numbers for ODP. */
+static int odp_datapath_family;
 static int odp_packet_family;
 
 /* Generic Netlink socket. */
@@ -156,13 +160,14 @@ dpif_linux_cast(const struct dpif *dpif)
 static int
 dpif_linux_enumerate(struct svec *all_dps)
 {
-    uint32_t dp_idx;
+    struct nl_dump dump;
+    struct ofpbuf msg;
     int major;
-    int err;
+    int error;
 
-    err = dpif_linux_init();
-    if (err) {
-        return err;
+    error = dpif_linux_init();
+    if (error) {
+        return error;
     }
 
     /* Check that the Open vSwitch module is loaded. */
@@ -171,28 +176,15 @@ dpif_linux_enumerate(struct svec *all_dps)
         return -major;
     }
 
-    dp_idx = 0;
-    for (;;) {
-        struct dpif_linux_dp request, reply;
-        struct ofpbuf *buf;
-        char devname[16];
-        int error;
+    dpif_linux_dp_dump_start(&dump);
+    while (nl_dump_next(&dump, &msg)) {
+        struct dpif_linux_dp dp;
 
-        dpif_linux_dp_init(&request);
-        request.dp_idx = dp_idx;
-        request.cmd = ODP_DP_DUMP;
-
-        error = dpif_linux_dp_transact(&request, &reply, &buf);
-        if (error) {
-            return error == ENODEV ? 0 : error;
+        if (!dpif_linux_dp_from_ofpbuf(&dp, &msg)) {
+            svec_add(all_dps, dp.name);
         }
-        ofpbuf_delete(buf);
-
-        sprintf(devname, "dp%d", reply.dp_idx);
-        svec_add(all_dps, devname);
-
-        dp_idx = reply.dp_idx + 1;
     }
+    return nl_dump_done(&dump);
 }
 
 static int
@@ -215,7 +207,7 @@ dpif_linux_open(const struct dpif_class *class OVS_UNUSED, const char *name,
 
     /* Create or look up datapath. */
     dpif_linux_dp_init(&dp_request);
-    dp_request.cmd = create ? ODP_DP_NEW : ODP_DP_GET;
+    dp_request.cmd = create ? ODP_DP_CMD_NEW : ODP_DP_CMD_GET;
     dp_request.dp_idx = minor;
     dp_request.name = minor < 0 ? name : NULL;
     error = dpif_linux_dp_transact(&dp_request, &dp, &buf);
@@ -323,7 +315,7 @@ dpif_linux_destroy(struct dpif *dpif_)
     struct dpif_linux_dp dp;
 
     dpif_linux_dp_init(&dp);
-    dp.cmd = ODP_DP_DEL;
+    dp.cmd = ODP_DP_CMD_DEL;
     dp.dp_idx = dpif->minor;
     return dpif_linux_dp_transact(&dp, NULL, NULL);
 }
@@ -365,7 +357,7 @@ dpif_linux_set_drop_frags(struct dpif *dpif_, bool drop_frags)
     struct dpif_linux_dp dp;
 
     dpif_linux_dp_init(&dp);
-    dp.cmd = ODP_DP_SET;
+    dp.cmd = ODP_DP_CMD_SET;
     dp.dp_idx = dpif->minor;
     dp.ipv4_frags = drop_frags ? ODP_DP_FRAG_DROP : ODP_DP_FRAG_ZERO;
     return dpif_linux_dp_transact(&dp, NULL, NULL);
@@ -806,7 +798,7 @@ dpif_linux_set_sflow_probability(struct dpif *dpif_, uint32_t probability)
     struct dpif_linux_dp dp;
 
     dpif_linux_dp_init(&dp);
-    dp.cmd = ODP_DP_SET;
+    dp.cmd = ODP_DP_CMD_SET;
     dp.dp_idx = dpif->minor;
     dp.sampling = &probability;
     return dpif_linux_dp_transact(&dp, NULL, NULL);
@@ -847,6 +839,7 @@ parse_odp_packet(struct ofpbuf *buf, struct dpif_upcall *upcall,
     struct nlmsghdr *nlmsg;
     struct genlmsghdr *genl;
     struct ofpbuf b;
+    int type;
 
     ofpbuf_use_const(&b, buf->data, buf->size);
 
@@ -854,18 +847,22 @@ parse_odp_packet(struct ofpbuf *buf, struct dpif_upcall *upcall,
     genl = ofpbuf_try_pull(&b, sizeof *genl);
     odp_header = ofpbuf_try_pull(&b, sizeof *odp_header);
     if (!nlmsg || !genl || !odp_header
+        || nlmsg->nlmsg_type != odp_packet_family
         || !nl_policy_parse(&b, 0, odp_packet_policy, a,
                             ARRAY_SIZE(odp_packet_policy))) {
         return EINVAL;
     }
 
-    memset(upcall, 0, sizeof *upcall);
-
-    upcall->type = (genl->cmd == ODP_PACKET_CMD_MISS ? DPIF_UC_MISS
-                    : genl->cmd == ODP_PACKET_CMD_ACTION ? DPIF_UC_ACTION
-                    : genl->cmd == ODP_PACKET_CMD_SAMPLE ? DPIF_UC_SAMPLE
-                    : -1);
+    type = (genl->cmd == ODP_PACKET_CMD_MISS ? DPIF_UC_MISS
+            : genl->cmd == ODP_PACKET_CMD_ACTION ? DPIF_UC_ACTION
+            : genl->cmd == ODP_PACKET_CMD_SAMPLE ? DPIF_UC_SAMPLE
+            : -1);
+    if (type < 0) {
+        return EINVAL;
+    }
 
+    memset(upcall, 0, sizeof *upcall);
+    upcall->type = type;
     upcall->packet = buf;
     upcall->packet->data = (void *) nl_attr_get(a[ODP_PACKET_ATTR_PACKET]);
     upcall->packet->size = nl_attr_get_size(a[ODP_PACKET_ATTR_PACKET]);
@@ -990,7 +987,12 @@ dpif_linux_init(void)
     static int error = -1;
 
     if (error < 0) {
-        error = nl_lookup_genl_family(ODP_PACKET_FAMILY, &odp_packet_family);
+        error = nl_lookup_genl_family(ODP_DATAPATH_FAMILY,
+                                      &odp_datapath_family);
+        if (!error) {
+            error = nl_lookup_genl_family(ODP_PACKET_FAMILY,
+                                          &odp_packet_family);
+        }
         if (!error) {
             error = nl_sock_create(NETLINK_GENERIC, &genl_sock);
         }
@@ -1391,9 +1393,9 @@ dpif_linux_vport_get(const char *name, struct dpif_linux_vport *reply,
     return dpif_linux_vport_transact(&request, reply, bufp);
 }
 
-/* Parses the contents of 'buf', which contains a "struct odp_datapath"
- * followed by Netlink attributes, into 'dp'.  Returns 0 if successful,
- * otherwise a positive errno value.
+/* Parses the contents of 'buf', which contains a "struct odp_header" followed
+ * by Netlink attributes, into 'dp'.  Returns 0 if successful, otherwise a
+ * positive errno value.
  *
  * 'dp' will contain pointers into 'buf', so the caller should not free 'buf'
  * while 'dp' is still in use. */
@@ -1411,18 +1413,27 @@ dpif_linux_dp_from_ofpbuf(struct dpif_linux_dp *dp, const struct ofpbuf *buf)
         [ODP_DP_ATTR_MCGROUPS] = { .type = NL_A_NESTED, .optional = true },
     };
 
-    struct odp_datapath *odp_dp;
     struct nlattr *a[ARRAY_SIZE(odp_datapath_policy)];
+    struct odp_header *odp_header;
+    struct nlmsghdr *nlmsg;
+    struct genlmsghdr *genl;
+    struct ofpbuf b;
 
     dpif_linux_dp_init(dp);
 
-    if (!nl_policy_parse(buf, sizeof *odp_dp, odp_datapath_policy,
-                         a, ARRAY_SIZE(odp_datapath_policy))) {
+    ofpbuf_use_const(&b, buf->data, buf->size);
+    nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
+    genl = ofpbuf_try_pull(&b, sizeof *genl);
+    odp_header = ofpbuf_try_pull(&b, sizeof *odp_header);
+    if (!nlmsg || !genl || !odp_header
+        || nlmsg->nlmsg_type != odp_datapath_family
+        || !nl_policy_parse(&b, 0, odp_datapath_policy, a,
+                            ARRAY_SIZE(odp_datapath_policy))) {
         return EINVAL;
     }
-    odp_dp = buf->data;
 
-    dp->dp_idx = odp_dp->dp_idx;
+    dp->cmd = genl->cmd;
+    dp->dp_idx = odp_header->dp_idx;
     dp->name = nl_attr_get_string(a[ODP_DP_ATTR_NAME]);
     if (a[ODP_DP_ATTR_STATS]) {
         /* Can't use structure assignment because Netlink doesn't ensure
@@ -1468,14 +1479,17 @@ dpif_linux_dp_from_ofpbuf(struct dpif_linux_dp *dp, const struct ofpbuf *buf)
     return 0;
 }
 
-/* Appends to 'buf' (which must initially be empty) a "struct odp_datapath"
- * followed by Netlink attributes corresponding to 'dp'. */
+/* Appends to 'buf' the Generic Netlink message described by 'dp'. */
 static void
 dpif_linux_dp_to_ofpbuf(const struct dpif_linux_dp *dp, struct ofpbuf *buf)
 {
-    struct odp_datapath *odp_dp;
+    struct odp_header *odp_header;
 
-    ofpbuf_reserve(buf, sizeof odp_dp);
+    nl_msg_put_genlmsghdr(buf, 0, odp_datapath_family,
+                          NLM_F_REQUEST | NLM_F_ECHO, dp->cmd, 1);
+
+    odp_header = ofpbuf_put_uninit(buf, sizeof *odp_header);
+    odp_header->dp_idx = dp->dp_idx;
 
     if (dp->name) {
         nl_msg_put_string(buf, ODP_DP_ATTR_NAME, dp->name);
@@ -1490,11 +1504,6 @@ dpif_linux_dp_to_ofpbuf(const struct dpif_linux_dp *dp, struct ofpbuf *buf)
     if (dp->sampling) {
         nl_msg_put_u32(buf, ODP_DP_ATTR_SAMPLING, *dp->sampling);
     }
-
-    odp_dp = ofpbuf_push_uninit(buf, sizeof *odp_dp);
-    odp_dp->dp_idx = dp->dp_idx;
-    odp_dp->len = buf->size;
-    odp_dp->total_len = (char *) ofpbuf_end(buf) - (char *) buf->data;
 }
 
 /* Clears 'dp' to "empty" values. */
@@ -1505,53 +1514,50 @@ dpif_linux_dp_init(struct dpif_linux_dp *dp)
     dp->dp_idx = -1;
 }
 
+static void
+dpif_linux_dp_dump_start(struct nl_dump *dump)
+{
+    struct dpif_linux_dp request;
+    struct ofpbuf *buf;
+
+    dpif_linux_dp_init(&request);
+    request.cmd = ODP_DP_CMD_GET;
+
+    buf = ofpbuf_new(1024);
+    dpif_linux_dp_to_ofpbuf(&request, buf);
+    nl_dump_start(dump, genl_sock, buf);
+    ofpbuf_delete(buf);
+}
+
 /* Executes 'request' in the kernel datapath.  If the command fails, returns a
  * positive errno value.  Otherwise, if 'reply' and 'bufp' are null, returns 0
  * without doing anything else.  If 'reply' and 'bufp' are nonnull, then the
- * result of the command is expected to be an odp_datapath also, which is
- * decoded and stored in '*reply' and '*bufp'.  The caller must free '*bufp'
- * when the reply is no longer needed ('reply' will contain pointers into
- * '*bufp'). */
+ * kernel's reply is expected to have the same form as the request; it is
+ * stored in '*bufp' and decoded into '*reply'.  The caller must free '*bufp'
+ * when done with 'reply', which will contain pointers into '*bufp'. */
 int
 dpif_linux_dp_transact(const struct dpif_linux_dp *request,
                        struct dpif_linux_dp *reply, struct ofpbuf **bufp)
 {
-    struct ofpbuf *buf = NULL;
+    struct ofpbuf *request_buf;
     int error;
-    int fd;
 
     assert((reply != NULL) == (bufp != NULL));
 
-    error = get_dp0_fd(&fd);
-    if (error) {
-        goto error;
-    }
+    request_buf = ofpbuf_new(1024);
+    dpif_linux_dp_to_ofpbuf(request, request_buf);
+    error = nl_sock_transact(genl_sock, request_buf, bufp);
+    ofpbuf_delete(request_buf);
 
-    buf = ofpbuf_new(1024);
-    dpif_linux_dp_to_ofpbuf(request, buf);
-
-    error = ioctl(fd, request->cmd, buf->data) ? errno : 0;
-    if (error) {
-        goto error;
-    }
-
-    if (bufp) {
-        buf->size = ((struct odp_datapath *) buf->data)->len;
-        error = dpif_linux_dp_from_ofpbuf(reply, buf);
+    if (reply) {
+        if (!error) {
+            error = dpif_linux_dp_from_ofpbuf(reply, *bufp);
+        }
         if (error) {
-            goto error;
+            dpif_linux_dp_init(reply);
+            ofpbuf_delete(*bufp);
+            *bufp = NULL;
         }
-        *bufp = buf;
-    } else {
-        ofpbuf_delete(buf);
-    }
-    return 0;
-
-error:
-    ofpbuf_delete(buf);
-    if (bufp) {
-        memset(reply, 0, sizeof *reply);
-        *bufp = NULL;
     }
     return error;
 }
@@ -1567,7 +1573,7 @@ dpif_linux_dp_get(const struct dpif *dpif_, struct dpif_linux_dp *reply,
     struct dpif_linux_dp request;
 
     dpif_linux_dp_init(&request);
-    request.cmd = ODP_DP_GET;
+    request.cmd = ODP_DP_CMD_GET;
     request.dp_idx = dpif->minor;
 
     return dpif_linux_dp_transact(&request, reply, bufp);
-- 
1.7.1





More information about the dev mailing list