[ovs-dev] [netlink v4 41/52] datapath: Convert datapath operations to use Netlink framing.

Ben Pfaff blp at nicira.com
Wed Jan 12 05:49:53 UTC 2011


Signed-off-by: Ben Pfaff <blp at nicira.com>
---
 datapath/datapath.c                     |  565 +++++++++++++++++++++----------
 datapath/vport.c                        |    1 +
 include/openvswitch/datapath-protocol.h |   47 ++-
 lib/dpif-linux.c                        |  340 +++++++++++++++----
 4 files changed, 695 insertions(+), 258 deletions(-)

diff --git a/datapath/datapath.c b/datapath/datapath.c
index ff2a6b8..bc64f89 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(dp_ioctl_hook);
  * It is safe to access the datapath and vport structures with just
  * dp_mutex.
  */
-static struct datapath __rcu *dps[ODP_MAX];
+static struct datapath __rcu *dps[256];
 static DEFINE_MUTEX(dp_mutex);
 
 static struct vport *new_vport(const struct vport_parms *);
@@ -76,7 +76,7 @@ static struct vport *new_vport(const struct vport_parms *);
 /* Must be called with rcu_read_lock or dp_mutex. */
 struct datapath *get_dp(int dp_idx)
 {
-	if (dp_idx < 0 || dp_idx >= ODP_MAX)
+	if (dp_idx < 0 || dp_idx >= ARRAY_SIZE(dps))
 		return NULL;
 	return rcu_dereference_check(dps[dp_idx], rcu_read_lock_held() ||
 					 lockdep_is_held(&dp_mutex));
@@ -208,113 +208,6 @@ static struct kobj_type dp_ktype = {
 	.release = release_dp
 };
 
-static int create_dp(int dp_idx, const char __user *devnamep)
-{
-	struct vport_parms parms;
-	char devname[IFNAMSIZ];
-	struct vport *vport;
-	struct datapath *dp;
-	int err;
-	int i;
-
-	if (devnamep) {
-		int retval = strncpy_from_user(devname, devnamep, IFNAMSIZ);
-		if (retval < 0) {
-			err = -EFAULT;
-			goto err;
-		} else if (retval >= IFNAMSIZ) {
-			err = -ENAMETOOLONG;
-			goto err;
-		}
-	} else {
-		snprintf(devname, sizeof devname, "of%d", dp_idx);
-	}
-
-	rtnl_lock();
-	mutex_lock(&dp_mutex);
-	err = -ENODEV;
-	if (!try_module_get(THIS_MODULE))
-		goto err_unlock;
-
-	/* Exit early if a datapath with that number already exists.
-	 * (We don't use -EEXIST because that's ambiguous with 'devname'
-	 * conflicting with an existing network device name.) */
-	err = -EBUSY;
-	if (get_dp(dp_idx))
-		goto err_put_module;
-
-	err = -ENOMEM;
-	dp = kzalloc(sizeof *dp, GFP_KERNEL);
-	if (dp == NULL)
-		goto err_put_module;
-	INIT_LIST_HEAD(&dp->port_list);
-	mutex_init(&dp->mutex);
-	mutex_lock(&dp->mutex);
-	dp->dp_idx = dp_idx;
-	for (i = 0; i < DP_N_QUEUES; i++)
-		skb_queue_head_init(&dp->queues[i]);
-	init_waitqueue_head(&dp->waitqueue);
-
-	/* Initialize kobject for bridge.  This will be added as
-	 * /sys/class/net/<devname>/brif later, if sysfs is enabled. */
-	dp->ifobj.kset = NULL;
-	kobject_init(&dp->ifobj, &dp_ktype);
-
-	/* Allocate table. */
-	err = -ENOMEM;
-	rcu_assign_pointer(dp->table, tbl_create(TBL_MIN_BUCKETS));
-	if (!dp->table)
-		goto err_free_dp;
-
-	/* Set up our datapath device. */
-	parms.name = devname;
-	parms.type = ODPVT_INTERNAL;
-	parms.options = NULL;
-	parms.dp = dp;
-	parms.port_no = ODPP_LOCAL;
-	vport_lock();
-	vport = new_vport(&parms);
-	vport_unlock();
-	if (IS_ERR(vport)) {
-		err = PTR_ERR(vport);
-		if (err == -EBUSY)
-			err = -EEXIST;
-
-		goto err_destroy_table;
-	}
-
-	dp->drop_frags = 0;
-	dp->stats_percpu = alloc_percpu(struct dp_stats_percpu);
-	if (!dp->stats_percpu) {
-		err = -ENOMEM;
-		goto err_destroy_local_port;
-	}
-
-	rcu_assign_pointer(dps[dp_idx], dp);
-	dp_sysfs_add_dp(dp);
-
-	mutex_unlock(&dp->mutex);
-	mutex_unlock(&dp_mutex);
-	rtnl_unlock();
-
-	return 0;
-
-err_destroy_local_port:
-	dp_detach_port(get_vport_protected(dp, ODPP_LOCAL));
-err_destroy_table:
-	tbl_destroy(get_table_protected(dp), NULL);
-err_free_dp:
-	mutex_unlock(&dp->mutex);
-	kfree(dp);
-err_put_module:
-	module_put(THIS_MODULE);
-err_unlock:
-	mutex_unlock(&dp_mutex);
-	rtnl_unlock();
-err:
-	return err;
-}
-
 static void destroy_dp_rcu(struct rcu_head *rcu)
 {
 	struct datapath *dp = container_of(rcu, struct datapath, rcu);
@@ -328,22 +221,11 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
 	kobject_put(&dp->ifobj);
 }
 
-static int destroy_dp(int dp_idx)
+/* Caller must hold RTNL, dp_mutex, and dp->mutex. */
+static void destroy_dp(struct datapath *dp)
 {
-	struct datapath *dp;
-	int err = 0;
 	struct vport *p, *n;
 
-	rtnl_lock();
-	mutex_lock(&dp_mutex);
-	dp = get_dp(dp_idx);
-	if (!dp) {
-		err = -ENODEV;
-		goto out;
-	}
-
-	mutex_lock(&dp->mutex);
-
 	list_for_each_entry_safe (p, n, &dp->port_list, node)
 		if (p->port_no != ODPP_LOCAL)
 			dp_detach_port(p);
@@ -355,11 +237,6 @@ static int destroy_dp(int dp_idx)
 	mutex_unlock(&dp->mutex);
 	call_rcu(&dp->rcu, destroy_dp_rcu);
 	module_put(THIS_MODULE);
-
-out:
-	mutex_unlock(&dp_mutex);
-	rtnl_unlock();
-	return err;
 }
 
 /* Called with RTNL lock, dp->mutex, and vport_lock. */
@@ -1200,12 +1077,11 @@ static int execute_packet(const struct odp_execute __user *executep)
 	return error;
 }
 
-static int get_dp_stats(struct datapath *dp, struct odp_stats __user *statsp)
+static void get_dp_stats(struct datapath *dp, struct odp_stats *stats)
 {
-	struct odp_stats stats;
 	int i;
 
-	stats.n_frags = stats.n_hit = stats.n_missed = stats.n_lost = 0;
+	stats->n_frags = stats->n_hit = stats->n_missed = stats->n_lost = 0;
 	for_each_possible_cpu(i) {
 		const struct dp_stats_percpu *percpu_stats;
 		struct dp_stats_percpu local_stats;
@@ -1218,12 +1094,11 @@ static int get_dp_stats(struct datapath *dp, struct odp_stats __user *statsp)
 			local_stats = *percpu_stats;
 		} while (read_seqcount_retry(&percpu_stats->seqlock, seqcount));
 
-		stats.n_frags += local_stats.n_frags;
-		stats.n_hit += local_stats.n_hit;
-		stats.n_missed += local_stats.n_missed;
-		stats.n_lost += local_stats.n_lost;
+		stats->n_frags += local_stats.n_frags;
+		stats->n_hit += local_stats.n_hit;
+		stats->n_missed += local_stats.n_missed;
+		stats->n_lost += local_stats.n_lost;
 	}
-	return copy_to_user(statsp, &stats, sizeof stats) ? -EFAULT : 0;
 }
 
 /* MTU of the dp pseudo-device: ETH_DATA_LEN or the minimum of the ports */
@@ -1277,6 +1152,365 @@ static void set_listen_mask(struct file *f, int listen_mask)
 	f->private_data = (void*)(long)listen_mask;
 }
 
+static struct nla_policy datapath_policy[ODPDT_MAX + 1] = {	/* per-attribute checks passed to nla_parse() in copy_datapath_from_user() */
+	[ODPDT_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
+	[ODPDT_IPV4_FRAGS] = { .type = NLA_U32 },
+	[ODPDT_SAMPLING] = { .type = NLA_U32 },
+};	/* ODPDT_STATS has no entry: the visible code only ever writes it into replies */
+
+static int copy_datapath_to_user(void __user *dst, struct datapath *dp, uint32_t total_len)
+{
+	struct odp_datapath *odp_datapath;
+	struct sk_buff *skb;
+	struct nlattr *nla;
+	int err;
+	/* Serializes 'dp' (header plus ODPDT_* attributes) into a scratch skb and copies it to 'dst', which holds 'total_len' bytes. */
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	err = -ENOMEM;
+	if (!skb)
+		goto exit;
+
+	rcu_read_lock();
+	odp_datapath = (struct odp_datapath*)__skb_put(skb, sizeof(struct odp_datapath));
+	odp_datapath->dp_idx = dp->dp_idx;
+	odp_datapath->total_len = total_len;
+	/* The NLA_PUT_*() macros jump to nla_put_failure when the skb has no room left. */
+	NLA_PUT_STRING(skb, ODPDT_NAME, dp_name(dp));	/* datapath attribute space (ODPDT_*), not the vport ODPPT_* one */
+
+	nla = nla_reserve(skb, ODPDT_STATS, sizeof(struct odp_stats));
+	if (!nla)
+		goto nla_put_failure;
+	get_dp_stats(dp, nla_data(nla));	/* fills the reserved attribute payload in place */
+
+	NLA_PUT_U32(skb, ODPDT_IPV4_FRAGS, dp->drop_frags ? ODP_DP_FRAG_DROP : ODP_DP_FRAG_ZERO);
+
+	if (dp->sflow_probability)
+		NLA_PUT_U32(skb, ODPDT_SAMPLING, dp->sflow_probability);
+
+	if (skb->len > total_len)	/* reply would not fit in the caller's buffer */
+		goto nla_put_failure;
+	odp_datapath->len = skb->len;
+	rcu_read_unlock();	/* 'dp' is not touched below; copy_to_user() may fault and sleep, which is not allowed under rcu_read_lock() */
+	err = copy_to_user(dst, skb->data, skb->len) ? -EFAULT : 0;
+	goto exit_free;
+
+nla_put_failure:
+	rcu_read_unlock();
+	err = -EMSGSIZE;
+exit_free:
+	kfree_skb(skb);
+exit:
+	return err;	/* 0, -ENOMEM, -EMSGSIZE or -EFAULT */
+}
+
+static struct sk_buff *copy_datapath_from_user(struct odp_datapath __user *uodp_datapath, struct nlattr *a[ODPDT_MAX + 1])
+{
+	struct odp_datapath *odp_datapath;
+	struct sk_buff *skb;
+	u32 len;
+	int err;
+	/* Copies the request at 'uodp_datapath' into a new skb, validates it and parses its attributes into 'a'; returns the skb or an ERR_PTR(). */
+	if (get_user(len, &uodp_datapath->len))	/* read the length field first to size the kernel copy */
+		return ERR_PTR(-EFAULT);
+	if (len < sizeof(struct odp_datapath))	/* must at least cover the fixed header */
+		return ERR_PTR(-EINVAL);
+
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	err = -EFAULT;
+	if (copy_from_user(__skb_put(skb, len), uodp_datapath, len))
+		goto error_free_skb;
+
+	odp_datapath = (struct odp_datapath *)skb->data;
+	err = -EINVAL;
+	if (odp_datapath->len != len)	/* the user buffer was read twice; the copied header must agree with the first read */
+		goto error_free_skb;
+
+	err = nla_parse(a, ODPDT_MAX, (struct nlattr *)(skb->data + sizeof(struct odp_datapath)),	/* attributes start right after the header */
+			skb->len - sizeof(struct odp_datapath), datapath_policy);
+	if (err)
+		goto error_free_skb;
+
+	if (a[ODPDT_IPV4_FRAGS]) {
+		u32 frags = nla_get_u32(a[ODPDT_IPV4_FRAGS]);
+
+		err = -EINVAL;
+		if (frags != ODP_DP_FRAG_ZERO && frags != ODP_DP_FRAG_DROP)	/* ODP_DP_FRAG_UNSPEC and unknown values are rejected */
+			goto error_free_skb;
+	}
+
+	err = VERIFY_NUL_STRING(a[ODPDT_NAME]);	/* macro defined outside this hunk; presumably rejects a name lacking a terminating NUL - TODO confirm */
+	if (err)
+		goto error_free_skb;
+
+	return skb;	/* caller must kfree_skb() it; attribute data read via a[] (e.g. nla_data()) lives inside it */
+
+error_free_skb:
+	kfree_skb(skb);
+	return ERR_PTR(err);
+}
+
+static struct datapath *lookup_datapath(struct odp_datapath *odp_datapath, struct nlattr *a[ODPDT_MAX + 1])
+{
+	struct datapath *dp;
+	int dp_idx = odp_datapath->dp_idx;
+
+	/* Finds the datapath a request refers to; the caller must hold dp_mutex. */
+	WARN_ON_ONCE(!mutex_is_locked(&dp_mutex));
+
+	/*
+	 * A name, when given, takes precedence over dp_idx.  Only the name of
+	 * a datapath's local port (ODPP_LOCAL) identifies the datapath.
+	 */
+	if (a[ODPDT_NAME]) {
+		struct vport *vport;
+
+		vport_lock();
+		vport = vport_locate(nla_data(a[ODPDT_NAME]));
+		dp_idx = vport && vport->port_no == ODPP_LOCAL ? vport->dp->dp_idx : -1;
+		vport_unlock();
+	}
+
+	/* get_dp() yields NULL for a negative or out-of-range index, so one check covers both lookups. */
+	dp = get_dp(dp_idx);
+	if (!dp)
+		return ERR_PTR(-ENODEV);
+
+	/* On success the datapath is returned with dp->mutex held; the caller releases it. */
+	mutex_lock(&dp->mutex);
+	return dp;
+}
+
+static void change_datapath(struct datapath *dp, struct nlattr *a[ODPDT_MAX + 1])
+{	/* Applies the settable attributes present in 'a' to 'dp'; absent attributes leave 'dp' untouched. */
+	if (a[ODPDT_IPV4_FRAGS])	/* value already restricted to FRAG_ZERO/FRAG_DROP by copy_datapath_from_user() */
+		dp->drop_frags = nla_get_u32(a[ODPDT_IPV4_FRAGS]) == ODP_DP_FRAG_DROP;
+	if (a[ODPDT_SAMPLING])	/* stored as given; no range check is applied here */
+		dp->sflow_probability = nla_get_u32(a[ODPDT_SAMPLING]);
+}
+
+static int new_datapath(struct odp_datapath __user *uodp_datapath)
+{
+	struct nlattr *a[ODPDT_MAX + 1];
+	struct odp_datapath *odp_datapath;
+	struct vport_parms parms;
+	struct sk_buff *skb;
+	struct datapath *dp;
+	struct vport *vport;
+	int dp_idx;
+	int err;
+	int i;
+	/* Handles ODP_DP_NEW: creates the datapath described by the request; returns 0 or a negative errno. */
+	skb = copy_datapath_from_user(uodp_datapath, a);
+	err = PTR_ERR(skb);
+	if (IS_ERR(skb))
+		goto err;
+	odp_datapath = (struct odp_datapath *)skb->data;
+
+	err = -EINVAL;
+	if (!a[ODPDT_NAME])	/* the name is mandatory: it is given to the datapath's local port below */
+		goto err_free_skb;
+
+	rtnl_lock();
+	mutex_lock(&dp_mutex);
+	err = -ENODEV;
+	if (!try_module_get(THIS_MODULE))
+		goto err_unlock_dp_mutex;
+
+	dp_idx = odp_datapath->dp_idx;
+	if (dp_idx < 0) {	/* negative index: pick the first free slot */
+		err = -EFBIG;	/* stays set when every slot is in use */
+		for (dp_idx = 0; dp_idx < ARRAY_SIZE(dps); dp_idx++) {
+			if (get_dp(dp_idx))
+				continue;
+			err = 0;
+			break;
+		}
+	} else if (dp_idx < ARRAY_SIZE(dps))
+		err = get_dp(dp_idx) ? -EBUSY : 0;	/* explicit index: the slot must be free */
+	else
+		err = -EINVAL;
+	if (err)
+		goto err_put_module;
+
+	err = -ENOMEM;
+	dp = kzalloc(sizeof *dp, GFP_KERNEL);
+	if (dp == NULL)
+		goto err_put_module;
+	INIT_LIST_HEAD(&dp->port_list);
+	mutex_init(&dp->mutex);
+	mutex_lock(&dp->mutex);
+	dp->dp_idx = dp_idx;
+	for (i = 0; i < DP_N_QUEUES; i++)
+		skb_queue_head_init(&dp->queues[i]);
+	init_waitqueue_head(&dp->waitqueue);
+
+	/* Initialize kobject for bridge.  This will be added as
+	 * /sys/class/net/<devname>/brif later, if sysfs is enabled. */
+	dp->ifobj.kset = NULL;
+	kobject_init(&dp->ifobj, &dp_ktype);
+
+	/* Allocate table. */
+	err = -ENOMEM;
+	rcu_assign_pointer(dp->table, tbl_create(TBL_MIN_BUCKETS));
+	if (!dp->table)
+		goto err_free_dp;
+
+	/* Set up our datapath device. */
+	parms.name = nla_data(a[ODPDT_NAME]);	/* points into 'skb', which therefore must outlive new_vport() */
+	parms.type = ODPVT_INTERNAL;
+	parms.options = NULL;
+	parms.dp = dp;
+	parms.port_no = ODPP_LOCAL;
+	vport_lock();
+	vport = new_vport(&parms);
+	vport_unlock();
+	if (IS_ERR(vport)) {
+		err = PTR_ERR(vport);
+		if (err == -EBUSY)	/* report a device-name clash as -EEXIST; -EBUSY means "index in use" above */
+			err = -EEXIST;
+
+		goto err_destroy_table;
+	}
+
+	dp->drop_frags = 0;
+	dp->stats_percpu = alloc_percpu(struct dp_stats_percpu);
+	if (!dp->stats_percpu) {
+		err = -ENOMEM;
+		goto err_destroy_local_port;
+	}
+
+	change_datapath(dp, a);	/* apply optional ODPDT_IPV4_FRAGS / ODPDT_SAMPLING from the request */
+
+	rcu_assign_pointer(dps[dp_idx], dp);	/* publish: from here on get_dp(dp_idx) finds the datapath */
+	dp_sysfs_add_dp(dp);
+
+	mutex_unlock(&dp->mutex);
+	mutex_unlock(&dp_mutex);
+	rtnl_unlock();
+	kfree_skb(skb);	/* the request copy must be released on success too, not only on the error paths */
+	return 0;
+
+err_destroy_local_port:
+	dp_detach_port(get_vport_protected(dp, ODPP_LOCAL));
+err_destroy_table:
+	tbl_destroy(get_table_protected(dp), NULL);
+err_free_dp:
+	mutex_unlock(&dp->mutex);
+	kfree(dp);
+err_put_module:
+	module_put(THIS_MODULE);
+err_unlock_dp_mutex:
+	mutex_unlock(&dp_mutex);
+	rtnl_unlock();
+err_free_skb:
+	kfree_skb(skb);
+err:
+	return err;
+}
+
+static int modify_datapath(unsigned int cmd, struct odp_datapath __user *uodp_datapath)
+{
+	struct nlattr *a[ODPDT_MAX + 1];
+	struct datapath *dp;
+	struct sk_buff *skb;
+	int err;
+	/* Handles ODP_DP_DEL (destroys the datapath) and ODP_DP_SET (applies the request's attributes); returns 0 or a negative errno. */
+	skb = copy_datapath_from_user(uodp_datapath, a);
+	err = PTR_ERR(skb);
+	if (IS_ERR(skb))
+		goto exit;
+
+	rtnl_lock();
+	mutex_lock(&dp_mutex);
+	dp = lookup_datapath((struct odp_datapath *)skb->data, a);	/* on success returns with dp->mutex held */
+	err = PTR_ERR(dp);
+	if (IS_ERR(dp))
+		goto exit_free;
+
+	if (cmd == ODP_DP_DEL)
+		destroy_dp(dp);	/* destroy_dp() releases dp->mutex itself */
+	else {
+		change_datapath(dp, a);
+		mutex_unlock(&dp->mutex);
+	}
+	err = 0;
+
+exit_free:
+	kfree_skb(skb);
+	mutex_unlock(&dp_mutex);
+	rtnl_unlock();
+exit:
+	return err;
+}
+
+static int get_datapath(struct odp_datapath __user *uodp_datapath)
+{
+	struct nlattr *a[ODPDT_MAX + 1];
+	struct odp_datapath *odp_datapath;
+	struct datapath *dp;
+	struct sk_buff *skb;
+	int err;
+	/* Handles ODP_DP_GET: looks up the datapath named by the request and writes its description back into the same user buffer. */
+	skb = copy_datapath_from_user(uodp_datapath, a);
+	err = PTR_ERR(skb);
+	if (IS_ERR(skb))
+		goto exit;
+	odp_datapath = (struct odp_datapath *)skb->data;
+
+	mutex_lock(&dp_mutex);
+	dp = lookup_datapath(odp_datapath, a);
+	mutex_unlock(&dp_mutex);	/* on success dp->mutex, taken by lookup_datapath(), stays held past this point */
+
+	err = PTR_ERR(dp);
+	if (IS_ERR(dp))
+		goto exit_free;
+
+	err = copy_datapath_to_user(uodp_datapath, dp, odp_datapath->total_len);	/* reply is limited to the total_len the request declared */
+	mutex_unlock(&dp->mutex);
+exit_free:
+	kfree_skb(skb);
+exit:
+	return err;
+}
+
+static int dump_datapath(struct odp_datapath __user *uodp_datapath)
+{
+	struct nlattr *a[ODPDT_MAX + 1];
+	struct odp_datapath *odp_datapath;
+	struct sk_buff *skb;
+	u32 dp_idx;
+	int err;
+	/* Handles ODP_DP_DUMP: replies with the first existing datapath whose index is >= the request's dp_idx, or -ENODEV if there is none. */
+	skb = copy_datapath_from_user(uodp_datapath, a);
+	err = PTR_ERR(skb);
+	if (IS_ERR(skb))
+		goto exit;
+	odp_datapath = (struct odp_datapath *)skb->data;
+
+	mutex_lock(&dp_mutex);	/* a negative dp_idx (the -1 of a request "not specific to a datapath") starts the scan at 0 instead of wrapping to a huge u32 */
+	for (dp_idx = odp_datapath->dp_idx < 0 ? 0 : odp_datapath->dp_idx; dp_idx < ARRAY_SIZE(dps); dp_idx++) {
+		struct datapath *dp = get_dp(dp_idx);
+		if (!dp)
+			continue;
+
+		mutex_lock(&dp->mutex);	/* take dp->mutex before dropping dp_mutex, as lookup_datapath() does */
+		mutex_unlock(&dp_mutex);
+		err = copy_datapath_to_user(uodp_datapath, dp, odp_datapath->total_len);
+		mutex_unlock(&dp->mutex);
+		goto exit_free;
+	}
+	mutex_unlock(&dp_mutex);
+	err = -ENODEV;	/* no datapath at or after the starting index */
+
+exit_free:
+	kfree_skb(skb);
+exit:
+	return err;
+}
+
 static struct nla_policy vport_policy[ODPPT_MAX + 1] = {
 	[ODPPT_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
 	[ODPPT_STATS] = { .len = sizeof(struct rtnl_link_stats64) },
@@ -1632,18 +1866,26 @@ static long openvswitch_ioctl(struct file *f, unsigned int cmd,
 {
 	int dp_idx = iminor(f->f_dentry->d_inode);
 	struct datapath *dp;
-	int drop_frags, listeners;
-	unsigned int sflow_probability;
+	int listeners;
 	int err;
 
 	/* Handle commands with special locking requirements up front. */
 	switch (cmd) {
-	case ODP_DP_CREATE:
-		err = create_dp(dp_idx, (char __user *)argp);
+	case ODP_DP_NEW:
+		err = new_datapath((struct odp_datapath __user *)argp);
+		goto exit;
+
+	case ODP_DP_GET:
+		err = get_datapath((struct odp_datapath __user *)argp);
+		goto exit;
+
+	case ODP_DP_DEL:
+	case ODP_DP_SET:
+		err = modify_datapath(cmd, (struct odp_datapath __user *)argp);
 		goto exit;
 
-	case ODP_DP_DESTROY:
-		err = destroy_dp(dp_idx);
+	case ODP_DP_DUMP:
+		err = dump_datapath((struct odp_datapath __user *)argp);
 		goto exit;
 
 	case ODP_VPORT_NEW:
@@ -1694,25 +1936,6 @@ static long openvswitch_ioctl(struct file *f, unsigned int cmd,
 		goto exit;
 
 	switch (cmd) {
-	case ODP_DP_STATS:
-		err = get_dp_stats(dp, (struct odp_stats __user *)argp);
-		break;
-
-	case ODP_GET_DROP_FRAGS:
-		err = put_user(dp->drop_frags, (int __user *)argp);
-		break;
-
-	case ODP_SET_DROP_FRAGS:
-		err = get_user(drop_frags, (int __user *)argp);
-		if (err)
-			break;
-		err = -EINVAL;
-		if (drop_frags != 0 && drop_frags != 1)
-			break;
-		dp->drop_frags = drop_frags;
-		err = 0;
-		break;
-
 	case ODP_GET_LISTEN_MASK:
 		err = put_user(get_listen_mask(f), (int __user *)argp);
 		break;
@@ -1728,16 +1951,6 @@ static long openvswitch_ioctl(struct file *f, unsigned int cmd,
 		set_listen_mask(f, listeners);
 		break;
 
-	case ODP_GET_SFLOW_PROBABILITY:
-		err = put_user(dp->sflow_probability, (unsigned int __user *)argp);
-		break;
-
-	case ODP_SET_SFLOW_PROBABILITY:
-		err = get_user(sflow_probability, (unsigned int __user *)argp);
-		if (!err)
-			dp->sflow_probability = sflow_probability;
-		break;
-
 	default:
 		err = -ENOIOCTLCMD;
 		break;
@@ -1922,24 +2135,22 @@ static long openvswitch_compat_ioctl(struct file *f, unsigned int cmd, unsigned
 	int err;
 
 	switch (cmd) {
-	case ODP_DP_DESTROY:
 	case ODP_FLOW_FLUSH:
 		/* Ioctls that don't need any translation at all. */
 		return openvswitch_ioctl(f, cmd, argp);
 
-	case ODP_DP_CREATE:
+	case ODP_DP_NEW:
+	case ODP_DP_GET:
+	case ODP_DP_DEL:
+	case ODP_DP_SET:
+	case ODP_DP_DUMP:
 	case ODP_VPORT_NEW:
 	case ODP_VPORT_DEL:
 	case ODP_VPORT_GET:
 	case ODP_VPORT_SET:
 	case ODP_VPORT_DUMP:
-	case ODP_DP_STATS:
-	case ODP_GET_DROP_FRAGS:
-	case ODP_SET_DROP_FRAGS:
 	case ODP_SET_LISTEN_MASK:
 	case ODP_GET_LISTEN_MASK:
-	case ODP_SET_SFLOW_PROBABILITY:
-	case ODP_GET_SFLOW_PROBABILITY:
 		/* Ioctls that just need their pointer argument extended. */
 		return openvswitch_ioctl(f, cmd, (unsigned long)compat_ptr(argp));
 	}
diff --git a/datapath/vport.c b/datapath/vport.c
index 0fa4dd6..4db1f01 100644
--- a/datapath/vport.c
+++ b/datapath/vport.c
@@ -87,6 +87,7 @@ do {								\
 	}							\
 } while (0)
 
+
 /**
  *	vport_init - initialize vport subsystem
  *
diff --git a/include/openvswitch/datapath-protocol.h b/include/openvswitch/datapath-protocol.h
index fbd32d1..cb4ab26 100644
--- a/include/openvswitch/datapath-protocol.h
+++ b/include/openvswitch/datapath-protocol.h
@@ -70,14 +70,11 @@
 #include <linux/if_link.h>
 #include <linux/netlink.h>
 
-#define ODP_MAX 256             /* Maximum number of datapaths. */
-
-#define ODP_DP_CREATE           _IO('O', 0)
-#define ODP_DP_DESTROY          _IO('O', 1)
-#define ODP_DP_STATS            _IOW('O', 2, struct odp_stats)
-
-#define ODP_GET_DROP_FRAGS      _IOW('O', 3, int)
-#define ODP_SET_DROP_FRAGS      _IOR('O', 4, int)
+#define ODP_DP_NEW              _IOWR('O', 0, struct odp_datapath)
+#define ODP_DP_DEL              _IOR('O', 1, struct odp_datapath)
+#define ODP_DP_GET              _IOWR('O', 2, struct odp_datapath)
+#define ODP_DP_SET		_IOWR('O', 3, struct odp_datapath)
+#define ODP_DP_DUMP		_IOWR('O', 4, struct odp_datapath)
 
 #define ODP_GET_LISTEN_MASK     _IOW('O', 5, int)
 #define ODP_SET_LISTEN_MASK     _IOR('O', 6, int)
@@ -96,8 +93,38 @@
 
 #define ODP_EXECUTE             _IOR('O', 18, struct odp_execute)
 
-#define ODP_SET_SFLOW_PROBABILITY _IOR('O', 19, int)
-#define ODP_GET_SFLOW_PROBABILITY _IOW('O', 20, int)
+/**
+ * struct odp_datapath - header with basic information about a datapath.
+ * @dp_idx: Datapath index (-1 to make a request not specific to a datapath).
+ * @len: Length of this structure plus the Netlink attributes following it.
+ * @total_len: Total size of the buffer; the kernel's reply must fit within it.
+ *
+ * Followed by &struct nlattr attributes, whose types are drawn from %ODPDT_*,
+ * up to a length of @len bytes including the &struct odp_datapath header.
+ */
+struct odp_datapath {
+	int32_t dp_idx;
+	uint32_t len;		/* in bytes; on a reply the kernel sets it to the reply's length */
+	uint32_t total_len;	/* in bytes */
+};
+
+enum odp_datapath_type {	/* Netlink attribute types that follow struct odp_datapath */
+	ODPDT_UNSPEC,
+	ODPDT_NAME,		/* name of the datapath's local port (ODPP_LOCAL) network device */
+	ODPDT_STATS,		/* struct odp_stats; written by the kernel into replies */
+	ODPDT_IPV4_FRAGS,	/* 32-bit enum odp_frag_handling */
+	ODPDT_SAMPLING,		/* 32-bit fraction of packets to sample */
+	__ODPDT_MAX		/* one past the last valid type; see ODPDT_MAX */
+};
+
+#define ODPDT_MAX (__ODPDT_MAX - 1)
+
+/* Values for ODPDT_IPV4_FRAGS. */
+enum odp_frag_handling {
+	ODP_DP_FRAG_UNSPEC,	/* Not a valid setting; the kernel rejects it with EINVAL. */
+	ODP_DP_FRAG_ZERO,	/* Treat IP fragments as transport port 0. */
+	ODP_DP_FRAG_DROP	/* Drop IP fragments. */
+};
 
 struct odp_stats {
     uint64_t n_frags;           /* Number of dropped IP fragments. */
diff --git a/lib/dpif-linux.c b/lib/dpif-linux.c
index 852b468..38a06ea 100644
--- a/lib/dpif-linux.c
+++ b/lib/dpif-linux.c
@@ -47,11 +47,32 @@
 #include "rtnetlink-link.h"
 #include "shash.h"
 #include "svec.h"
+#include "unaligned.h"
 #include "util.h"
 #include "vlog.h"
 
 VLOG_DEFINE_THIS_MODULE(dpif_linux);
 
+struct dpif_linux_dp {
+    /* Userspace form of an ODP_DP_* request or reply. */
+    int cmd;                           /* ioctl command argument. */
+
+    /* struct odp_datapath header.  Signed like the kernel's int32_t field: -1 means "no specific datapath". */
+    int32_t dp_idx;
+
+    /* Attributes.  A null pointer or zero value means "not present". */
+    const char *name;                  /* ODPDT_NAME. */
+    struct odp_stats stats;            /* ODPDT_STATS. */
+    enum odp_frag_handling ipv4_frags; /* ODPDT_IPV4_FRAGS. */
+    const uint32_t *sampling;          /* ODPDT_SAMPLING. */
+};
+
+static void dpif_linux_dp_init(struct dpif_linux_dp *);
+static int dpif_linux_dp_transact(const struct dpif_linux_dp *request,
+                                  struct dpif_linux_dp *reply,
+                                  struct ofpbuf **bufp);
+static int dpif_linux_dp_get(const struct dpif *, struct dpif_linux_dp *reply,
+                             struct ofpbuf **bufp);
 /* Datapath interface for the openvswitch Linux kernel module. */
 struct dpif_linux {
     struct dpif dpif;
@@ -75,7 +96,6 @@ static int lookup_internal_device(const char *name, int *dp_idx, int *port_no);
 static int open_dpif(const struct dpif_linux_vport *local_vport,
                      struct dpif **);
 static int get_openvswitch_major(void);
-static int create_minor(const char *name, int minor);
 static int open_minor(int minor, int *fdp);
 static int make_openvswitch_device(int minor, char **fnp);
 static void dpif_linux_port_changed(const struct rtnetlink_link_change *,
@@ -91,9 +111,10 @@ dpif_linux_cast(const struct dpif *dpif)
 static int
 dpif_linux_enumerate(struct svec *all_dps)
 {
+    struct dpif_linux_dp request, reply;
+    struct ofpbuf *buf;
     int major;
-    int error;
-    int i;
+    int err;
 
     /* Check that the Open vSwitch module is loaded. */
     major = get_openvswitch_major();
@@ -101,22 +122,19 @@ dpif_linux_enumerate(struct svec *all_dps)
         return -major;
     }
 
-    error = 0;
-    for (i = 0; i < ODP_MAX; i++) {
-        struct dpif *dpif;
+    dpif_linux_dp_init(&request);
+    request.cmd = ODP_DP_DUMP;
+    for (;
+         !(err = dpif_linux_dp_transact(&request, &reply, &buf));
+         request.dp_idx = reply.dp_idx + 1) {
         char devname[16];
-        int retval;
-
-        sprintf(devname, "dp%d", i);
-        retval = dpif_open(devname, "system", &dpif);
-        if (!retval) {
-            svec_add(all_dps, devname);
-            dpif_uninit(dpif, true);
-        } else if (retval != ENODEV && !error) {
-            error = retval;
-        }
+
+        sprintf(devname, "dp%d", reply.dp_idx);
+        svec_add(all_dps, devname);
+
+        ofpbuf_delete(buf);
     }
-    return error;
+    return err == ENODEV ? 0 : err;
 }
 
 static int
@@ -131,27 +149,20 @@ dpif_linux_open(const struct dpif_class *class OVS_UNUSED, const char *name,
     minor = !strncmp(name, "dp", 2)
             && isdigit((unsigned char)name[2]) ? atoi(name + 2) : -1;
     if (create) {
-        if (minor >= 0) {
-            error = create_minor(name, minor);
-            if (error) {
-                return error;
-            }
-        } else {
-            /* Scan for unused minor number. */
-            for (minor = 0; ; minor++) {
-                if (minor >= ODP_MAX) {
-                    /* All datapath numbers in use. */
-                    return ENOBUFS;
-                }
+        struct dpif_linux_dp request, reply;
+        struct ofpbuf *buf;
+        int error;
 
-                error = create_minor(name, minor);
-                if (!error) {
-                    break;
-                } else if (error != EBUSY) {
-                    return error;
-                }
-            }
+        dpif_linux_dp_init(&request);
+        request.cmd = ODP_DP_NEW;
+        request.dp_idx = minor;
+        request.name = name;
+        error = dpif_linux_dp_transact(&request, &reply, &buf);
+        if (error) {
+            return error;
         }
+        minor = reply.dp_idx;
+        ofpbuf_delete(buf);
     }
 
     dpif_linux_vport_init(&request);
@@ -245,25 +256,41 @@ dpif_linux_get_all_names(const struct dpif *dpif_, struct svec *all_names)
 static int
 dpif_linux_destroy(struct dpif *dpif_)
 {
-    return do_ioctl(dpif_, ODP_DP_DESTROY, NULL);
+    struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+    struct dpif_linux_dp dp;
+
+    dpif_linux_dp_init(&dp);
+    dp.cmd = ODP_DP_DEL;
+    dp.dp_idx = dpif->minor;
+    return dpif_linux_dp_transact(&dp, NULL, NULL);
 }
 
 static int
 dpif_linux_get_stats(const struct dpif *dpif_, struct odp_stats *stats)
 {
-    memset(stats, 0, sizeof *stats);
-    return do_ioctl(dpif_, ODP_DP_STATS, stats);
+    struct dpif_linux_dp dp;
+    struct ofpbuf *buf;
+    int error;
+
+    error = dpif_linux_dp_get(dpif_, &dp, &buf);
+    if (!error) {
+        *stats = dp.stats;
+        ofpbuf_delete(buf);
+    }
+    return error;
 }
 
 static int
 dpif_linux_get_drop_frags(const struct dpif *dpif_, bool *drop_fragsp)
 {
-    int drop_frags;
+    struct dpif_linux_dp dp;
+    struct ofpbuf *buf;
     int error;
 
-    error = do_ioctl(dpif_, ODP_GET_DROP_FRAGS, &drop_frags);
+    error = dpif_linux_dp_get(dpif_, &dp, &buf);
     if (!error) {
-        *drop_fragsp = drop_frags & 1;
+        *drop_fragsp = dp.ipv4_frags == ODP_DP_FRAG_DROP;
+        ofpbuf_delete(buf);
     }
     return error;
 }
@@ -271,8 +298,14 @@ dpif_linux_get_drop_frags(const struct dpif *dpif_, bool *drop_fragsp)
 static int
 dpif_linux_set_drop_frags(struct dpif *dpif_, bool drop_frags)
 {
-    int drop_frags_int = drop_frags;
-    return do_ioctl(dpif_, ODP_SET_DROP_FRAGS, &drop_frags_int);
+    struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+    struct dpif_linux_dp dp;
+
+    dpif_linux_dp_init(&dp);
+    dp.cmd = ODP_DP_SET;
+    dp.dp_idx = dpif->minor;
+    dp.ipv4_frags = drop_frags ? ODP_DP_FRAG_DROP : ODP_DP_FRAG_ZERO;
+    return dpif_linux_dp_transact(&dp, NULL, NULL);
 }
 
 static int
@@ -659,13 +692,29 @@ static int
 dpif_linux_get_sflow_probability(const struct dpif *dpif_,
                                  uint32_t *probability)
 {
-    return do_ioctl(dpif_, ODP_GET_SFLOW_PROBABILITY, probability);
+    struct dpif_linux_dp dp;
+    struct ofpbuf *buf;
+    int error;
+
+    error = dpif_linux_dp_get(dpif_, &dp, &buf);
+    if (!error) {
+        *probability = dp.sampling ? *dp.sampling : 0;
+        ofpbuf_delete(buf);
+    }
+    return error;
 }
 
 static int
 dpif_linux_set_sflow_probability(struct dpif *dpif_, uint32_t probability)
 {
-    return do_ioctl(dpif_, ODP_SET_SFLOW_PROBABILITY, &probability);
+    struct dpif_linux *dpif = dpif_linux_cast(dpif_);
+    struct dpif_linux_dp dp;
+
+    dpif_linux_dp_init(&dp);
+    dp.cmd = ODP_DP_SET;
+    dp.dp_idx = dpif->minor;
+    dp.sampling = &probability;
+    return dpif_linux_dp_transact(&dp, NULL, NULL);
 }
 
 static int
@@ -1002,22 +1051,6 @@ get_major(const char *target)
 }
 
 static int
-create_minor(const char *name, int minor)
-{
-    int error;
-    int fd;
-
-    error = open_minor(minor, &fd);
-    if (error) {
-        return error;
-    }
-
-    error = ioctl(fd, ODP_DP_CREATE, name) ? errno : 0;
-    close(fd);
-    return error;
-}
-
-static int
 open_minor(int minor, int *fdp)
 {
     int error;
@@ -1058,6 +1091,24 @@ dpif_linux_port_changed(const struct rtnetlink_link_change *change,
         dpif->change_error = true;
     }
 }
+
+static int
+get_dp0_fd(int *dp0_fdp)
+{
+    /* Opened on first successful use and then reused; a failed open is retried on the next call. */
+    static int cached_fd = -1;
+    if (cached_fd < 0) {
+        int new_fd;
+        int error = open_minor(0, &new_fd);
+
+        if (error) {
+            return error;
+        }
+        cached_fd = new_fd;
+    }
+    *dp0_fdp = cached_fd;
+    return 0;
+}
 
 /* Parses the contents of 'buf', which contains a "struct odp_vport" followed
  * by Netlink attributes, into 'vport'.  Returns 0 if successful, otherwise a
@@ -1188,25 +1239,21 @@ dpif_linux_vport_transact(const struct dpif_linux_vport *request,
                           struct dpif_linux_vport *reply,
                           struct ofpbuf **bufp)
 {
-    static int dp0_fd = -1;
     struct ofpbuf *buf = NULL;
     int error;
+    int fd;
 
     assert((reply != NULL) == (bufp != NULL));
-    if (dp0_fd < 0) {
-        int fd;
 
-        error = open_minor(0, &fd);
-        if (error) {
-            goto error;
-        }
-        dp0_fd = fd;
+    error = get_dp0_fd(&fd);
+    if (error) {
+        goto error;
     }
 
     buf = ofpbuf_new(1024);
     dpif_linux_vport_to_ofpbuf(request, buf);
 
-    error = ioctl(dp0_fd, request->cmd, buf->data) ? errno : 0;
+    error = ioctl(fd, request->cmd, buf->data) ? errno : 0;
     if (error) {
         goto error;
     }
@@ -1247,4 +1294,155 @@ dpif_linux_vport_get(const char *name, struct dpif_linux_vport *reply,
 
     return dpif_linux_vport_transact(&request, reply, bufp);
 }
+
+/* Parses the contents of 'buf', which contains a "struct odp_datapath"
+ * followed by Netlink attributes, into 'dp'.  Returns 0 if successful,
+ * otherwise a positive errno value.
+ *
+ * 'dp' will contain pointers into 'buf', so the caller should not free 'buf'
+ * while 'dp' is still in use. */
+static int
+dpif_linux_dp_from_ofpbuf(struct dpif_linux_dp *dp, const struct ofpbuf *buf)
+{
+    static const struct nl_policy odp_datapath_policy[] = {
+        [ODPDT_NAME] = { .type = NL_A_STRING, .max_len = IFNAMSIZ },  /* the only entry not marked optional */
+        [ODPDT_STATS] = { .type = NL_A_UNSPEC,
+                          .min_len = sizeof(struct odp_stats),
+                          .max_len = sizeof(struct odp_stats),
+                          .optional = true },
+        [ODPDT_IPV4_FRAGS] = { .type = NL_A_U32, .optional = true },
+        [ODPDT_SAMPLING] = { .type = NL_A_U32, .optional = true },
+    };
+
+    struct odp_datapath *odp_dp;
+    struct nlattr *a[ARRAY_SIZE(odp_datapath_policy)];
+
+    dpif_linux_dp_init(dp);  /* start "empty" so attributes absent from 'buf' read as unset */
+
+    if (!nl_policy_parse(buf, sizeof *odp_dp, odp_datapath_policy,  /* attributes begin sizeof *odp_dp bytes into 'buf' */
+                         a, ARRAY_SIZE(odp_datapath_policy))) {
+        return EINVAL;
+    }
+    odp_dp = buf->data;
+
+    dp->dp_idx = odp_dp->dp_idx;
+    dp->name = nl_attr_get_string(a[ODPDT_NAME]);  /* points into 'buf' */
+    if (a[ODPDT_STATS]) {
+        /* Can't use structure assignment because Netlink doesn't ensure
+         * sufficient alignment for 64-bit members. */
+        memcpy(&dp->stats, nl_attr_get(a[ODPDT_STATS]), sizeof dp->stats);
+    }
+    if (a[ODPDT_IPV4_FRAGS]) {
+        dp->ipv4_frags = nl_attr_get_u32(a[ODPDT_IPV4_FRAGS]);
+    }
+    if (a[ODPDT_SAMPLING]) {
+        dp->sampling = nl_attr_get(a[ODPDT_SAMPLING]);  /* points into 'buf'; stays null when absent */
+    }
+    return 0;
+}
+
+/* Appends to 'buf' (which must initially be empty) a "struct odp_datapath"
+ * followed by Netlink attributes corresponding to 'dp'. */
+static void
+dpif_linux_dp_to_ofpbuf(const struct dpif_linux_dp *dp, struct ofpbuf *buf)
+{
+    struct odp_datapath *odp_dp;
+
+    ofpbuf_reserve(buf, sizeof *odp_dp);  /* headroom for the header itself (not for a pointer to it), pushed below */
+
+    if (dp->name) {
+        nl_msg_put_string(buf, ODPDT_NAME, dp->name);  /* datapath attribute space (ODPDT_*), not the vport ODPPT_* one */
+    }
+
+    /* Skip ODPDT_STATS since we never have a reason to serialize it. */
+
+    if (dp->ipv4_frags) {  /* zero is ODP_DP_FRAG_UNSPEC, i.e. "not set" */
+        nl_msg_put_u32(buf, ODPDT_IPV4_FRAGS, dp->ipv4_frags);
+    }
+
+    if (dp->sampling) {
+        nl_msg_put_u32(buf, ODPDT_SAMPLING, *dp->sampling);
+    }
+
+    odp_dp = ofpbuf_push_uninit(buf, sizeof *odp_dp);  /* header goes in front of the attributes written above */
+    odp_dp->dp_idx = dp->dp_idx;
+    odp_dp->len = buf->size;
+    odp_dp->total_len = (char *) ofpbuf_end(buf) - (char *) buf->data;  /* everything from the header to the end of the allocation is offered for the reply */
+}
+
+/* Clears 'dp' to "empty" values: every member zero or null, except 'dp_idx', which is set to -1. */
+void
+dpif_linux_dp_init(struct dpif_linux_dp *dp)
+{
+    memset(dp, 0, sizeof *dp);
+    dp->dp_idx = -1;  /* -1 makes a request not specific to a datapath (see struct odp_datapath) */
+}
+
+/* Executes 'request' in the kernel datapath.  If the command fails, returns a
+ * positive errno value.  Otherwise, if 'reply' and 'bufp' are null, returns 0
+ * without doing anything else.  If 'reply' and 'bufp' are nonnull, then the
+ * result of the command is expected to be an odp_datapath also, which is
+ * decoded and stored in '*reply' and '*bufp'.  The caller must free '*bufp'
+ * when the reply is no longer needed ('reply' will contain pointers into
+ * '*bufp'). */
+int
+dpif_linux_dp_transact(const struct dpif_linux_dp *request,
+                       struct dpif_linux_dp *reply, struct ofpbuf **bufp)
+{
+    struct ofpbuf *buf = NULL;
+    int error;
+    int fd;
+
+    assert((reply != NULL) == (bufp != NULL));  /* both null or both nonnull */
+
+    error = get_dp0_fd(&fd);  /* every datapath request is issued on the shared minor-0 descriptor */
+    if (error) {
+        goto error;
+    }
+
+    buf = ofpbuf_new(1024);  /* holds the serialized request; the kernel writes any reply over it in place */
+    dpif_linux_dp_to_ofpbuf(request, buf);
+
+    error = ioctl(fd, request->cmd, buf->data) ? errno : 0;
+    if (error) {
+        goto error;
+    }
+
+    if (bufp) {
+        buf->size = ((struct odp_datapath *) buf->data)->len;  /* length taken from the header now in the buffer */
+        error = dpif_linux_dp_from_ofpbuf(reply, buf);
+        if (error) {
+            goto error;
+        }
+        *bufp = buf;  /* ownership of 'buf' passes to the caller */
+    } else {
+        ofpbuf_delete(buf);
+    }
+    return 0;
+
+error:
+    ofpbuf_delete(buf);  /* 'buf' is still NULL here if get_dp0_fd() failed */
+    if (bufp) {
+        memset(reply, 0, sizeof *reply);  /* leave no pointers into the buffer just freed */
+        *bufp = NULL;
+    }
+    return error;
+}
+
+/* Queries the kernel (ODP_DP_GET) for the datapath behind 'dpif_' and decodes
+ * the answer into '*reply', backed by '*bufp'.  'reply' points into '*bufp',
+ * so the caller frees '*bufp' only once it is done with 'reply'. */
+int
+dpif_linux_dp_get(const struct dpif *dpif_, struct dpif_linux_dp *reply,
+                  struct ofpbuf **bufp)
+{
+    struct dpif_linux_dp query;
+
+    dpif_linux_dp_init(&query);
+    query.cmd = ODP_DP_GET;
+    /* The datapath is identified by this dpif's minor number. */
+    query.dp_idx = dpif_linux_cast(dpif_)->minor;
+
+    return dpif_linux_dp_transact(&query, reply, bufp);
+}
 
-- 
1.7.1





More information about the dev mailing list