[ovs-dev] [RFC PATCH v2 06/10] vxlanipsec: Add userspace support for vxlan ipsec.
Chandran, Sugesh
sugesh.chandran at intel.com
Fri Sep 1 22:15:07 UTC 2017
At very high level, this patch is very long.
Is it possible to split them based on the functionality?
More comments are below,
Regards
_Sugesh
> -----Original Message-----
> From: ovs-dev-bounces at openvswitch.org [mailto:ovs-dev-
> bounces at openvswitch.org] On Behalf Of Ian Stokes
> Sent: Friday, August 25, 2017 5:40 PM
> To: dev at openvswitch.org
> Subject: [ovs-dev] [RFC PATCH v2 06/10] vxlanipsec: Add userspace support
> for vxlan ipsec.
>
> This patch introduces a new tunnel port type 'vxlanipsec'. This port combines
> vxlan tunnelling with IPsec operating in transport mode.
>
> Ciphering and authentication actions ares provided by a DPDK cryptodev.
> The cryptodev operates as a vdev and is associated with the vxlan tunnel
> port. Upon tunnel encapsulation packets are encrypted and a hash digest
> attached to the packet as per RFC4303. Upon decapsulation a packet is first
> verified via the hash and then decrypted.
>
> The cipher algorithm used is 128 AES-CBC and the authentication algorithm is
> HMAC-SHA1-96. Note this work is in progress and is not meant for upstream.
> It's purpose is to solicit feedback on the approach and known issues flagged
> in the accompanying cover letter to the patch series.
>
> Signed-off-by: Ian Stokes <ian.stokes at intel.com>
> ---
[snip]
> }
>
> +
> + padding = (uint8_t *)dp_packet_put_uninit(packet, pad_len +
> digest_len);
> + if (OVS_UNLIKELY(padding == NULL)) {
> + VLOG_ERR("Not enough packet trailing space\n");
> + return;
> + }
> +
[Sugesh] Why should this need to be filled with numbers here. May be I am missing something?
> + /* Fill pad_len using default sequential scheme */
> + for (i = 0; i < pad_len - ESP_TRAILER_LEN; i++) {
> + padding[i] = i + 1;
> + }
> +
> + /* Populate the ESP trailer */
> + padding[pad_len - ESP_TRAILER_LEN] = pad_len - ESP_TRAILER_LEN;
> + padding[pad_len - 1] = IPPROTO_UDP;
> +
> + cop = rte_crypto_op_alloc(ops->crypto_op_pool,
> + RTE_CRYPTO_OP_TYPE_SYMMETRIC);
> +
> + if (cop == NULL) {
> + rte_pktmbuf_free(m);
> + rte_crypto_op_free(cop);
> + VLOG_ERR("Could not allocate crypto op.");
[Sugesh] This shoule be VLOG_ERR_RL.
Since the tunnel combine take care of packet clone, I assume there is no issues of freeing the packet here. However I would suggest to test and make sure
it doesn't break any post tunnel actions and handling of patch ports with tunnel.
> + return;
> + }
> + cop->type = RTE_CRYPTO_OP_TYPE_SYMMETRIC;
> + cop->status = RTE_CRYPTO_OP_STATUS_NOT_PROCESSED;
> + sym_cop = get_sym_cop(cop);
> + sym_cop->m_src = m;
[Sugesh] Is it possible to define the sym_cop only once and reuse it for all the packets than initializing for every packet,?
> +
[snip]
> + sym_cop->auth.digest.length = digest_len;
> +
> + /* Set and attach sym encrypt session */
> + c_session = ops->en_ops.session;
> + ret = rte_crypto_op_attach_sym_session(cop,c_session);
> +
> + if (ret != 0) {
> + VLOG_ERR("Could not attach crypto session.");
[Sugesh] Should we free the packet here, when the attach failed?
> + }
> +
> + /* Add packet for crypto enqueue and dequeue */
> + cop_p = &cop;
> + ret = rte_cryptodev_enqueue_burst(ops->cdev_id,0, cop_p , 1);
> +
> + if (ret < 1) {
> + rte_pktmbuf_free(cop->sym->m_src);
> + rte_crypto_op_free(cop);
> + VLOG_ERR("Could not enqueue for crypto op.");
[Sugesh] again VLOG_ERR_RL , Use it for all the packet processing path to avoid so many messages.
> + return;
> + }
> +
> + ret = rte_cryptodev_dequeue_burst( ops->cdev_id, 0, cop_p, 1);
> + if (ret < 1) {
> + rte_pktmbuf_free(cop->sym->m_src);
> + rte_crypto_op_free(cop);
> + VLOG_INFO("Could not dequeue crypto op.");
> + return;
> + }
> +
> + if (cop->status != RTE_CRYPTO_OP_STATUS_SUCCESS) {
> + VLOG_ERR("Error occurred during outbound IPsec crypto operation.");
[Sugesh] this case also the packets must be released. Otherwise the packets get forwarded plain.
> + }
> +
> + /* Packet has been modified, free the crypto op */
> + rte_crypto_op_free(cop);
> +
> + return;
> +}
> +
> +
> +void
> +netdev_tnl_push_ipsec_header(struct dp_packet *packet,
> + const struct ovs_action_push_tnl *data) {
> + struct udp_header *udp;
> + int ip_tot_size;
> + struct netdev_vport *dev = data->dev;
> + struct ipsec_options *ops = dev->ipsec_ops;
> + uint16_t iv_len = ops->en_ops.iv.length;
> +
> + /* XXX To do: should handle if packet is not DPDK source first */
> +
> + udp = netdev_tnl_push_esp_header(packet, data->header, data-
> >header_len,
> + ops->spi, ops->seq, iv_len,
> + &ip_tot_size);
> +
> + /* Increment the associated ESP sequence number */
> + ops->seq++;
[Sugesh] Should it be get roll over after some point in time??
I see that the seq is initialized at the time of vport creation.
Is it OK ?
> +
> + /* set udp src port */
> + udp->udp_src = netdev_tnl_get_src_port(packet);
> + udp->udp_len = htons(ip_tot_size);
> +
> + if (udp->udp_csum) {
> + uint32_t csum;
> + if (netdev_tnl_is_header_ipv6(dp_packet_data(packet))) {
> + csum = packet_csum_pseudoheader6(netdev_tnl_ipv6_hdr(
> + dp_packet_data(packet)));
> + } else {
> + csum = packet_csum_pseudoheader(netdev_tnl_ip_hdr
> + (dp_packet_data(packet)));
> + }
> +
> + csum = csum_continue(csum, udp, ip_tot_size);
> + udp->udp_csum = csum_finish(csum);
> +
[snip]
> static int
> gre_header_len(ovs_be16 flags)
> {
> @@ -564,6 +813,229 @@ err:
> return NULL;
> }
>
> +/*
> + * The SPI of an IPsec packet is used as an index value of the spi map.
> + * This returns the associated vport netdev and from this we can access
> +the
> + * associated crypto options for decryption.
> + */
> +static struct netdev_vport *
> +get_vport_from_spi(struct dp_packet *packet) {
> + uint8_t *nh = (uint8_t *)dp_packet_data(packet);
> + uint16_t ip_hdr_len = ETH_HEADER_LEN + IP_HEADER_LEN;
> + struct netdev_vport *dev = NULL;
> + struct esp_header *esp = (struct esp_header *)(nh + ip_hdr_len);
> +
> + uint32_t spi = ntohl(esp->spi);
> + dev = spi_map[spi]->vp;
[Sugesh] this has to be a hash map , perhaps it has to point to the ipsec
context instead of vport ?
> + if (!dev) {
> + VLOG_ERR("Could not get associated vport for spi value: %u", spi);
> + }
> +
> + return dev;
> +}
> +
[Sugesh] May be a comment that says inbound is for decrypt
Or handling of packets at the decap side?
> +static int
> +netdev_ipsec_inbound(struct dp_packet *packet, uint16_t *iv_len,
> + uint16_t *trim_len) {
> + struct netdev_vport *dev = NULL;
> + struct ipsec_options *ops = NULL;
> + struct rte_crypto_op *cop;
> + struct rte_crypto_sym_op *sym_cop;
> + struct rte_mbuf *m = (struct rte_mbuf *)packet;
> + struct rte_cryptodev_sym_session *c_session;
> + struct rte_crypto_op **cop_p;
> + uint8_t *pad_len;
> + uint16_t ip_hdr_len, l3_hdr_len, payload_len, digest_len;
> + unsigned ret = 0;
> +
> + dev = get_vport_from_spi(packet);
> +
> + if (!dev) {
> + return -1;
> + }
> +
> + ops = dev->ipsec_ops;
> + digest_len = ops->de_ops.digest_size;
> + *iv_len = ops->de_ops.iv.length;
> +
> + /* Compute expected header lengths. Note this assumes IPv4 */
> + /* XXX To Do: compute both IPv4 andIPv6 header lengths */
> + ip_hdr_len = ETH_HEADER_LEN + IP_HEADER_LEN;
> + l3_hdr_len = ip_hdr_len + ESP_HEADER_LEN;
> + payload_len = dp_packet_size(packet) - (l3_hdr_len + *iv_len +
> + digest_len);
[Snip]
> +
> + /* Set IV offset, length */
> + uint8_t *iv = (uint8_t *) dp_packet_data(packet) + l3_hdr_len;
> + sym_cop->cipher.iv.data = iv;
> + sym_cop->cipher.iv.length = *iv_len;
> +
> + /* Set authentication data offset and length */
> + sym_cop->auth.data.offset = ip_hdr_len;
> + sym_cop->auth.data.length = ESP_HEADER_LEN + *iv_len + payload_len;
> +
> + /* Set authentication digest length, data and adress */
> + sym_cop->auth.digest.data = rte_pktmbuf_mtod_offset(m, void *,
> + rte_pktmbuf_pkt_len(m)
> + - digest_len);
> + sym_cop->auth.digest.phys_addr = rte_pktmbuf_mtophys_offset(m,
> + rte_pktmbuf_pkt_len(m)
> + - digest_len);
> + sym_cop->auth.digest.length = digest_len;
> +
> + /* Set and attach sym encrypt session */
> + c_session = ops->de_ops.session;
> + ret = rte_crypto_op_attach_sym_session(cop,c_session);
> + if (ret != 0) {
> + VLOG_ERR(" Could not attach de crypto session");
> + rte_pktmbuf_free(cop->sym->m_src);
> + return -1;
> + }
> +
> + /* Add packet for crypto enqueue and dequeue */
> + cop_p = &cop;
> + ret = rte_cryptodev_enqueue_burst(ops->cdev_id,0, cop_p , 1);
> +
[Sugesh] I don't see free up the packets on any of following error cases.
Any reason for that?
Also for the free case above , what would be the impact if we have more actions
Something like
PKT-IN --> decap -->VM
--> sample -->controller
This case the same packet is cloned for sample, will it handle properly here?
> + if (ret < 1) {
> + rte_crypto_op_free(cop);
> + VLOG_INFO("Could not enqueue for crypto op");
> + return -1;
> + }
> +
> + ret = rte_cryptodev_dequeue_burst(ops->cdev_id, 0, cop_p, 1);
> +
> + if (ret < 1) {
> + rte_crypto_op_free(cop);
> + VLOG_INFO("Could not dequeue crypto op");
> + return -1;
> + }
> +
> + if (cop->status != RTE_CRYPTO_OP_STATUS_SUCCESS) {
> + VLOG_ERR("Could not verify/decrypt crypto operation");
> + }
> +
> + /* Packet has been modified, free the crypto op */
> + rte_crypto_op_free(cop);
> +
> + /* Set the correct l4 offset for the packet now that it has been
> + * verified and decrypted by taking the IV length into account
> + */
> + packet->l4_ofs = l3_hdr_len + *iv_len;
> +
> + /* Now that the packet is decrypted we can compute the trailer length
> + * that will be stripped from the end of the packet using the padding
> + * value in the esp trailer */
> + pad_len = rte_pktmbuf_mtod_offset(m, uint8_t *,
> + rte_pktmbuf_pkt_len(m) - digest_len
> + - ESP_TRAILER_LEN);
> + *trim_len = digest_len + *pad_len + ESP_TRAILER_LEN;
> +
> + return 0;
> +}
> +
> +struct dp_packet *
> +netdev_vxlanipsec_pop_header(struct dp_packet *packet) {
> + struct pkt_metadata *md = &packet->md;
> + struct flow_tnl *tnl = &md->tunnel;
> + struct vxlanhdr *vxh;
> + unsigned int hlen;
> + ovs_be32 vx_flags;
> + enum packet_type next_pt = PT_ETH;
> + int ret = 0;
> + uint16_t iv_len, trim_len;
> + struct rte_mbuf *m = (struct rte_mbuf *)packet;
> +
> + /* XXX To do: should handle if packet is not DPDK source first */
> +
> + /* Need to verify and decrypt the packet before performing further
> + * operations.
> + */
> + ret = netdev_ipsec_inbound(packet, &iv_len, &trim_len);
> +
> + if (ret) {
> + /* Error occurred during verification/decryption */
> + goto err;
> + }
> +
> + pkt_metadata_init_tnl(md);
> + if (VXLAN_HLEN > dp_packet_l4_size(packet)) {
> + goto err;
> + }
> +
> + vxh = udp_extract_tnl_md(packet, tnl, &hlen);
> + if (!vxh) {
> + goto err;
> + }
> +
> + vx_flags = get_16aligned_be32(&vxh->vx_flags);
> + if (vx_flags & htonl(VXLAN_HF_GPE)) {
> + vx_flags &= htonl(~VXLAN_GPE_USED_BITS);
> + /* Drop the OAM packets */
> + if (vxh->vx_gpe.flags & VXLAN_GPE_FLAGS_O) {
> + goto err;
> + }
> + switch (vxh->vx_gpe.next_protocol) {
> + case VXLAN_GPE_NP_IPV4:
> + next_pt = PT_IPV4;
> + break;
> + case VXLAN_GPE_NP_IPV6:
> + next_pt = PT_IPV6;
> + break;
> + case VXLAN_GPE_NP_NSH:
> + next_pt = PT_NSH;
> + break;
> + case VXLAN_GPE_NP_ETHERNET:
> + next_pt = PT_ETH;
> + break;
> + default:
> + goto err;
> + }
> + }
> +
> + if (vx_flags != htonl(VXLAN_FLAGS) ||
> + (get_16aligned_be32(&vxh->vx_vni) & htonl(0xff))) {
> + VLOG_WARN_RL(&err_rl, "invalid vxlan flags=%#x vni=%#x\n",
> + ntohl(vx_flags),
> + ntohl(get_16aligned_be32(&vxh->vx_vni)));
> + goto err;
> + }
> + tnl->tun_id = htonll(ntohl(get_16aligned_be32(&vxh->vx_vni)) >> 8);
> + tnl->flags |= FLOW_TNL_F_KEY;
> +
> + packet->packet_type = htonl(next_pt);
> + dp_packet_reset_packet(packet, hlen + ESP_HEADER_LEN + iv_len +
> + VXLAN_HLEN);
> + ret = rte_pktmbuf_trim(m, trim_len);
> +
> + if (next_pt != PT_ETH) {
> + packet->l3_ofs = 0;
> + }
> +
> + return packet;
> +err:
> + dp_packet_delete(packet);
> + return NULL;
> +}
> +
> int
> netdev_vxlan_build_header(const struct netdev *netdev,
> struct ovs_action_push_tnl *data, @@ -621,6 +1093,64 @@
> drop:
> return 1;
> }
>
> +int
> +netdev_vxlanipsec_build_header(const struct netdev *netdev,
> + struct ovs_action_push_tnl *data,
> + const struct netdev_tnl_build_header_params
> +*params) {
> + struct netdev_vport *dev = netdev_vport_cast(netdev);
> + struct netdev_tunnel_config *tnl_cfg;
> + struct vxlanhdr *vxh;
> +
> + /* XXX: RCUfy tnl_cfg. */
> + ovs_mutex_lock(&dev->mutex);
> + tnl_cfg = &dev->tnl_cfg;
> +
> + data->dev = netdev_vport_cast(netdev);
> +
> + vxh = vxlanipsec_udp_build_header(dev, tnl_cfg, data, params);
> +
> + if (tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GPE)) {
> + put_16aligned_be32(&vxh->vx_flags, htonl(VXLAN_FLAGS |
> VXLAN_HF_GPE));
> + put_16aligned_be32(&vxh->vx_vni,
> + htonl(ntohll(params->flow->tunnel.tun_id) << 8));
> + if (params->flow->packet_type == htonl(PT_ETH)) {
> + vxh->vx_gpe.next_protocol = VXLAN_GPE_NP_ETHERNET;
> + } else if (pt_ns(params->flow->packet_type) == OFPHTN_ETHERTYPE) {
> + switch (pt_ns_type(params->flow->packet_type)) {
> + case ETH_TYPE_IP:
> + vxh->vx_gpe.next_protocol = VXLAN_GPE_NP_IPV4;
> + break;
> + case ETH_TYPE_IPV6:
> + vxh->vx_gpe.next_protocol = VXLAN_GPE_NP_IPV6;
> + break;
> + case ETH_TYPE_TEB:
> + vxh->vx_gpe.next_protocol = VXLAN_GPE_NP_ETHERNET;
> + break;
> + default:
> + goto drop;
> + }
> + } else {
> + goto drop;
> + }
> + } else {
> + put_16aligned_be32(&vxh->vx_flags, htonl(VXLAN_FLAGS));
> + put_16aligned_be32(&vxh->vx_vni,
> + htonl(ntohll(params->flow->tunnel.tun_id) << 8));
> + }
> +
> + ovs_mutex_unlock(&dev->mutex);
> + data->header_len += sizeof *vxh;
> + data->tnl_type = OVS_VPORT_TYPE_VXLAN;
> + return 0;
> +
> +drop:
> + ovs_mutex_unlock(&dev->mutex);
> + return 1;
> +}
> +
> +
> +
> struct dp_packet *
> netdev_geneve_pop_header(struct dp_packet *packet) { diff --git
>
> int netdev_vport_construct(struct netdev *); diff --git a/lib/netdev-vport.c
> b/lib/netdev-vport.c index d11c5cc..8575c5c 100644
> --- a/lib/netdev-vport.c
> +++ b/lib/netdev-vport.c
[Sugesh] In general I am kind of inclined towards having a new file(netdev-ipsec-vport) to keep all the ipsec crypto specific functions.
There two advantages of having it that way 1) code is modular and doesn't impact existing vport implementation 2) Doesn't need to put
a lot of #if, #else for DPDK based code in the current vport code. Again its just my opinion.
Same for the vport.h file .
> @@ -26,6 +26,13 @@
> #include <netinet/in.h>
> #include <netinet/ip6.h>
> #include <sys/ioctl.h>
> +#include <unistd.h>
> +
> +#include <rte_config.h>
> +#include <rte_cryptodev.h>
> +#include <rte_errno.h>
> +#include <rte_malloc.h>
> +#include <rte_mbuf.h>
>
> #include "byte-order.h"
> #include "daemon.h"
> @@ -59,6 +66,8 @@ VLOG_DEFINE_THIS_MODULE(netdev_vport);
>
> #define DEFAULT_TTL 64
>
> +#define SPI_VAL 1
> +
> /* Last read of the route-table's change number. */ static uint64_t
> rt_change_seqno;
>
[Snip]
>
> --
> 1.7.0.7
>
> _______________________________________________
> dev mailing list
> dev at openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
More information about the dev
mailing list