[ovs-dev] [PATCH ovn v4] ovn-northd: Add IGMP Relay support

Dumitru Ceara dceara at redhat.com
Fri Aug 16 13:53:43 UTC 2019


On Fri, Aug 16, 2019 at 3:04 PM Numan Siddique <nusiddiq at redhat.com> wrote:
>
>
>
> On Fri, Aug 16, 2019 at 6:23 PM Numan Siddique <nusiddiq at redhat.com> wrote:
>>
>>
>>
>> On Fri, Aug 16, 2019 at 5:51 PM Dumitru Ceara <dceara at redhat.com> wrote:
>>>
>>> Add a new configuration option 'mcast_relay' to the Logical_Router:options
>>> in the OVN Northbound database.
>>>
>>> If a router is configured with 'mcast_relay' enabled then ovn-northd
>>> will install Logical_Flows to allow IP multicast traffic to be routed
>>> between Logical_Switches. The logical router will aggregate all IGMP
>>> groups from attached logical switches and modify the routing pipeline in
>>> the following way:
>>> - Table S_ROUTER_IN_IP_INPUT: add flow allowing IP multicast traffic
>>>   if mcast_relay is enabled on the datapath.
>>> - Table S_ROUTER_IN_IP_ROUTING: add flow matching the group address,
>>>   update TTL and set outport="<Multicast_Group> associated with the
>>>   IGMP group". Continue to next table.
>>> - Table S_ROUTER_IN_ARP_RESOLVE: bypass ARP resolve for IP multicast
>>>   traffic and continue to next table.
>>> - Table S_ROUTER_OUT_DELIVERY: add flow matching IP multicast traffic
>>>   and set ETH.SRC to the MAC address of the logical port on which
>>>   traffic is forwarded.
>>>
>>> Signed-off-by: Dumitru Ceara <dceara at redhat.com>
>>> Acked-by: Mark Michelson <mmichels at redhat.com>
>>
>>
>> Acked-by: Numan Siddique <nusiddiq at redhat.com>
>>
>
> I went ahead and applied this to master with the changes below:
>
> ****************
> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
> index 592d3b9d4..e29b0fff4 100644
> --- a/northd/ovn-northd.c
> +++ b/northd/ovn-northd.c
> @@ -6355,14 +6355,8 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>                        "drop;");
>
>          /* Allow multicast if relay enabled (priority 95). */
> -        ds_clear(&actions);
> -        if (od->mcast_info.rtr.relay) {
> -            ds_put_cstr(&actions, "next;");
> -        } else {
> -            ds_put_cstr(&actions, "drop;");
> -        }
> -        ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 95,
> -                      "ip4.mcast", ds_cstr(&actions));
> +        ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 95, "ip4.mcast",
> +                      od->mcast_info.rtr.relay ? "next;" : "drop;");
>
>          /* ARP reply handling.  Use ARP replies to populate the logical
>           * router's ARP table. */
> **********
>
> Thanks
> Numan

Great, thanks Numan!

>
>>
>> Just one comment inline.
>>
>> Thanks
>> Numan
>>
>>>
>>>
>>> ---
>>> v4:
>>> - Address Numan's comment and use the predicate for ip4.mcast and
>>>   add a new ip4.src_mcast predicate.
>>> v3:
>>> - Address Mark's comment and move setting of the outport in the IP
>>>   Routing stage.
>>> - Update commit message.
>>> - Fix some typos.
>>> v2:
>>> - Optimize flooding to multicast router ports.
>>> - Fix check for source IP multicast in router pipeline.
>>> - Use an enum for OVN_MCAST_*_KEY definitions to avoid hard to debug
>>>   errors due to typos when adding new OVN_MCAST_*_KEY values.
>>> - Fix ovn-northd man page for IGMP.
>>> ---
>>>  NEWS                    |   1 +
>>>  lib/logical-fields.c    |   2 +
>>>  lib/mcast-group-index.h |  13 +-
>>>  northd/ovn-northd.8.xml |  79 +++++++-
>>>  northd/ovn-northd.c     | 505 ++++++++++++++++++++++++++++++++++++------------
>>>  ovn-nb.xml              |   6 +
>>>  ovn-sb.xml              |   2 +
>>>  tests/ovn.at            | 199 +++++++++++++++++--
>>>  8 files changed, 656 insertions(+), 151 deletions(-)
>>>
>>> diff --git a/NEWS b/NEWS
>>> index f476984..73045d6 100644
>>> --- a/NEWS
>>> +++ b/NEWS
>>> @@ -39,6 +39,7 @@ Post-v2.11.0
>>>         logical groups which results in tunnels only been formed between
>>>         members of the same transport zone(s).
>>>       * Support for new logical switch port type - 'virtual'.
>>> +     * Support for IGMP Snooping/Querier and Relay.
>>>     - New QoS type "linux-netem" on Linux.
>>>     - Added support for TLS Server Name Indication (SNI).
>>>
>>> diff --git a/lib/logical-fields.c b/lib/logical-fields.c
>>> index 4ad5bf4..8fb591c 100644
>>> --- a/lib/logical-fields.c
>>> +++ b/lib/logical-fields.c
>>> @@ -156,6 +156,8 @@ ovn_init_symtab(struct shash *symtab)
>>>
>>>      expr_symtab_add_field(symtab, "ip4.src", MFF_IPV4_SRC, "ip4", false);
>>>      expr_symtab_add_field(symtab, "ip4.dst", MFF_IPV4_DST, "ip4", false);
>>> +    expr_symtab_add_predicate(symtab, "ip4.src_mcast",
>>> +                              "ip4.src[28..31] == 0xe");
>>>      expr_symtab_add_predicate(symtab, "ip4.mcast", "ip4.dst[28..31] == 0xe");
>>>
>>>      expr_symtab_add_predicate(symtab, "icmp4", "ip4 && ip.proto == 1");
>>> diff --git a/lib/mcast-group-index.h b/lib/mcast-group-index.h
>>> index 15a1592..cb49ad7 100644
>>> --- a/lib/mcast-group-index.h
>>> +++ b/lib/mcast-group-index.h
>>> @@ -20,8 +20,17 @@ struct ovsdb_idl;
>>>
>>>  struct sbrec_datapath_binding;
>>>
>>> -#define OVN_MCAST_FLOOD_TUNNEL_KEY   65535
>>> -#define OVN_MCAST_UNKNOWN_TUNNEL_KEY (OVN_MCAST_FLOOD_TUNNEL_KEY - 1)
>>> +#define OVN_MIN_MULTICAST 32768
>>> +#define OVN_MAX_MULTICAST 65535
>>> +
>>> +enum ovn_mcast_tunnel_keys {
>>> +
>>> +    OVN_MCAST_FLOOD_TUNNEL_KEY = OVN_MIN_MULTICAST,
>>> +    OVN_MCAST_UNKNOWN_TUNNEL_KEY,
>>> +    OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY,
>>> +    OVN_MIN_IP_MULTICAST,
>>> +    OVN_MAX_IP_MULTICAST = OVN_MAX_MULTICAST,
>>> +};
>>>
>>>  struct ovsdb_idl_index *mcast_group_index_create(struct ovsdb_idl *);
>>>  const struct sbrec_multicast_group *
>>> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
>>> index 6d2fbe3..d45bb15 100644
>>> --- a/northd/ovn-northd.8.xml
>>> +++ b/northd/ovn-northd.8.xml
>>> @@ -947,10 +947,40 @@ output;
>>>
>>>      <ul>
>>>        <li>
>>> -        A priority-100 flow that outputs all packets with an Ethernet broadcast
>>> +        A priority-100 flow that punts all IGMP packets to
>>> +        <code>ovn-controller</code> if IGMP snooping is enabled on the
>>> +        logical switch.
>>> +      </li>
>>> +
>>> +      <li>
>>> +        Priority-90 flows that forward registered IP multicast traffic to
>>> +        their corresponding multicast group, which <code>ovn-northd</code>
>>> +        creates based on learnt <ref table="IGMP_Group" db="OVN_Southbound"/>
>>> +        entries.  The flows also forward packets to the
>>> +        <code>MC_MROUTER_FLOOD</code> multicast group, which
>>> +        <code>ovn-northd</code> populates with all the logical ports that
>>> +        are connected to logical routers with
>>> +        <ref column="options" table="Logical_Router"/>:mcast_relay='true'.
>>> +      </li>
>>> +
>>> +      <li>
>>> +        A priority-85 flow that forwards all IP multicast traffic destined to
>>> +        224.0.0.X to the <code>MC_FLOOD</code> multicast group, which
>>> +        <code>ovn-northd</code> populates with all enabled logical ports.
>>> +      </li>
>>> +
>>> +      <li>
>>> +        A priority-80 flow that forwards all unregistered IP multicast traffic
>>> +        to the <code>MC_MROUTER_FLOOD</code> multicast group, if any.
>>> +        Otherwise the flow drops all unregistered IP multicast packets.  This
>>> +        flow is added only if <ref column="other_config"
>>> +        table="Logical_Switch"/>:mcast_flood_unregistered='false'.
>>> +      </li>
>>> +
>>> +      <li>
>>> +        A priority-70 flow that outputs all packets with an Ethernet broadcast
>>>          or multicast <code>eth.dst</code> to the <code>MC_FLOOD</code>
>>> -        multicast group, which <code>ovn-northd</code> populates with all
>>> -        enabled logical ports.
>>> +        multicast group.
>>>        </li>
>>>
>>>        <li>
>>> @@ -1228,6 +1258,14 @@ output;
>>>
>>>        <li>
>>>          <p>
>>> +          A priority-95 flow allows IP multicast traffic if
>>> +          <ref column="options" table="Logical_Router"/>:mcast_relay='true',
>>> +          otherwise drops it.
>>> +        </p>
>>> +      </li>
>>> +
>>> +      <li>
>>> +        <p>
>>>            ICMP echo reply.  These flows reply to ICMP echo requests received
>>>            for the router's IP address.  Let <var>A</var> be an IP address
>>>            owned by a router port.  Then, for each <var>A</var> that is
>>> @@ -1941,6 +1979,16 @@ output;
>>>      <ul>
>>>        <li>
>>>          <p>
>>> +          Priority-500 flows that match IP multicast traffic destined to
>>> +          groups registered on any of the attached switches and set
>>> +          <code>outport</code> to the associated multicast group that will
>>> +          eventually flood the traffic to all interested attached logical
>>> +          switches. The flows also decrement TTL.
>>> +        </p>
>>> +      </li>
>>> +
>>> +      <li>
>>> +        <p>
>>>            For distributed logical routers where one of the logical router
>>>            ports specifies a <code>redirect-chassis</code>, a priority-400
>>>            logical flow for each ip source/destination couple that matches the
>>> @@ -2074,6 +2122,15 @@ next;
>>>      <ul>
>>>        <li>
>>>          <p>
>>> +          A priority-500 flow that matches IP multicast traffic that was
>>> +          allowed in the routing pipeline. For this kind of traffic the
>>> +          <code>outport</code> was already set so the flow just advances to
>>> +          the next table.
>>> +        </p>
>>> +      </li>
>>> +
>>> +      <li>
>>> +        <p>
>>>            For distributed logical routers where one of the logical router
>>>            ports specifies a <code>redirect-chassis</code>, a priority-400
>>>            logical flow with match <code>REGBIT_DISTRIBUTED_NAT == 1</code>
>>> @@ -2641,9 +2698,19 @@ clone {
>>>      <h3>Egress Table 3: Delivery</h3>
>>>
>>>      <p>
>>> -      Packets that reach this table are ready for delivery.  It contains
>>> -      priority-100 logical flows that match packets on each enabled logical
>>> -      router port, with action <code>output;</code>.
>>> +      Packets that reach this table are ready for delivery.  It contains:
>>> +      <ul>
>>> +        <li>
>>> +          Priority-110 logical flows that match IP multicast packets on each
>>> +          enabled logical router port and modify the Ethernet source address
>>> +          of the packets to the Ethernet address of the port and then execute
>>> +          action <code>output;</code>.
>>> +        </li>
>>> +        <li>
>>> +          Priority-100 logical flows that match packets on each enabled
>>> +          logical router port, with action <code>output;</code>.
>>> +        </li>
>>> +      </ul>
>>>      </p>
>>>
>>>  </manpage>
>>> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
>>> index e861344..592d3b9 100644
>>> --- a/northd/ovn-northd.c
>>> +++ b/northd/ovn-northd.c
>>> @@ -433,32 +433,52 @@ struct ipam_info {
>>>      bool mac_only;
>>>  };
>>>
>>> -#define OVN_MIN_MULTICAST 32768
>>> -#define OVN_MAX_MULTICAST OVN_MCAST_FLOOD_TUNNEL_KEY
>>> -BUILD_ASSERT_DECL(OVN_MIN_MULTICAST < OVN_MAX_MULTICAST);
>>> -
>>> -#define OVN_MIN_IP_MULTICAST OVN_MIN_MULTICAST
>>> -#define OVN_MAX_IP_MULTICAST (OVN_MCAST_UNKNOWN_TUNNEL_KEY - 1)
>>> -BUILD_ASSERT_DECL(OVN_MAX_IP_MULTICAST >= OVN_MIN_MULTICAST);
>>> -
>>>  /*
>>>   * Multicast snooping and querier per datapath configuration.
>>>   */
>>> +struct mcast_switch_info {
>>> +
>>> +    bool enabled;               /* True if snooping enabled. */
>>> +    bool querier;               /* True if querier enabled. */
>>> +    bool flood_unregistered;    /* True if unregistered multicast should be
>>> +                                 * flooded.
>>> +                                 */
>>> +    bool flood_relay;           /* True if the switch is connected to a
>>> +                                 * multicast router and unregistered multicast
>>> +                                 * should be flooded to the mrouter. Only
>>> +                                 * applicable if flood_unregistered == false.
>>> +                                 */
>>> +
>>> +    int64_t table_size;         /* Max number of IP multicast groups. */
>>> +    int64_t idle_timeout;       /* Timeout after which an idle group is
>>> +                                 * flushed.
>>> +                                 */
>>> +    int64_t query_interval;     /* Interval between multicast queries. */
>>> +    char *eth_src;              /* ETH src address of the multicast queries. */
>>> +    char *ipv4_src;             /* IP src address of the multicast queries. */
>>> +    int64_t query_max_response; /* Expected time after which reports should
>>> +                                 * be received for queries that were sent out.
>>> +                                 */
>>> +
>>> +    uint32_t active_flows;      /* Current number of active IP multicast
>>> +                                 * flows.
>>> +                                 */
>>> +};
>>> +
>>> +struct mcast_router_info {
>>> +    bool relay; /* True if the router should relay IP multicast. */
>>> +};
>>> +
>>>  struct mcast_info {
>>> -    bool enabled;
>>> -    bool querier;
>>> -    bool flood_unregistered;
>>> -
>>> -    int64_t table_size;
>>> -    int64_t idle_timeout;
>>> -    int64_t query_interval;
>>> -    char *eth_src;
>>> -    char *ipv4_src;
>>> -    int64_t  query_max_response;
>>> -
>>> -    struct hmap group_tnlids;
>>> -    uint32_t group_tnlid_hint;
>>> -    uint32_t active_flows;
>>> +
>>> +    struct hmap group_tnlids;  /* Group tunnel IDs in use on this DP. */
>>> +    uint32_t group_tnlid_hint; /* Hint for allocating next group tunnel ID. */
>>> +    struct ovs_list groups;    /* List of groups learnt on this DP. */
>>> +
>>> +    union {
>>> +        struct mcast_switch_info sw;  /* Switch specific multicast info. */
>>> +        struct mcast_router_info rtr; /* Router specific multicast info. */
>>> +    };
>>>  };
>>>
>>>  static uint32_t
>>> @@ -559,6 +579,7 @@ ovn_datapath_create(struct hmap *datapaths, const struct uuid *key,
>>>  }
>>>
>>>  static void ovn_ls_port_group_destroy(struct hmap *nb_pgs);
>>> +static void destroy_mcast_info_for_datapath(struct ovn_datapath *od);
>>>
>>>  static void
>>>  ovn_datapath_destroy(struct hmap *datapaths, struct ovn_datapath *od)
>>> @@ -572,12 +593,7 @@ ovn_datapath_destroy(struct hmap *datapaths, struct ovn_datapath *od)
>>>          bitmap_free(od->ipam_info.allocated_ipv4s);
>>>          free(od->router_ports);
>>>          ovn_ls_port_group_destroy(&od->nb_pgs);
>>> -
>>> -        if (od->nbs) {
>>> -            free(od->mcast_info.eth_src);
>>> -            free(od->mcast_info.ipv4_src);
>>> -            destroy_tnlids(&od->mcast_info.group_tnlids);
>>> -        }
>>> +        destroy_mcast_info_for_datapath(od);
>>>
>>>          free(od);
>>>      }
>>> @@ -714,23 +730,28 @@ init_ipam_info_for_datapath(struct ovn_datapath *od)
>>>  }
>>>
>>>  static void
>>> -init_mcast_info_for_datapath(struct ovn_datapath *od)
>>> +init_mcast_info_for_router_datapath(struct ovn_datapath *od)
>>>  {
>>> -    if (!od->nbs) {
>>> -        return;
>>> -    }
>>> +    struct mcast_router_info *mcast_rtr_info = &od->mcast_info.rtr;
>>>
>>> -    struct mcast_info *mcast_info = &od->mcast_info;
>>> +    mcast_rtr_info->relay = smap_get_bool(&od->nbr->options, "mcast_relay",
>>> +                                          false);
>>> +}
>>>
>>> -    mcast_info->enabled =
>>> +static void
>>> +init_mcast_info_for_switch_datapath(struct ovn_datapath *od)
>>> +{
>>> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>>> +
>>> +    mcast_sw_info->enabled =
>>>          smap_get_bool(&od->nbs->other_config, "mcast_snoop", false);
>>> -    mcast_info->querier =
>>> +    mcast_sw_info->querier =
>>>          smap_get_bool(&od->nbs->other_config, "mcast_querier", true);
>>> -    mcast_info->flood_unregistered =
>>> +    mcast_sw_info->flood_unregistered =
>>>          smap_get_bool(&od->nbs->other_config, "mcast_flood_unregistered",
>>>                        false);
>>>
>>> -    mcast_info->table_size =
>>> +    mcast_sw_info->table_size =
>>>          smap_get_ullong(&od->nbs->other_config, "mcast_table_size",
>>>                          OVN_MCAST_DEFAULT_MAX_ENTRIES);
>>>
>>> @@ -742,54 +763,94 @@ init_mcast_info_for_datapath(struct ovn_datapath *od)
>>>      } else if (idle_timeout > OVN_MCAST_MAX_IDLE_TIMEOUT_S) {
>>>          idle_timeout = OVN_MCAST_MAX_IDLE_TIMEOUT_S;
>>>      }
>>> -    mcast_info->idle_timeout = idle_timeout;
>>> +    mcast_sw_info->idle_timeout = idle_timeout;
>>>
>>>      uint32_t query_interval =
>>>          smap_get_ullong(&od->nbs->other_config, "mcast_query_interval",
>>> -                        mcast_info->idle_timeout / 2);
>>> +                        mcast_sw_info->idle_timeout / 2);
>>>      if (query_interval < OVN_MCAST_MIN_QUERY_INTERVAL_S) {
>>>          query_interval = OVN_MCAST_MIN_QUERY_INTERVAL_S;
>>>      } else if (query_interval > OVN_MCAST_MAX_QUERY_INTERVAL_S) {
>>>          query_interval = OVN_MCAST_MAX_QUERY_INTERVAL_S;
>>>      }
>>> -    mcast_info->query_interval = query_interval;
>>> +    mcast_sw_info->query_interval = query_interval;
>>>
>>> -    mcast_info->eth_src =
>>> +    mcast_sw_info->eth_src =
>>>          nullable_xstrdup(smap_get(&od->nbs->other_config, "mcast_eth_src"));
>>> -    mcast_info->ipv4_src =
>>> +    mcast_sw_info->ipv4_src =
>>>          nullable_xstrdup(smap_get(&od->nbs->other_config, "mcast_ip4_src"));
>>>
>>> -    mcast_info->query_max_response =
>>> +    mcast_sw_info->query_max_response =
>>>          smap_get_ullong(&od->nbs->other_config, "mcast_query_max_response",
>>>                          OVN_MCAST_DEFAULT_QUERY_MAX_RESPONSE_S);
>>>
>>> -    hmap_init(&mcast_info->group_tnlids);
>>> -    mcast_info->group_tnlid_hint = OVN_MIN_IP_MULTICAST;
>>> -    mcast_info->active_flows = 0;
>>> +    mcast_sw_info->active_flows = 0;
>>> +}
>>> +
>>> +static void
>>> +init_mcast_info_for_datapath(struct ovn_datapath *od)
>>> +{
>>> +    if (!od->nbr && !od->nbs) {
>>> +        return;
>>> +    }
>>> +
>>> +    hmap_init(&od->mcast_info.group_tnlids);
>>> +    od->mcast_info.group_tnlid_hint = OVN_MIN_IP_MULTICAST;
>>> +    ovs_list_init(&od->mcast_info.groups);
>>> +
>>> +    if (od->nbs) {
>>> +        init_mcast_info_for_switch_datapath(od);
>>> +    } else {
>>> +        init_mcast_info_for_router_datapath(od);
>>> +    }
>>> +}
>>> +
>>> +static void
>>> +destroy_mcast_info_for_switch_datapath(struct ovn_datapath *od)
>>> +{
>>> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>>> +
>>> +    free(mcast_sw_info->eth_src);
>>> +    free(mcast_sw_info->ipv4_src);
>>> +}
>>> +
>>> +static void
>>> +destroy_mcast_info_for_datapath(struct ovn_datapath *od)
>>> +{
>>> +    if (!od->nbr && !od->nbs) {
>>> +        return;
>>> +    }
>>> +
>>> +    if (od->nbs) {
>>> +        destroy_mcast_info_for_switch_datapath(od);
>>> +    }
>>> +
>>> +    destroy_tnlids(&od->mcast_info.group_tnlids);
>>>  }
>>>
>>>  static void
>>> -store_mcast_info_for_datapath(const struct sbrec_ip_multicast *sb,
>>> -                              struct ovn_datapath *od)
>>> +store_mcast_info_for_switch_datapath(const struct sbrec_ip_multicast *sb,
>>> +                                     struct ovn_datapath *od)
>>>  {
>>> -    struct mcast_info *mcast_info = &od->mcast_info;
>>> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>>>
>>>      sbrec_ip_multicast_set_datapath(sb, od->sb);
>>> -    sbrec_ip_multicast_set_enabled(sb, &mcast_info->enabled, 1);
>>> -    sbrec_ip_multicast_set_querier(sb, &mcast_info->querier, 1);
>>> -    sbrec_ip_multicast_set_table_size(sb, &mcast_info->table_size, 1);
>>> -    sbrec_ip_multicast_set_idle_timeout(sb, &mcast_info->idle_timeout, 1);
>>> +    sbrec_ip_multicast_set_enabled(sb, &mcast_sw_info->enabled, 1);
>>> +    sbrec_ip_multicast_set_querier(sb, &mcast_sw_info->querier, 1);
>>> +    sbrec_ip_multicast_set_table_size(sb, &mcast_sw_info->table_size, 1);
>>> +    sbrec_ip_multicast_set_idle_timeout(sb, &mcast_sw_info->idle_timeout, 1);
>>>      sbrec_ip_multicast_set_query_interval(sb,
>>> -                                          &mcast_info->query_interval, 1);
>>> +                                          &mcast_sw_info->query_interval, 1);
>>>      sbrec_ip_multicast_set_query_max_resp(sb,
>>> -                                          &mcast_info->query_max_response, 1);
>>> +                                          &mcast_sw_info->query_max_response,
>>> +                                          1);
>>>
>>> -    if (mcast_info->eth_src) {
>>> -        sbrec_ip_multicast_set_eth_src(sb, mcast_info->eth_src);
>>> +    if (mcast_sw_info->eth_src) {
>>> +        sbrec_ip_multicast_set_eth_src(sb, mcast_sw_info->eth_src);
>>>      }
>>>
>>> -    if (mcast_info->ipv4_src) {
>>> -        sbrec_ip_multicast_set_ip4_src(sb, mcast_info->ipv4_src);
>>> +    if (mcast_sw_info->ipv4_src) {
>>> +        sbrec_ip_multicast_set_ip4_src(sb, mcast_sw_info->ipv4_src);
>>>      }
>>>  }
>>>
>>> @@ -906,6 +967,7 @@ join_datapaths(struct northd_context *ctx, struct hmap *datapaths,
>>>                                       NULL, nbr, NULL);
>>>              ovs_list_push_back(nb_only, &od->list);
>>>          }
>>> +        init_mcast_info_for_datapath(od);
>>>          ovs_list_push_back(lr_list, &od->lr_list);
>>>      }
>>>  }
>>> @@ -1999,6 +2061,13 @@ join_logical_ports(struct northd_context *ctx,
>>>                      break;
>>>                  }
>>>              }
>>> +
>>> +            /* If the router is multicast enabled then set relay on the switch
>>> +             * datapath.
>>> +             */
>>> +            if (peer->od && peer->od->mcast_info.rtr.relay) {
>>> +                op->od->mcast_info.sw.flood_relay = true;
>>> +            }
>>>          } else if (op->nbrp && op->nbrp->peer && !op->derived) {
>>>              struct ovn_port *peer = ovn_port_find(ports, op->nbrp->peer);
>>>              if (peer) {
>>> @@ -2846,6 +2915,10 @@ struct multicast_group {
>>>  static const struct multicast_group mc_flood =
>>>      { MC_FLOOD, OVN_MCAST_FLOOD_TUNNEL_KEY };
>>>
>>> +#define MC_MROUTER_FLOOD "_MC_mrouter_flood"
>>> +static const struct multicast_group mc_mrouter_flood =
>>> +    { MC_MROUTER_FLOOD, OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY };
>>> +
>>>  #define MC_UNKNOWN "_MC_unknown"
>>>  static const struct multicast_group mc_unknown =
>>>      { MC_UNKNOWN, OVN_MCAST_UNKNOWN_TUNNEL_KEY };
>>> @@ -2955,7 +3028,8 @@ ovn_multicast_update_sbrec(const struct ovn_multicast *mc,
>>>   */
>>>  struct ovn_igmp_group_entry {
>>>      struct ovs_list list_node; /* Linkage in the list of entries. */
>>> -    const struct sbrec_igmp_group *sb;
>>> +    size_t n_ports;
>>> +    struct ovn_port **ports;
>>>  };
>>>
>>>  /*
>>> @@ -2964,12 +3038,13 @@ struct ovn_igmp_group_entry {
>>>   */
>>>  struct ovn_igmp_group {
>>>      struct hmap_node hmap_node; /* Index on 'datapath' and 'address'. */
>>> +    struct ovs_list list_node;  /* Linkage in the per-dp igmp group list. */
>>>
>>>      struct ovn_datapath *datapath;
>>>      struct in6_addr address; /* Multicast IPv6-mapped-IPv4 or IPv4 address. */
>>>      struct multicast_group mcgroup;
>>>
>>> -    struct ovs_list sb_entries; /* List of SB entries for this group. */
>>> +    struct ovs_list entries; /* List of SB entries for this group. */
>>>  };
>>>
>>>  static uint32_t
>>> @@ -2997,77 +3072,120 @@ ovn_igmp_group_find(struct hmap *igmp_groups,
>>>      return NULL;
>>>  }
>>>
>>> -static void
>>> +static struct ovn_igmp_group *
>>>  ovn_igmp_group_add(struct northd_context *ctx, struct hmap *igmp_groups,
>>>                     struct ovn_datapath *datapath,
>>> -                   const struct sbrec_igmp_group *sb_igmp_group)
>>> +                   const struct in6_addr *address,
>>> +                   const char *address_s)
>>>  {
>>> -    struct in6_addr group_address;
>>> -    ovs_be32 ipv4;
>>> -
>>> -    if (ip_parse(sb_igmp_group->address, &ipv4)) {
>>> -        group_address = in6_addr_mapped_ipv4(ipv4);
>>> -    } else if (!ipv6_parse(sb_igmp_group->address, &group_address)) {
>>> -        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
>>> -        VLOG_WARN_RL(&rl, "invalid IGMP group address: %s",
>>> -                     sb_igmp_group->address);
>>> -        return;
>>> -    }
>>> -
>>>      struct ovn_igmp_group *igmp_group =
>>> -        ovn_igmp_group_find(igmp_groups, datapath, &group_address);
>>> +        ovn_igmp_group_find(igmp_groups, datapath, address);
>>>
>>>      if (!igmp_group) {
>>>          igmp_group = xmalloc(sizeof *igmp_group);
>>>
>>>          const struct sbrec_multicast_group *mcgroup =
>>> -            mcast_group_lookup(ctx->sbrec_mcast_group_by_name_dp,
>>> -                               sb_igmp_group->address, datapath->sb);
>>> +            mcast_group_lookup(ctx->sbrec_mcast_group_by_name_dp, address_s,
>>> +                               datapath->sb);
>>>
>>>          igmp_group->datapath = datapath;
>>> -        igmp_group->address = group_address;
>>> +        igmp_group->address = *address;
>>>          if (mcgroup) {
>>>              igmp_group->mcgroup.key = mcgroup->tunnel_key;
>>>              add_tnlid(&datapath->mcast_info.group_tnlids, mcgroup->tunnel_key);
>>>          } else {
>>>              igmp_group->mcgroup.key = 0;
>>>          }
>>> -        igmp_group->mcgroup.name = sb_igmp_group->address;
>>> -        ovs_list_init(&igmp_group->sb_entries);
>>> +        igmp_group->mcgroup.name = address_s;
>>> +        ovs_list_init(&igmp_group->entries);
>>>
>>>          hmap_insert(igmp_groups, &igmp_group->hmap_node,
>>> -                    ovn_igmp_group_hash(datapath, &group_address));
>>> +                    ovn_igmp_group_hash(datapath, address));
>>> +        ovs_list_push_back(&datapath->mcast_info.groups,
>>> +                           &igmp_group->list_node);
>>> +    }
>>> +
>>> +    return igmp_group;
>>> +}
>>> +
>>> +static bool
>>> +ovn_igmp_group_get_address(const struct sbrec_igmp_group *sb_igmp_group,
>>> +                           struct in6_addr *address)
>>> +{
>>> +    ovs_be32 ipv4;
>>> +
>>> +    if (ip_parse(sb_igmp_group->address, &ipv4)) {
>>> +        *address = in6_addr_mapped_ipv4(ipv4);
>>> +        return true;
>>> +    }
>>> +    if (!ipv6_parse(sb_igmp_group->address, address)) {
>>> +        return false;
>>>      }
>>> +    return true;
>>> +}
>>>
>>> +static struct ovn_port **
>>> +ovn_igmp_group_get_ports(const struct sbrec_igmp_group *sb_igmp_group,
>>> +                         size_t *n_ports, struct hmap *ovn_ports)
>>> +{
>>> +    struct ovn_port **ports = xmalloc(sb_igmp_group->n_ports * sizeof *ports);
>>> +
>>> +     *n_ports = 0;
>>> +     for (size_t i = 0; i < sb_igmp_group->n_ports; i++) {
>>> +        ports[(*n_ports)] =
>>> +            ovn_port_find(ovn_ports, sb_igmp_group->ports[i]->logical_port);
>>> +        if (ports[(*n_ports)]) {
>>> +            (*n_ports)++;
>>> +        }
>>> +    }
>>> +
>>> +    return ports;
>>> +}
>>> +
>>> +static void
>>> +ovn_igmp_group_add_entry(struct ovn_igmp_group *igmp_group,
>>> +                         struct ovn_port **ports, size_t n_ports)
>>> +{
>>>      struct ovn_igmp_group_entry *entry = xmalloc(sizeof *entry);
>>>
>>> -    entry->sb = sb_igmp_group;
>>> -    ovs_list_push_back(&igmp_group->sb_entries , &entry->list_node);
>>> +    entry->ports = ports;
>>> +    entry->n_ports = n_ports;
>>> +    ovs_list_push_back(&igmp_group->entries, &entry->list_node);
>>> +}
>>> +
>>> +static void
>>> +ovn_igmp_group_destroy_entry(struct ovn_igmp_group_entry *entry)
>>> +{
>>> +    free(entry->ports);
>>> +}
>>> +
>>> +static bool
>>> +ovn_igmp_group_allocate_id(struct ovn_igmp_group *igmp_group)
>>> +{
>>> +    if (igmp_group->mcgroup.key == 0) {
>>> +        struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
>>> +        igmp_group->mcgroup.key = ovn_mcast_group_allocate_key(mcast_info);
>>> +    }
>>> +
>>> +    if (igmp_group->mcgroup.key == 0) {
>>> +        return false;
>>> +    }
>>> +
>>> +    return true;
>>>  }
>>>
>>>  static void
>>>  ovn_igmp_group_aggregate_ports(struct ovn_igmp_group *igmp_group,
>>> -                               struct hmap *ovn_ports,
>>>                                 struct hmap *mcast_groups)
>>>  {
>>>      struct ovn_igmp_group_entry *entry;
>>>
>>> -    LIST_FOR_EACH_POP (entry, list_node, &igmp_group->sb_entries) {
>>> -        size_t n_oports = 0;
>>> -        struct ovn_port **oports =
>>> -            xmalloc(entry->sb->n_ports * sizeof *oports);
>>> -
>>> -        for (size_t i = 0; i < entry->sb->n_ports; i++) {
>>> -            oports[n_oports] =
>>> -                ovn_port_find(ovn_ports, entry->sb->ports[i]->logical_port);
>>> -            if (oports[n_oports]) {
>>> -                n_oports++;
>>> -            }
>>> -        }
>>> -
>>> +    LIST_FOR_EACH_POP (entry, list_node, &igmp_group->entries) {
>>>          ovn_multicast_add_ports(mcast_groups, igmp_group->datapath,
>>> -                                &igmp_group->mcgroup, oports, n_oports);
>>> -        free(oports);
>>> +                                &igmp_group->mcgroup, entry->ports,
>>> +                                entry->n_ports);
>>> +
>>> +        ovn_igmp_group_destroy_entry(entry);
>>>          free(entry);
>>>      }
>>>  }
>>> @@ -3079,10 +3197,12 @@ ovn_igmp_group_destroy(struct hmap *igmp_groups,
>>>      if (igmp_group) {
>>>          struct ovn_igmp_group_entry *entry;
>>>
>>> -        LIST_FOR_EACH_POP (entry, list_node, &igmp_group->sb_entries) {
>>> +        LIST_FOR_EACH_POP (entry, list_node, &igmp_group->entries) {
>>> +            ovn_igmp_group_destroy_entry(entry);
>>>              free(entry);
>>>          }
>>>          hmap_remove(igmp_groups, &igmp_group->hmap_node);
>>> +        ovs_list_remove(&igmp_group->list_node);
>>>          free(igmp_group);
>>>      }
>>>  }
>>> @@ -5286,7 +5406,9 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
>>>              continue;
>>>          }
>>>
>>> -        if (od->mcast_info.enabled) {
>>> +        struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>>> +
>>> +        if (mcast_sw_info->enabled) {
>>>              /* Punt IGMP traffic to controller. */
>>>              ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 100,
>>>                            "ip4 && ip.proto == 2", "igmp;");
>>> @@ -5299,9 +5421,16 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
>>>                            "outport = \""MC_FLOOD"\"; output;");
>>>
>>>              /* Drop unregistered IP multicast if not allowed. */
>>> -            if (!od->mcast_info.flood_unregistered) {
>>> -                ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
>>> -                              "ip4 && ip4.mcast", "drop;");
>>> +            if (!mcast_sw_info->flood_unregistered) {
>>> +                /* Forward unregistered IP multicast to mrouter (if any). */
>>> +                if (mcast_sw_info->flood_relay) {
>>> +                    ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
>>> +                                  "ip4 && ip4.mcast",
>>> +                                  "outport = \""MC_MROUTER_FLOOD"\"; output;");
>>> +                } else {
>>> +                    ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
>>> +                                  "ip4 && ip4.mcast", "drop;");
>>> +                }
>>>              }
>>>          }
>>>
>>> @@ -5318,18 +5447,26 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
>>>              continue;
>>>          }
>>>
>>> -        struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
>>> +        struct mcast_switch_info *mcast_sw_info =
>>> +            &igmp_group->datapath->mcast_info.sw;
>>>
>>> -        if (mcast_info->active_flows >= mcast_info->table_size) {
>>> +        if (mcast_sw_info->active_flows >= mcast_sw_info->table_size) {
>>>              continue;
>>>          }
>>> -        mcast_info->active_flows++;
>>> +        mcast_sw_info->active_flows++;
>>>
>>>          ds_clear(&match);
>>>          ds_clear(&actions);
>>>
>>>          ds_put_format(&match, "eth.mcast && ip4 && ip4.dst == %s ",
>>>                        igmp_group->mcgroup.name);
>>> +        /* Also flood traffic to all multicast routers with relay enabled. */
>>> +        if (mcast_sw_info->flood_relay) {
>>> +            ds_put_cstr(&actions,
>>> +                        "clone { "
>>> +                            "outport = \""MC_MROUTER_FLOOD "\"; output; "
>>> +                        "};");
>>> +        }
>>>          ds_put_format(&actions, "outport = \"%s\"; output; ",
>>>                        igmp_group->mcgroup.name);
>>>
>>> @@ -6209,7 +6346,7 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>>>           * source or destination, and zero network source or destination
>>>           * (priority 100). */
>>>          ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 100,
>>> -                      "ip4.mcast || "
>>> +                      "ip4.src_mcast ||"
>>>                        "ip4.src == 255.255.255.255 || "
>>>                        "ip4.src == 127.0.0.0/8 || "
>>>                        "ip4.dst == 127.0.0.0/8 || "
>>> @@ -6217,6 +6354,16 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>>>                        "ip4.dst == 0.0.0.0/8",
>>>                        "drop;");
>>>
>>> +        /* Allow multicast if relay enabled (priority 95). */
>>> +        ds_clear(&actions);
>>> +        if (od->mcast_info.rtr.relay) {
>>> +            ds_put_cstr(&actions, "next;");
>>> +        } else {
>>> +            ds_put_cstr(&actions, "drop;");
>>> +        }
>>> +        ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 95,
>>> +                      "ip4.mcast", ds_cstr(&actions));
>>> +
>>
>>
>> Instead of the if/else, how about:
>> ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 95, "ip4.mcast", od->mcast_info.rtr.relay ? "next;" : "drop;");
>>
>>
>>>          /* ARP reply handling.  Use ARP replies to populate the logical
>>>           * router's ARP table. */
>>>          ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 90, "arp.op == 2",
>>> @@ -7487,6 +7634,27 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>>>          }
>>>      }
>>>
>>> +    /* IP Multicast lookup. Here we set the output port, adjust TTL and
>>> +     * advance to next table (priority 500).
>>> +     */
>>> +    HMAP_FOR_EACH (od, key_node, datapaths) {
>>> +        if (!od->nbr || !od->mcast_info.rtr.relay) {
>>> +            continue;
>>> +        }
>>> +        struct ovn_igmp_group *igmp_group;
>>> +
>>> +        LIST_FOR_EACH (igmp_group, list_node, &od->mcast_info.groups) {
>>> +            ds_clear(&match);
>>> +            ds_clear(&actions);
>>> +            ds_put_format(&match, "ip4 && ip4.dst == %s ",
>>> +                          igmp_group->mcgroup.name);
>>> +            ds_put_format(&actions, "outport = \"%s\"; ip.ttl--; next;",
>>> +                          igmp_group->mcgroup.name);
>>> +            ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 500,
>>> +                          ds_cstr(&match), ds_cstr(&actions));
>>> +        }
>>> +    }
>>> +
>>>      /* Logical router ingress table 8: Policy.
>>>       *
>>>       * A packet that arrives at this table is an IP packet that should be
>>> @@ -7517,10 +7685,24 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>>>
>>>      /* Local router ingress table 9: ARP Resolution.
>>>       *
>>> -     * Any packet that reaches this table is an IP packet whose next-hop IP
>>> -     * address is in reg0. (ip4.dst is the final destination.) This table
>>> -     * resolves the IP address in reg0 into an output port in outport and an
>>> -     * Ethernet address in eth.dst. */
>>> +     * Multicast packets already have the outport set so just advance to next
>>> +     * table (priority 500). */
>>> +    HMAP_FOR_EACH (od, key_node, datapaths) {
>>> +        if (!od->nbr) {
>>> +            continue;
>>> +        }
>>> +
>>> +        ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_RESOLVE, 500,
>>> +                      "ip4.mcast", "next;");
>>> +    }
>>> +
>>> +    /* Local router ingress table 9: ARP Resolution.
>>> +     *
>>> +     * Any unicast packet that reaches this table is an IP packet whose
>>> +     * next-hop IP address is in reg0. (ip4.dst is the final destination.)
>>> +     * This table resolves the IP address in reg0 into an output port in
>>> +     * outport and an Ethernet address in eth.dst.
>>> +     */
>>>      HMAP_FOR_EACH (op, key_node, ports) {
>>>          if (op->nbsp && !lsp_is_enabled(op->nbsp)) {
>>>              continue;
>>> @@ -8002,9 +8184,13 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>>>          ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_REQUEST, 0, "1", "output;");
>>>      }
>>>
>>> -    /* Logical router egress table 1: Delivery (priority 100).
>>> +    /* Logical router egress table 1: Delivery (priority 100-110).
>>>       *
>>> -     * Priority 100 rules deliver packets to enabled logical ports. */
>>> +     * Priority 100 rules deliver packets to enabled logical ports.
>>> +     * Priority 110 rules match multicast packets and update the source
>>> +     * mac before delivering to enabled logical ports. IP multicast traffic
>>> +     * bypasses S_ROUTER_IN_IP_ROUTING route lookups.
>>> +     */
>>>      HMAP_FOR_EACH (op, key_node, ports) {
>>>          if (!op->nbrp) {
>>>              continue;
>>> @@ -8024,6 +8210,20 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>>>              continue;
>>>          }
>>>
>>> +        /* If multicast relay is enabled then also adjust source mac for IP
>>> +         * multicast traffic.
>>> +         */
>>> +        if (op->od->mcast_info.rtr.relay) {
>>> +            ds_clear(&match);
>>> +            ds_clear(&actions);
>>> +            ds_put_format(&match, "ip4.mcast && outport == %s",
>>> +                          op->json_key);
>>> +            ds_put_format(&actions, "eth.src = %s; output;",
>>> +                          op->lrp_networks.ea_s);
>>> +            ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 110,
>>> +                        ds_cstr(&match), ds_cstr(&actions));
>>> +        }
>>> +
>>>          ds_clear(&match);
>>>          ds_put_format(&match, "outport == %s", op->json_key);
>>>          ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 100,
>>> @@ -8574,7 +8774,7 @@ build_ip_mcast(struct northd_context *ctx, struct hmap *datapaths)
>>>          if (!ip_mcast) {
>>>              ip_mcast = sbrec_ip_multicast_insert(ctx->ovnsb_txn);
>>>          }
>>> -        store_mcast_info_for_datapath(ip_mcast, od);
>>> +        store_mcast_info_for_switch_datapath(ip_mcast, od);
>>>      }
>>>
>>>      /* Delete southbound records without northbound matches. */
>>> @@ -8606,6 +8806,14 @@ build_mcast_groups(struct northd_context *ctx,
>>>
>>>          if (lsp_is_enabled(op->nbsp)) {
>>>              ovn_multicast_add(mcast_groups, &mc_flood, op);
>>> +
>>> +            /* If this port is connected to a multicast router then add it
>>> +             * to the MC_MROUTER_FLOOD group.
>>> +             */
>>> +            if (op->od->mcast_info.sw.flood_relay && op->peer &&
>>> +                    op->peer->od && op->peer->od->mcast_info.rtr.relay) {
>>> +                ovn_multicast_add(mcast_groups, &mc_mrouter_flood, op);
>>> +            }
>>>          }
>>>      }
>>>
>>> @@ -8628,10 +8836,61 @@ build_mcast_groups(struct northd_context *ctx,
>>>              continue;
>>>          }
>>>
>>> +        struct in6_addr group_address;
>>> +        if (!ovn_igmp_group_get_address(sb_igmp, &group_address)) {
>>> +            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
>>> +            VLOG_WARN_RL(&rl, "invalid IGMP group address: %s",
>>> +                         sb_igmp->address);
>>> +            continue;
>>> +        }
>>> +
>>>          /* Add the IGMP group entry. Will also try to allocate an ID for it
>>>           * if the multicast group already exists.
>>>           */
>>> -        ovn_igmp_group_add(ctx, igmp_groups, od, sb_igmp);
>>> +        struct ovn_igmp_group *igmp_group =
>>> +            ovn_igmp_group_add(ctx, igmp_groups, od, &group_address,
>>> +                               sb_igmp->address);
>>> +
>>> +        /* Extract the IGMP group ports from the SB entry and store them
>>> +         * in the IGMP group.
>>> +         */
>>> +        size_t n_igmp_ports;
>>> +        struct ovn_port **igmp_ports =
>>> +            ovn_igmp_group_get_ports(sb_igmp, &n_igmp_ports, ports);
>>> +        ovn_igmp_group_add_entry(igmp_group, igmp_ports, n_igmp_ports);
>>> +    }
>>> +
>>> +    /* Build IGMP groups for multicast routers with relay enabled. The router
>>> +     * IGMP groups are based on the groups learnt by their multicast enabled
>>> +     * peers.
>>> +     */
>>> +    struct ovn_datapath *od;
>>> +    HMAP_FOR_EACH (od, key_node, datapaths) {
>>> +
>>> +        if (ovs_list_is_empty(&od->mcast_info.groups)) {
>>> +            continue;
>>> +        }
>>> +
>>> +        for (size_t i = 0; i < od->n_router_ports; i++) {
>>> +            struct ovn_port *router_port = od->router_ports[i]->peer;
>>> +
>>> +            if (!router_port || !router_port->od ||
>>> +                    !router_port->od->mcast_info.rtr.relay) {
>>> +                continue;
>>> +            }
>>> +
>>> +            struct ovn_igmp_group *igmp_group;
>>> +            LIST_FOR_EACH (igmp_group, list_node, &od->mcast_info.groups) {
>>> +                struct ovn_igmp_group *igmp_group_rtr =
>>> +                    ovn_igmp_group_add(ctx, igmp_groups, router_port->od,
>>> +                                       &igmp_group->address,
>>> +                                       igmp_group->mcgroup.name);
>>> +                struct ovn_port **router_igmp_ports =
>>> +                    xmalloc(sizeof *router_igmp_ports);
>>> +                router_igmp_ports[0] = router_port;
>>> +                ovn_igmp_group_add_entry(igmp_group_rtr, router_igmp_ports, 1);
>>> +            }
>>> +        }
>>>      }
>>>
>>>      /* Walk the aggregated IGMP groups and allocate IDs for new entries.
>>> @@ -8639,21 +8898,17 @@ build_mcast_groups(struct northd_context *ctx,
>>>       */
>>>      struct ovn_igmp_group *igmp_group, *igmp_group_next;
>>>      HMAP_FOR_EACH_SAFE (igmp_group, igmp_group_next, hmap_node, igmp_groups) {
>>> -        if (igmp_group->mcgroup.key == 0) {
>>> -            struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
>>> -            igmp_group->mcgroup.key = ovn_mcast_group_allocate_key(mcast_info);
>>> -        }
>>>
>>> -        /* If we ran out of keys just destroy the entry. */
>>> -        if (igmp_group->mcgroup.key == 0) {
>>> +        if (!ovn_igmp_group_allocate_id(igmp_group)) {
>>> +            /* If we ran out of keys just destroy the entry. */
>>>              ovn_igmp_group_destroy(igmp_groups, igmp_group);
>>>              continue;
>>>          }
>>>
>>> -        /* Aggregate the ports from all SB entries corresponding to this
>>> +        /* Aggregate the ports from all entries corresponding to this
>>>           * group.
>>>           */
>>> -        ovn_igmp_group_aggregate_ports(igmp_group, ports, mcast_groups);
>>> +        ovn_igmp_group_aggregate_ports(igmp_group, mcast_groups);
>>>      }
>>>  }
>>>
>>> diff --git a/ovn-nb.xml b/ovn-nb.xml
>>> index e166190..1f8d751 100644
>>> --- a/ovn-nb.xml
>>> +++ b/ovn-nb.xml
>>> @@ -1527,6 +1527,12 @@
>>>            address.
>>>          </p>
>>>        </column>
>>> +      <column name="options" key="mcast_relay" type='{"type": "boolean"}'>
>>> +        <p>
>>> +          Enables/disables IP multicast relay between logical switches
>>> +          connected to the logical router. Default: False.
>>> +        </p>
>>> +      </column>
>>>      </group>
>>>
>>>      <group title="Common Columns">
>>> diff --git a/ovn-sb.xml b/ovn-sb.xml
>>> index 17c45bb..02691bb 100644
>>> --- a/ovn-sb.xml
>>> +++ b/ovn-sb.xml
>>> @@ -1017,6 +1017,8 @@
>>>          <li><code>eth.mcast</code> expands to <code>eth.dst[40]</code></li>
>>>          <li><code>vlan.present</code> expands to <code>vlan.tci[12]</code></li>
>>>          <li><code>ip4</code> expands to <code>eth.type == 0x800</code></li>
>>> +        <li><code>ip4.src_mcast</code> expands to
>>> +        <code>ip4.src[28..31] == 0xe</code></li>
>>>          <li><code>ip4.mcast</code> expands to <code>ip4.dst[28..31] == 0xe</code></li>
>>>          <li><code>ip6</code> expands to <code>eth.type == 0x86dd</code></li>
>>>          <li><code>ip</code> expands to <code>ip4 || ip6</code></li>
>>> diff --git a/tests/ovn.at b/tests/ovn.at
>>> index 71eb390..52c044c 100644
>>> --- a/tests/ovn.at
>>> +++ b/tests/ovn.at
>>> @@ -14721,12 +14721,12 @@ AT_CHECK([ovn-sbctl get controller_event $uuid seq_num], [0], [dnl
>>>  OVN_CLEANUP([hv1], [hv2])
>>>  AT_CLEANUP
>>>
>>> -AT_SETUP([ovn -- IGMP snoop/querier])
>>> +AT_SETUP([ovn -- IGMP snoop/querier/relay])
>>>  AT_SKIP_IF([test $HAVE_PYTHON = no])
>>>  ovn_start
>>>
>>>  # Logical network:
>>> -# Two independent logical switches (sw1 and sw2).
>>> +# Three logical switches (sw1-sw3) connected to a logical router (rtr).
>>>  # sw1:
>>>  #   - subnet 10.0.0.0/8
>>>  #   - 2 ports bound on hv1 (sw1-p11, sw1-p12)
>>> @@ -14736,6 +14736,10 @@ ovn_start
>>>  #   - 1 port bound on hv1 (sw2-p1)
>>>  #   - 1 port bound on hv2 (sw2-p2)
>>>  #   - IGMP Querier from 20.0.0.254
>>> +# sw3:
>>> +#   - subnet 30.0.0.0/8
>>> +#   - 1 port bound on hv1 (sw3-p1)
>>> +#   - 1 port bound on hv2 (sw3-p2)
>>>
>>>  reset_pcap_file() {
>>>      local iface=$1
>>> @@ -14812,29 +14816,47 @@ store_igmp_v3_query() {
>>>  }
>>>
>>>  #
>>> -# send_ip_multicast_pkt INPORT HV ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN
>>> -#    IP_PROTO DATA OUTFILE
>>> +# send_ip_multicast_pkt INPORT HV ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN TTL
>>> +#    IP_CHKSUM IP_PROTO DATA
>>>  #
>>>  # This shell function causes an IP multicast packet to be received on INPORT
>>>  # of HV.
>>>  # The hexdump of the packet is stored in OUTFILE.
>>>  #
>>>  send_ip_multicast_pkt() {
>>> -    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ip_src=$5 ip_dst=$6
>>> -    local ip_len=$7 ip_chksum=$8 proto=$9 data=${10} outfile=${11}
>>> -
>>> -    local ip_ttl=20
>>> +    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4
>>> +    local ip_src=$5 ip_dst=$6 ip_len=$7 ip_ttl=$8 ip_chksum=$9 proto=${10}
>>> +    local data=${11}
>>>
>>>      local eth=${eth_dst}${eth_src}0800
>>>      local ip=450000${ip_len}95f14000${ip_ttl}${proto}${ip_chksum}${ip_src}${ip_dst}
>>>      local packet=${eth}${ip}${data}
>>>
>>>      as $hv ovs-appctl netdev-dummy/receive ${inport} ${packet}
>>> +}
>>> +
>>> +#
>>> +# store_ip_multicast_pkt ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN TTL
>>> +#    IP_CHKSUM IP_PROTO DATA OUTFILE
>>> +#
>>> +# This shell function builds an IP multicast packet and stores the hexdump of
>>> +# the packet in OUTFILE.
>>> +#
>>> +store_ip_multicast_pkt() {
>>> +    local eth_src=$1 eth_dst=$2
>>> +    local ip_src=$3 ip_dst=$4 ip_len=$5 ip_ttl=$6 ip_chksum=$7 proto=$8
>>> +    local data=$9 outfile=${10}
>>> +
>>> +    local eth=${eth_dst}${eth_src}0800
>>> +    local ip=450000${ip_len}95f14000${ip_ttl}${proto}${ip_chksum}${ip_src}${ip_dst}
>>> +    local packet=${eth}${ip}${data}
>>> +
>>>      echo ${packet} >> ${outfile}
>>>  }
>>>
>>>  ovn-nbctl ls-add sw1
>>>  ovn-nbctl ls-add sw2
>>> +ovn-nbctl ls-add sw3
>>>
>>>  ovn-nbctl lsp-add sw1 sw1-p11
>>>  ovn-nbctl lsp-add sw1 sw1-p12
>>> @@ -14842,6 +14864,26 @@ ovn-nbctl lsp-add sw1 sw1-p21
>>>  ovn-nbctl lsp-add sw1 sw1-p22
>>>  ovn-nbctl lsp-add sw2 sw2-p1
>>>  ovn-nbctl lsp-add sw2 sw2-p2
>>> +ovn-nbctl lsp-add sw3 sw3-p1
>>> +ovn-nbctl lsp-add sw3 sw3-p2
>>> +
>>> +ovn-nbctl lr-add rtr
>>> +ovn-nbctl lrp-add rtr rtr-sw1 00:00:00:00:01:00 10.0.0.254/24
>>> +ovn-nbctl lrp-add rtr rtr-sw2 00:00:00:00:02:00 20.0.0.254/24
>>> +ovn-nbctl lrp-add rtr rtr-sw3 00:00:00:00:03:00 30.0.0.254/24
>>> +
>>> +ovn-nbctl lsp-add sw1 sw1-rtr                      \
>>> +    -- lsp-set-type sw1-rtr router                 \
>>> +    -- lsp-set-addresses sw1-rtr 00:00:00:00:01:00 \
>>> +    -- lsp-set-options sw1-rtr router-port=rtr-sw1
>>> +ovn-nbctl lsp-add sw2 sw2-rtr                      \
>>> +    -- lsp-set-type sw2-rtr router                 \
>>> +    -- lsp-set-addresses sw2-rtr 00:00:00:00:02:00 \
>>> +    -- lsp-set-options sw2-rtr router-port=rtr-sw2
>>> +ovn-nbctl lsp-add sw3 sw3-rtr                      \
>>> +    -- lsp-set-type sw3-rtr router                 \
>>> +    -- lsp-set-addresses sw3-rtr 00:00:00:00:03:00 \
>>> +    -- lsp-set-options sw3-rtr router-port=rtr-sw3
>>>
>>>  net_add n1
>>>  sim_add hv1
>>> @@ -14863,6 +14905,11 @@ ovs-vsctl -- add-port br-int hv1-vif3 -- \
>>>      options:tx_pcap=hv1/vif3-tx.pcap \
>>>      options:rxq_pcap=hv1/vif3-rx.pcap \
>>>      ofport-request=1
>>> +ovs-vsctl -- add-port br-int hv1-vif4 -- \
>>> +    set interface hv1-vif4 external-ids:iface-id=sw3-p1 \
>>> +    options:tx_pcap=hv1/vif4-tx.pcap \
>>> +    options:rxq_pcap=hv1/vif4-rx.pcap \
>>> +    ofport-request=1
>>>
>>>  sim_add hv2
>>>  as hv2
>>> @@ -14883,12 +14930,18 @@ ovs-vsctl -- add-port br-int hv2-vif3 -- \
>>>      options:tx_pcap=hv2/vif3-tx.pcap \
>>>      options:rxq_pcap=hv2/vif3-rx.pcap \
>>>      ofport-request=1
>>> +ovs-vsctl -- add-port br-int hv2-vif4 -- \
>>> +    set interface hv2-vif4 external-ids:iface-id=sw3-p2 \
>>> +    options:tx_pcap=hv2/vif4-tx.pcap \
>>> +    options:rxq_pcap=hv2/vif4-rx.pcap \
>>> +    ofport-request=1
>>>
>>>  OVN_POPULATE_ARP
>>>
>>>  # Enable IGMP snooping on sw1.
>>> -ovn-nbctl set Logical_Switch sw1 other_config:mcast_querier="false"
>>> -ovn-nbctl set Logical_Switch sw1 other_config:mcast_snoop="true"
>>> +ovn-nbctl set Logical_Switch sw1       \
>>> +    other_config:mcast_querier="false" \
>>> +    other_config:mcast_snoop="true"
>>>
>>>  # No IGMP query should be generated by sw1 (mcast_querier="false").
>>>  truncate -s 0 expected
>>> @@ -14921,9 +14974,12 @@ truncate -s 0 expected
>>>  truncate -s 0 expected_empty
>>>  send_ip_multicast_pkt hv1-vif2 hv1 \
>>>      000000000001 01005e000144 \
>>> -    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e ca70 11 \
>>> -    e518e518000a3b3a0000 \
>>> -    expected
>>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>>> +    e518e518000a3b3a0000
>>> +store_ip_multicast_pkt \
>>> +    000000000001 01005e000144 \
>>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>>> +    e518e518000a3b3a0000 expected
>>>
>>>  OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected])
>>>  OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected])
>>> @@ -14944,17 +15000,19 @@ OVS_WAIT_UNTIL([
>>>      test "${total_entries}" = "1"
>>>  ])
>>>
>>> -# Send traffic traffic and make sure it gets forwarded only on the port that
>>> -# joined.
>>> +# Send traffic and make sure it gets forwarded only on the port that joined.
>>>  as hv1 reset_pcap_file hv1-vif1 hv1/vif1
>>>  as hv2 reset_pcap_file hv2-vif1 hv2/vif1
>>>  truncate -s 0 expected
>>>  truncate -s 0 expected_empty
>>>  send_ip_multicast_pkt hv1-vif2 hv1 \
>>>      000000000001 01005e000144 \
>>> -    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e ca70 11 \
>>> -    e518e518000a3b3a0000 \
>>> -    expected
>>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>>> +    e518e518000a3b3a0000
>>> +store_ip_multicast_pkt \
>>> +    000000000001 01005e000144 \
>>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>>> +    e518e518000a3b3a0000 expected
>>>
>>>  OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_empty])
>>>  OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected])
>>> @@ -14988,6 +15046,111 @@ sleep 1
>>>  OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected])
>>>  OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected])
>>>
>>> +# Disable IGMP querier on sw2.
>>> +ovn-nbctl set Logical_Switch sw2 \
>>> +    other_config:mcast_querier="false"
>>> +
>>> +# Enable IGMP snooping on sw3.
>>> +ovn-nbctl set Logical_Switch sw3       \
>>> +    other_config:mcast_querier="false" \
>>> +    other_config:mcast_snoop="true"
>>> +
>>> +# Send traffic from sw3 and make sure rtr doesn't relay it.
>>> +truncate -s 0 expected_empty
>>> +
>>> +as hv1 reset_pcap_file hv1-vif1 hv1/vif1
>>> +as hv1 reset_pcap_file hv1-vif2 hv1/vif2
>>> +as hv1 reset_pcap_file hv1-vif3 hv1/vif3
>>> +as hv1 reset_pcap_file hv1-vif4 hv1/vif4
>>> +as hv2 reset_pcap_file hv2-vif1 hv2/vif1
>>> +as hv2 reset_pcap_file hv2-vif2 hv2/vif2
>>> +as hv2 reset_pcap_file hv2-vif3 hv2/vif3
>>> +as hv2 reset_pcap_file hv2-vif4 hv2/vif4
>>> +
>>> +send_ip_multicast_pkt hv2-vif4 hv2 \
>>> +    000000000001 01005e000144 \
>>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>>> +    e518e518000a3b3a0000
>>> +
>>> +# Sleep a bit to make sure no traffic is received and then check.
>>> +sleep 1
>>> +OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_empty])
>>> +OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected_empty])
>>> +OVN_CHECK_PACKETS([hv1/vif4-tx.pcap], [expected_empty])
>>> +OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [expected_empty])
>>> +OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected_empty])
>>> +OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected_empty])
>>> +OVN_CHECK_PACKETS([hv2/vif2-tx.pcap], [expected_empty])
>>> +OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [expected_empty])
>>> +
>>> +# Enable IGMP relay on rtr
>>> +ovn-nbctl set logical_router rtr \
>>> +    options:mcast_relay="true"
>>> +
>>> +# Inject IGMP Join for 239.0.1.68 on sw1-p11.
>>> +send_igmp_v3_report hv1-vif1 hv1 \
>>> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
>>> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
>>> +    /dev/null
>>> +# Inject IGMP Join for 239.0.1.68 on sw2-p2.
>>> +send_igmp_v3_report hv2-vif3 hv2 \
>>> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
>>> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
>>> +    /dev/null
>>> +# Inject IGMP Join for 239.0.1.68 on sw3-p1.
>>> +send_igmp_v3_report hv1-vif4 hv1 \
>>> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
>>> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
>>> +    /dev/null
>>> +
>>> +# Check that the IGMP Group is learned by all switches.
>>> +OVS_WAIT_UNTIL([
>>> +    total_entries=`ovn-sbctl find IGMP_Group | grep "239.0.1.68" | wc -l`
>>> +    test "${total_entries}" = "3"
>>> +])
>>> +
>>> +# Send traffic from sw3 and make sure it is relayed by rtr to the
>>> +# switches and ports that joined.
>>> +truncate -s 0 expected_routed_sw1
>>> +truncate -s 0 expected_routed_sw2
>>> +truncate -s 0 expected_switched
>>> +truncate -s 0 expected_empty
>>> +
>>> +as hv1 reset_pcap_file hv1-vif1 hv1/vif1
>>> +as hv1 reset_pcap_file hv1-vif2 hv1/vif2
>>> +as hv1 reset_pcap_file hv1-vif3 hv1/vif3
>>> +as hv1 reset_pcap_file hv1-vif4 hv1/vif4
>>> +as hv2 reset_pcap_file hv2-vif1 hv2/vif1
>>> +as hv2 reset_pcap_file hv2-vif2 hv2/vif2
>>> +as hv2 reset_pcap_file hv2-vif3 hv2/vif3
>>> +as hv2 reset_pcap_file hv2-vif4 hv2/vif4
>>> +
>>> +send_ip_multicast_pkt hv2-vif4 hv2 \
>>> +    000000000001 01005e000144 \
>>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>>> +    e518e518000a3b3a0000
>>> +store_ip_multicast_pkt \
>>> +    000000000100 01005e000144 \
>>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 1f cb70 11 \
>>> +    e518e518000a3b3a0000 expected_routed_sw1
>>> +store_ip_multicast_pkt \
>>> +    000000000200 01005e000144 \
>>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 1f cb70 11 \
>>> +    e518e518000a3b3a0000 expected_routed_sw2
>>> +store_ip_multicast_pkt \
>>> +    000000000001 01005e000144 \
>>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>>> +    e518e518000a3b3a0000 expected_switched
>>> +
>>> +OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_routed_sw1])
>>> +OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected_routed_sw2])
>>> +OVN_CHECK_PACKETS([hv1/vif4-tx.pcap], [expected_switched])
>>> +OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [expected_empty])
>>> +OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected_empty])
>>> +OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected_empty])
>>> +OVN_CHECK_PACKETS([hv2/vif2-tx.pcap], [expected_empty])
>>> +OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [expected_empty])
>>> +
>>>  OVN_CLEANUP([hv1], [hv2])
>>>  AT_CLEANUP
>>>
>>> --
>>> 1.8.3.1
>>>
>>> _______________________________________________
>>> dev mailing list
>>> dev at openvswitch.org
>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev


More information about the dev mailing list