[ovs-dev] [PATCH RFC v3 4/4] dpif-netdev: Time based output batching.

Jan Scheurich jan.scheurich at ericsson.com
Mon Aug 14 13:12:21 UTC 2017


> > From earlier in-house trials we know we need to target flush times of 50
> > us or less, so we clearly need better time resolution. Sub-ms timing in PMD
> > should be based on TSC cycles, which are already kept in the pmd struct.
> > Could you provide a corresponding patch for performance testing?
> 
> I don't think that TSC is suitable in this case. Some reasons:
> 
> * non-PMD threads are able to float across cpu cores.
> * Turbo-boost can be enabled or frequency can be adjusted manually after
> DPDK init.
> * TSC cycles only calculated if DPDK enabled.
> 
> TSC is used currently only for not really precise statistics.
> For the real features we need more accurate time accounting.
> 
> I believe that CLOCK_MONOTONIC is able to provide at least microsecond
> granularity on the most of systems. We just need to add one more wrapper
> function like 'time_usec()' to the lib/timeval.
> 

We have tested the effect of turbo mode on TSC and found none: the TSC keeps ticking at the nominal clock rate, regardless of whether the core is clocked down or up. So I believe that for PMD threads (where performance matters) TSC would be an adequate and efficient clock.
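For reference, the invariant-TSC property can also be checked programmatically on x86 via CPUID leaf 0x80000007, EDX bit 8. A minimal sketch (assuming GCC/Clang on x86_64; not part of the patch below):

#include <cpuid.h>
#include <stdio.h>

int
main(void)
{
    unsigned int eax, ebx, ecx, edx;

    /* CPUID.80000007H:EDX[8] is the "Invariant TSC" flag: if set, the
     * TSC ticks at a constant rate across P-/T-state changes, including
     * turbo transitions. */
    if (__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx)) {
        printf("Invariant TSC: %s\n", (edx & (1u << 8)) ? "yes" : "no");
    } else {
        printf("CPUID leaf 0x80000007 not supported\n");
    }
    return 0;
}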

On PMDs I am a bit concerned about the overhead/latency introduced by the clock_gettime() system call, but I haven't done any measurements to check the actual impact. Have you?
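The relative cost is easy to measure, though. A quick microbenchmark sketch, with the iteration count and the direct use of rdtsc purely for illustration:

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <x86intrin.h>

int
main(void)
{
    enum { ITERS = 1000000 };
    struct timespec ts;
    uint64_t start, end;
    int i;

    /* Cost of ITERS back-to-back clock_gettime() calls, in TSC cycles. */
    start = __rdtsc();
    for (i = 0; i < ITERS; i++) {
        clock_gettime(CLOCK_MONOTONIC, &ts);
    }
    end = __rdtsc();

    printf("clock_gettime(CLOCK_MONOTONIC): ~%.1f cycles/call\n",
           (double) (end - start) / ITERS);
    return 0;
}

On Linux, CLOCK_MONOTONIC is usually served from the vDSO rather than via a full syscall, so it should be reasonably cheap, but that is exactly the number we would need to confirm.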

If we go for CLOCK_MONOTONIC in microsecond resolution, we should make sure that the clock is read no more than once per iteration (and cache the us value as 'now' in the pmd ctx struct, as suggested in your other patch). But then, for consistency, the XPS feature should also use the PMD time in us resolution.
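For concreteness, the time_usec() wrapper suggested above would presumably look something like this (a sketch only; the real lib/timeval version would of course plug into its existing clock handling):

#include <time.h>

/* Returns monotonic time in microseconds. */
long long int
time_usec(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (long long int) ts.tv_sec * 1000 * 1000 + ts.tv_nsec / 1000;
}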

For non-PMD threads we could actually skip time-based output batching completely. The packet rates and the frequency of calls to dpif_netdev_run() in the main ovs-vswitchd thread are so low that time-based flushing doesn't seem to make much sense.

Below you can find an alternative incremental patch on top of your RFC 4/4 that uses TSC on PMDs. We will compare the two alternatives for performance, both with non-PMD guests (iperf3) and with PMD guests (DPDK testpmd).

BR, Jan


diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 0d78ae4..8285786 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -265,7 +265,7 @@ struct dp_netdev {
     struct hmap ports;
     struct seq *port_seq;       /* Incremented whenever a port changes. */
 
-    /* The time that a packet can wait in output batch for sending. */
+    /* Time in cycles that a packet can wait in output batch for sending. */
     atomic_uint32_t output_max_latency;
 
     /* Meters. */
@@ -508,7 +508,7 @@ struct tx_port {
     int qid;
     long long last_used;
     struct hmap_node node;
-    long long output_time;
+    long long flush_time;   /* Time in TSC cycles to flush output buffer. */
     struct dp_packet_batch output_pkts;
 };
 
@@ -622,6 +622,7 @@ struct dpif_netdev {
     uint64_t last_port_seq;
 };
 
+static inline unsigned long cycles_per_microsecond(void);
 static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
                               struct dp_netdev_port **portp)
     OVS_REQUIRES(dp->port_mutex);
@@ -2921,11 +2922,12 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
     uint32_t output_max_latency, cur_max_latency;
 
     output_max_latency = smap_get_int(other_config, "output-max-latency",
-                                      DEFAULT_OUTPUT_MAX_LATENCY);
+                                      DEFAULT_OUTPUT_MAX_LATENCY)
+                         * cycles_per_microsecond();
     atomic_read_relaxed(&dp->output_max_latency, &cur_max_latency);
     if (output_max_latency != cur_max_latency) {
         atomic_store_relaxed(&dp->output_max_latency, output_max_latency);
-        VLOG_INFO("Output maximum latency set to %"PRIu32" ms",
+        VLOG_INFO("Output maximum latency set to %"PRIu32" cycles",
                   output_max_latency);
     }
 
@@ -3091,6 +3093,16 @@ cycles_counter(void)
 #endif
 }
 
+static inline unsigned long
+cycles_per_microsecond(void)
+{
+#ifdef DPDK_NETDEV
+    return rte_get_tsc_hz() / (1000 * 1000);
+#else
+    return 0;
+#endif
+}
+
 /* Fake mutex to make sure that the calls to cycles_count_* are balanced */
 extern struct ovs_mutex cycles_counter_fake_mutex;
 
@@ -3171,7 +3183,7 @@ dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
 
     HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
         if (!dp_packet_batch_is_empty(&p->output_pkts)
-            && (force || p->output_time <= now)) {
+            && (force || p->flush_time <= pmd->last_cycles)) {
             output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p, now);
         }
     }
@@ -3196,7 +3208,6 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
 
         batch_cnt = batch.count;
         dp_netdev_input(pmd, &batch, port_no, now);
-        output_cnt = dp_netdev_pmd_flush_output_packets(pmd, now, false);
     } else if (error != EAGAIN && error != EOPNOTSUPP) {
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
 
@@ -3732,7 +3743,6 @@ dpif_netdev_run(struct dpif *dpif)
     struct dp_netdev_pmd_thread *non_pmd;
     uint64_t new_tnl_seq;
     int process_packets;
-    bool need_to_flush = true;
 
     ovs_mutex_lock(&dp->port_mutex);
     non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
@@ -3751,21 +3761,16 @@ dpif_netdev_run(struct dpif *dpif)
                     cycles_count_intermediate(non_pmd, process_packets
                                                        ? PMD_CYCLES_PROCESSING
                                                        : PMD_CYCLES_IDLE);
-                    if (process_packets) {
-                        need_to_flush = false;
-                    }
                 }
             }
         }
-        if (need_to_flush) {
-            /* We didn't receive anything in the process loop.
-             * Check if we need to send something. */
-            process_packets = dp_netdev_pmd_flush_output_packets(non_pmd,
-                                                                 0, false);
-            cycles_count_intermediate(non_pmd, process_packets
-                                               ? PMD_CYCLES_PROCESSING
-                                               : PMD_CYCLES_IDLE);
-        }
+
+        /* Check if queued packets need to be flushed. */
+        process_packets =
+                dp_netdev_pmd_flush_output_packets(non_pmd, 0, false);
+        cycles_count_intermediate(non_pmd, process_packets
+                                           ? PMD_CYCLES_PROCESSING
+                                           : PMD_CYCLES_IDLE);
 
         cycles_count_end(non_pmd, PMD_CYCLES_IDLE);
         dpif_netdev_xps_revalidate_pmd(non_pmd, time_msec(), false);
@@ -3946,7 +3951,6 @@ reload:
     cycles_count_start(pmd);
     for (;;) {
         int process_packets;
-        bool need_to_flush = true;
 
         for (i = 0; i < poll_cnt; i++) {
             process_packets =
@@ -3955,20 +3959,14 @@ reload:
             cycles_count_intermediate(pmd,
                                       process_packets ? PMD_CYCLES_PROCESSING
                                                       : PMD_CYCLES_IDLE);
-            if (process_packets) {
-                need_to_flush = false;
-            }
         }
 
-        if (need_to_flush) {
-            /* We didn't receive anything in the process loop.
-             * Check if we need to send something. */
-            process_packets = dp_netdev_pmd_flush_output_packets(pmd,
-                                                                 0, false);
-            cycles_count_intermediate(pmd,
-                                      process_packets ? PMD_CYCLES_PROCESSING
-                                                      : PMD_CYCLES_IDLE);
-        }
+        /* Check if queued packets need to be flushed. */
+        process_packets =
+                dp_netdev_pmd_flush_output_packets(pmd, 0, false);
+        cycles_count_intermediate(pmd,
+                                  process_packets ? PMD_CYCLES_PROCESSING
+                                                  : PMD_CYCLES_IDLE);
 
         if (lc++ > 1024) {
             bool reload;
@@ -4593,7 +4591,7 @@ dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
 
     tx->port = port;
     tx->qid = -1;
-    tx->output_time = 0LL;
+    tx->flush_time = 0LL;
     dp_packet_batch_init(&tx->output_pkts);
 
     hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
@@ -5276,11 +5274,12 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
                 dp_netdev_pmd_flush_output_on_port(pmd, p, now);
             }
 
-            if (dp_packet_batch_is_empty(&p->output_pkts)) {
+            if (dp_packet_batch_is_empty(&p->output_pkts) &&
+                !dp_packet_batch_is_empty(packets_)) {
                 uint32_t cur_max_latency;
 
                 atomic_read_relaxed(&dp->output_max_latency, &cur_max_latency);
-                p->output_time = now + cur_max_latency;
+                p->flush_time = pmd->last_cycles + cur_max_latency;
                 pmd->n_output_batches++;
             }
 
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 23930f0..8ad1d00 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -345,9 +345,9 @@
       </column>
 
       <column name="other_config" key="output-max-latency"
-              type='{"type": "integer", "minInteger": 0, "maxInteger": 1000}'>
+              type='{"type": "integer", "minInteger": 0, "maxInteger": 1000000}'>
         <p>
-          Specifies the time in milliseconds that a packet can wait in output
+          Specifies the time in microseconds that a packet can wait in output
           batch for sending i.e. amount of time that packet can spend in an
           intermediate output queue before sending to netdev.
           This option can be used to configure balance between throughput

