]> git.ipfire.org Git - thirdparty/libvirt.git/commitdiff
bandwidth: Create hierarchical shaping classes
authorMichal Privoznik <mprivozn@redhat.com>
Fri, 16 Nov 2012 10:36:02 +0000 (11:36 +0100)
committerMichal Privoznik <mprivozn@redhat.com>
Tue, 11 Dec 2012 17:36:55 +0000 (18:36 +0100)
These classes can borrow unused bandwidth. Basically,
only egress qdiscs can have classes, therefore we can
do this kind of traffic shaping only on host's outgoing,
that is domain's incoming traffic.

src/lxc/lxc_process.c
src/network/bridge_driver.c
src/qemu/qemu_command.c
src/qemu/qemu_driver.c
src/util/virnetdevbandwidth.c
src/util/virnetdevbandwidth.h
src/util/virnetdevmacvlan.c

index 50c61c5f9c1ddf14dccdd588b69a064f0747f541..3e7fcb85505f0bfba90e15b61aa78e01e3a442e0 100644 (file)
@@ -341,7 +341,8 @@ static int virLXCProcessSetupInterfaceBridged(virConnectPtr conn,
         goto cleanup;
 
     if (virNetDevBandwidthSet(net->ifname,
-                              virDomainNetGetActualBandwidth(net)) < 0) {
+                              virDomainNetGetActualBandwidth(net),
+                              false) < 0) {
         virReportError(VIR_ERR_INTERNAL_ERROR,
                        _("cannot set bandwidth limits on %s"),
                        net->ifname);
index 09680ff51751243dc65d0f23f7f614091a1ff432..7ad6020a12a9ae9e914a35507b1b79eb39475b62 100644 (file)
@@ -2459,7 +2459,8 @@ networkStartNetworkVirtual(struct network_driver *driver,
         VIR_FORCE_CLOSE(tapfd);
     }
 
-    if (virNetDevBandwidthSet(network->def->bridge, network->def->bandwidth) < 0) {
+    if (virNetDevBandwidthSet(network->def->bridge,
+                              network->def->bandwidth, true) < 0) {
         virReportError(VIR_ERR_INTERNAL_ERROR,
                        _("cannot set bandwidth limits on %s"),
                        network->def->bridge);
index 9009bd296c1d83fca5cdaf718cf1ba0b14f4682e..e10eb09533b36ccc786ca7e1c9feaa845a6e7770 100644 (file)
@@ -292,7 +292,8 @@ qemuNetworkIfaceConnect(virDomainDefPtr def,
 
     if (tapfd >= 0 &&
         virNetDevBandwidthSet(net->ifname,
-                              virDomainNetGetActualBandwidth(net)) < 0) {
+                              virDomainNetGetActualBandwidth(net),
+                              false) < 0) {
         virReportError(VIR_ERR_INTERNAL_ERROR,
                        _("cannot set bandwidth limits on %s"),
                        net->ifname);
index d035bc317178123c61d7492e59d0095f75af51ba..93d9b69b498024023322c6971cc9360255806825 100644 (file)
@@ -9034,7 +9034,7 @@ qemuDomainSetInterfaceParameters(virDomainPtr dom,
                    sizeof(*newBandwidth->out));
         }
 
-        if (virNetDevBandwidthSet(net->ifname, newBandwidth) < 0) {
+        if (virNetDevBandwidthSet(net->ifname, newBandwidth, false) < 0) {
             virReportError(VIR_ERR_INTERNAL_ERROR,
                            _("cannot set bandwidth limits on %s"),
                            device);
index 49fc4259a77a6ca47100caac8838f91550425115..71c272e330f8d00ac2530cfd20222ee94635144c 100644 (file)
@@ -45,17 +45,21 @@ virNetDevBandwidthFree(virNetDevBandwidthPtr def)
  * virNetDevBandwidthSet:
  * @ifname: on which interface
  * @bandwidth: rates to set (may be NULL)
+ * @hierarchical_class: whether to create hierarchical class
  *
  * This function enables QoS on specified interface
  * and set given traffic limits for both, incoming
  * and outgoing traffic. Any previous setting get
- * overwritten.
+ * overwritten. If @hierarchical_class is true, create a
+ * hierarchical class, which is used to guarantee minimal
+ * throughput ('floor' attribute in NIC).
  *
  * Return 0 on success, -1 otherwise.
  */
 int
 virNetDevBandwidthSet(const char *ifname,
-                      virNetDevBandwidthPtr bandwidth)
+                      virNetDevBandwidthPtr bandwidth,
+                      bool hierarchical_class)
 {
     int ret = -1;
     virCommandPtr cmd = NULL;
@@ -71,7 +75,7 @@ virNetDevBandwidthSet(const char *ifname,
 
     virNetDevBandwidthClear(ifname);
 
-    if (bandwidth->in) {
+    if (bandwidth->in && bandwidth->in->average) {
         if (virAsprintf(&average, "%llukbps", bandwidth->in->average) < 0)
             goto cleanup;
         if (bandwidth->in->peak &&
@@ -83,15 +87,89 @@ virNetDevBandwidthSet(const char *ifname,
 
         cmd = virCommandNew(TC);
         virCommandAddArgList(cmd, "qdisc", "add", "dev", ifname, "root",
-                             "handle", "1:", "htb", "default", "1", NULL);
+                             "handle", "1:", "htb", "default",
+                             hierarchical_class ? "2" : "1", NULL);
         if (virCommandRun(cmd, NULL) < 0)
             goto cleanup;
 
+        /* If we are creating a hierarchical class, all non guaranteed traffic
+         * goes to the 1:2 class which will adjust 'rate' dynamically as NICs
+         * with guaranteed throughput are plugged and unplugged. Class 1:1
+         * exists so we don't exceed the maximum limit for the network. For each
+         * NIC with guaranteed throughput a separate classid will be created.
+         * NB '1:' is just a shorter notation of '1:0'.
+         *
+         * To get a picture how this works:
+         *
+         * +-----+     +---------+     +-----------+      +-----------+     +-----+
+         * |     |     |  qdisc  |     | class 1:1 |      | class 1:2 |     |     |
+         * | NIC |     | def 1:2 |     |   rate    |      |   rate    |     | sfq |
+         * |     | --> |         | --> |   peak    | -+-> |   peak    | --> |     |
+         * +-----+     +---------+     +-----------+  |   +-----------+     +-----+
+         *                                            |
+         *                                            |   +-----------+     +-----+
+         *                                            |   | class 1:3 |     |     |
+         *                                            |   |   rate    |     | sfq |
+         *                                            +-> |   peak    | --> |     |
+         *                                            |   +-----------+     +-----+
+         *                                           ...
+         *                                            |   +-----------+     +-----+
+         *                                            |   | class 1:n |     |     |
+         *                                            |   |   rate    |     | sfq |
+         *                                            +-> |   peak    | --> |     |
+         *                                                +-----------+     +-----+
+         *
+         * After the routing decision, when it is clear a packet is to be sent
+         * via a particular NIC, it is sent to the root qdisc (queueing
+         * discipline). In this case HTB (Hierarchical Token Bucket). It has
+         * only one direct child class (with id 1:1) which shapes the overall
+         * rate that is sent through the NIC.  This class has at least one child
+         * (1:2) which is meant for all non-privileged (non guaranteed) traffic
+         * from all domains. Then, for each interface with guaranteed
+         * throughput, a separate class (1:n) is created. Imagine a class is a
+         * box. Whenever a packet ends up in a class it is stored in this box
+         * until the kernel sends it, then it is removed from box. Packets are
+         * placed into boxes based on rules (filters) - e.g. depending on
+         * destination IP/MAC address. If there is no rule to be applied, the
+         * root qdisc has a default where such packets go (1:2 in this case).
+         * Packets come in over and over again and boxes get filled more and
+         * more. Imagine that kernel sends packets just once a second. So it
+         * starts to traverse through this tree. It starts with the root qdisc
+         * and through 1:1 it gets to 1:2. It sends packets up to 1:2's 'rate'.
+         * Then it moves to 1:3 and again sends packets up to 1:3's 'rate'.  The
+         * whole process is repeated until 1:n is processed. So now we have
+         * ensured each class its guaranteed bandwidth. If the sum of sent data
+         * doesn't exceed the 'rate' in 1:1 class, we can go further and send
+         * more packets. The rest of available bandwidth is distributed to the
+         * 1:2,1:3...1:n classes by ratio of their 'rate'. As soon as the root
+         * 'rate' limit is reached or there are no more packets to send, we stop
+         * sending and wait another second. Each class has an SFQ qdisc which
+         * shuffles packets in boxes stochastically, so one sender cannot
+         * starve others.
+         *
+         * Therefore, whenever we want to plug in a new guaranteed interface, we
+         * need to create a new class and adjust the 'rate' of the 1:2 class.
+         * When unplugging we do the exact opposite - remove the associated
+         * class, and adjust the 'rate'.
+         *
+         * This description is rather long, but it is still a good idea to read
+         * it before you dig into the code.
+         */
+        if (hierarchical_class) {
+            virCommandFree(cmd);
+            cmd = virCommandNew(TC);
+            virCommandAddArgList(cmd, "class", "add", "dev", ifname, "parent",
+                                 "1:", "classid", "1:1", "htb", "rate", average,
+                                 "ceil", peak ? peak : average, NULL);
+            if (virCommandRun(cmd, NULL) < 0)
+                goto cleanup;
+        }
         virCommandFree(cmd);
         cmd = virCommandNew(TC);
         virCommandAddArgList(cmd,"class", "add", "dev", ifname, "parent",
-                             "1:", "classid", "1:1", "htb", NULL);
-        virCommandAddArgList(cmd, "rate", average, NULL);
+                             hierarchical_class ? "1:1" : "1:", "classid",
+                             hierarchical_class ? "1:2" : "1:1", "htb",
+                             "rate", average, NULL);
 
         if (peak)
             virCommandAddArgList(cmd, "ceil", peak, NULL);
@@ -104,7 +182,8 @@ virNetDevBandwidthSet(const char *ifname,
         virCommandFree(cmd);
         cmd = virCommandNew(TC);
         virCommandAddArgList(cmd, "qdisc", "add", "dev", ifname, "parent",
-                             "1:1", "handle", "2:", "sfq", "perturb",
+                             hierarchical_class ? "1:2" : "1:1",
+                             "handle", "2:", "sfq", "perturb",
                              "10", NULL);
 
         if (virCommandRun(cmd, NULL) < 0)
index 35f8b89cb2e80bc6135c6e213fdf423f913428a8..d308ab27ecb073329013872a509abadec9344dc2 100644 (file)
@@ -42,7 +42,9 @@ struct _virNetDevBandwidth {
 
 void virNetDevBandwidthFree(virNetDevBandwidthPtr def);
 
-int virNetDevBandwidthSet(const char *ifname, virNetDevBandwidthPtr bandwidth)
+int virNetDevBandwidthSet(const char *ifname,
+                          virNetDevBandwidthPtr bandwidth,
+                          bool hierarchical_class)
     ATTRIBUTE_NONNULL(1) ATTRIBUTE_RETURN_CHECK;
 int virNetDevBandwidthClear(const char *ifname)
     ATTRIBUTE_NONNULL(1);
index d8e646ad935429e05fbe60d6917668e566fe6c5a..657c484158db9c03cf255a289247b915c43933d5 100644 (file)
@@ -925,7 +925,7 @@ create_name:
         rc = 0;
     }
 
-    if (virNetDevBandwidthSet(cr_ifname, bandwidth) < 0) {
+    if (virNetDevBandwidthSet(cr_ifname, bandwidth, false) < 0) {
         virReportError(VIR_ERR_INTERNAL_ERROR,
                        _("cannot set bandwidth limits on %s"),
                        cr_ifname);