]> git.ipfire.org Git - thirdparty/libvirt.git/commitdiff
nodedev: add switchdev to NIC capabilities
authorEdan David <edand@mellanox.com>
Mon, 21 Aug 2017 09:19:53 +0000 (05:19 -0400)
committerJohn Ferlan <jferlan@redhat.com>
Mon, 18 Sep 2017 12:32:24 +0000 (08:32 -0400)
Add functionality to libvirt that allows querying an interface for the
availability of the switchdev offloading NIC capability.

The switchdev mode was introduced in kernel 4.8. The iproute2 devlink
command can be used to retrieve the switchdev NIC feature, for example:

    devlink dev eswitch show pci/0000:03:00.0

This feature is needed for OpenStack so that it can make a scheduling decision
based on whether the NIC is in Hardware Offload (switchdev) or regular SR-IOV
(legacy) mode, and select the appropriate hypervisors with the requested
capability; see [1].

[1] - https://specs.openstack.org/openstack/nova-specs/specs/pike/approved/enable-sriov-nic-features.html

Reviewed-by: Laine Stump <laine@laine.org>
Reviewed-by: John Ferlan <jferlan@redhat.com>
configure.ac
docs/formatnode.html.in
src/util/virnetdev.c
src/util/virnetdev.h
tests/nodedevschemadata/net_00_13_02_b9_f9_d3.xml
tests/nodedevschemadata/net_00_15_58_2f_e9_55.xml

index a46b9c693b30cbef344f37be4cab7a50956839e7..c9509c7f96edb4aacb59edb11dc4deab69e86a59 100644 (file)
@@ -627,6 +627,19 @@ if test "$with_linux" = "yes"; then
     AC_CHECK_HEADERS([linux/btrfs.h])
 fi
 
+dnl
+dnl check for kernel headers required by devlink
+dnl
+dnl AC_CHECK_DECLS runs its action-if-found once for every symbol that is
+dnl declared, so defining HAVE_DECL_DEVLINK there would enable the devlink
+dnl code as soon as any single symbol exists. Track missing symbols in a
+dnl shell variable instead and define the macro only when none is missing.
+dnl
+if test "$with_linux" = "yes"; then
+    AC_CHECK_HEADERS([linux/devlink.h])
+    devlink_decls_found=yes
+    AC_CHECK_DECLS([DEVLINK_GENL_VERSION, DEVLINK_GENL_NAME, DEVLINK_ATTR_MAX, DEVLINK_CMD_ESWITCH_GET, DEVLINK_ATTR_BUS_NAME, DEVLINK_ATTR_DEV_NAME, DEVLINK_ATTR_ESWITCH_MODE, DEVLINK_ESWITCH_MODE_SWITCHDEV],
+                   [],
+                   [devlink_decls_found=no],
+                   [[#include <linux/devlink.h>]])
+    if test "$devlink_decls_found" = "yes"; then
+        AC_DEFINE([HAVE_DECL_DEVLINK],
+                  [1],
+                  [whether devlink declarations are available])
+    fi
+fi
+
 dnl Allow perl/python overrides
 AC_PATH_PROGS([PYTHON], [python2 python])
 if test -z "$PYTHON"; then
index 4d935b50f9f438311691fd750f4874480672a006..29244a8984580ac8e2045e8d8125ab59562e8750 100644 (file)
                     <dt><code>rxhash</code></dt><dd>receive-hashing</dd>
                     <dt><code>rdma</code></dt><dd>remote-direct-memory-access</dd>
                     <dt><code>txudptnl</code></dt><dd>tx-udp-tunnel-segmentation</dd>
+                    <dt><code>switchdev</code></dt><dd>kernel-forward-plane-offload</dd>
                 </dl>
               </dd>
               <dt><code>capability</code></dt>
index 51a6e42c5ca8b07eca1ec3886aaaec9b644fa8cf..04069392540eadce0cf154b4450662805187c058 100644 (file)
 # include <net/if_dl.h>
 #endif
 
+#if HAVE_DECL_DEVLINK
+# include <linux/devlink.h>
+#endif
+
 #ifndef IFNAMSIZ
 # define IFNAMSIZ 16
 #endif
@@ -2481,7 +2485,8 @@ VIR_ENUM_IMPL(virNetDevFeature,
               "ntuple",
               "rxhash",
               "rdma",
-              "txudptnl")
+              "txudptnl",
+              "switchdev")
 
 #ifdef __linux__
 int
@@ -3115,6 +3120,181 @@ virNetDevGetEthtoolFeatures(virBitmapPtr bitmap,
 }
 
 
+# if HAVE_DECL_DEVLINK
+/**
+ * virNetDevPutExtraHeader:
+ * @nlh: pointer to the Netlink header of the message being built
+ * @size: size in bytes of the extra header that we want to put
+ *
+ * Reserve and prepare room for an extra header right after the current
+ * end of the message, i.e. @nlh->nlmsg_len bytes past @nlh. The room
+ * spans NLMSG_ALIGN(@size) bytes; it is set to zero and the nlmsg_len
+ * field is increased by the same amount.
+ *
+ * No bounds check is performed: the caller must make sure the buffer
+ * backing @nlh has at least NLMSG_ALIGN(@size) bytes left past nlmsg_len.
+ *
+ * Returns pointer to the start of the extended header
+ */
+static void *
+virNetDevPutExtraHeader(struct nlmsghdr *nlh,
+                        size_t size)
+{
+    char *ptr = (char *)nlh + nlh->nlmsg_len;
+    size_t len = NLMSG_ALIGN(size);
+
+    /* Clear the whole aligned region (header plus padding) so the
+     * documented contract holds regardless of how the buffer was
+     * allocated or previously used. */
+    memset(ptr, 0, len);
+    nlh->nlmsg_len += len;
+    return ptr;
+}
+
+
+/**
+ * virNetDevGetFamilyId:
+ * @family_name: the name of the generic netlink family to query
+ *
+ * Sends a CTRL_CMD_GETFAMILY request for @family_name to the generic
+ * netlink controller (GENL_ID_CTRL) and extracts CTRL_ATTR_FAMILY_ID
+ * from the reply.
+ *
+ * Returns family id or 0 on failure. Note that 0 is also returned,
+ * without an error being reported, when the reply carries no
+ * CTRL_ATTR_FAMILY_ID attribute.
+ */
+static uint32_t
+virNetDevGetFamilyId(const char *family_name)
+{
+    struct nl_msg *nl_msg = NULL;
+    struct nlmsghdr *resp = NULL;
+    struct genlmsghdr *gmsgh = NULL;
+    struct nlattr *tb[CTRL_ATTR_MAX + 1] = {NULL, };
+    unsigned int recvbuflen;
+    uint32_t family_id = 0;
+
+    if (!(nl_msg = nlmsg_alloc_simple(GENL_ID_CTRL,
+                                      NLM_F_REQUEST | NLM_F_ACK))) {
+        virReportOOMError();
+        goto cleanup;
+    }
+
+    if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct genlmsghdr))))
+        goto cleanup;
+
+    gmsgh->cmd = CTRL_CMD_GETFAMILY;
+    gmsgh->version = DEVLINK_GENL_VERSION;
+
+    if (nla_put_string(nl_msg, CTRL_ATTR_FAMILY_NAME, family_name) < 0) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("allocated netlink buffer is too small"));
+        goto cleanup;
+    }
+
+    if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0) < 0)
+        goto cleanup;
+
+    /* The attributes follow the generic netlink header, so that is the
+     * header length to skip. The table is indexed by CTRL_ATTR_* values
+     * and sized accordingly, hence maxtype must be CTRL_ATTR_MAX (not
+     * CTRL_CMD_MAX) to keep the parser within the bounds of tb[]. */
+    if (nlmsg_parse(resp, sizeof(struct genlmsghdr), tb, CTRL_ATTR_MAX, NULL) < 0) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("malformed netlink response message"));
+        goto cleanup;
+    }
+
+    if (tb[CTRL_ATTR_FAMILY_ID] == NULL)
+        goto cleanup;
+
+    /* CTRL_ATTR_FAMILY_ID carries a 16-bit value; reading it with a wider
+     * type would run past the payload and misread it on big-endian. */
+    family_id = *(uint16_t *)RTA_DATA(tb[CTRL_ATTR_FAMILY_ID]);
+
+ cleanup:
+    nlmsg_free(nl_msg);
+    VIR_FREE(resp);
+    return family_id;
+}
+
+
+/**
+ * virNetDevSwitchdevFeature:
+ * @ifname: name of the interface
+ * @out: bitmap in which VIR_NET_DEV_FEAT_SWITCHDEV is set if the eswitch
+ *       of the device is in switchdev mode
+ *
+ * Checks for the availability of the switchdev feature by sending a
+ * DEVLINK_CMD_ESWITCH_GET request for the PCI device behind @ifname
+ * (or behind its physical function when @ifname is a virtual function)
+ * and inspecting DEVLINK_ATTR_ESWITCH_MODE in the reply.
+ *
+ * If the devlink family id cannot be resolved the feature is treated as
+ * absent: @out is left untouched and 0 is returned, so that probing the
+ * remaining features of the interface is not aborted.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+virNetDevSwitchdevFeature(const char *ifname,
+                          virBitmapPtr *out)
+{
+    struct nl_msg *nl_msg = NULL;
+    struct nlmsghdr *resp = NULL;
+    unsigned int recvbuflen;
+    struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {NULL, };
+    virPCIDevicePtr pci_device_ptr = NULL;
+    struct genlmsghdr *gmsgh = NULL;
+    const char *pci_name;
+    char *pfname = NULL;
+    int is_vf = -1;
+    int ret = -1;
+    uint32_t family_id;
+
+    /* A family id of 0 means devlink could not be resolved (for instance
+     * because the running kernel does not provide it). That is not a
+     * reason to fail the whole feature query; there is simply no
+     * switchdev bit to report. */
+    if ((family_id = virNetDevGetFamilyId(DEVLINK_GENL_NAME)) == 0)
+        return 0;
+
+    if ((is_vf = virNetDevIsVirtualFunction(ifname)) < 0)
+        return ret;
+
+    /* For a VF the request is addressed to its physical function */
+    if (is_vf == 1 && virNetDevGetPhysicalFunction(ifname, &pfname) < 0)
+        goto cleanup;
+
+    if (!(nl_msg = nlmsg_alloc_simple(family_id,
+                                      NLM_F_REQUEST | NLM_F_ACK))) {
+        virReportOOMError();
+        goto cleanup;
+    }
+
+    if (!(gmsgh = virNetDevPutExtraHeader(nlmsg_hdr(nl_msg), sizeof(struct genlmsghdr))))
+        goto cleanup;
+
+    gmsgh->cmd = DEVLINK_CMD_ESWITCH_GET;
+    gmsgh->version = DEVLINK_GENL_VERSION;
+
+    /* NOTE(review): a NULL result is treated as a failure here; confirm
+     * how virNetDevGetPCIDevice() behaves for non-PCI interfaces. */
+    pci_device_ptr = pfname ? virNetDevGetPCIDevice(pfname) :
+                              virNetDevGetPCIDevice(ifname);
+    if (pci_device_ptr == NULL)
+        goto cleanup;
+
+    pci_name = virPCIDeviceGetName(pci_device_ptr);
+
+    /* Both strings are sent including their terminating NUL byte */
+    if (nla_put(nl_msg, DEVLINK_ATTR_BUS_NAME, strlen("pci")+1, "pci") < 0 ||
+        nla_put(nl_msg, DEVLINK_ATTR_DEV_NAME, strlen(pci_name)+1, pci_name) < 0) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("allocated netlink buffer is too small"));
+        goto cleanup;
+    }
+
+    if (virNetlinkCommand(nl_msg, &resp, &recvbuflen, 0, 0, NETLINK_GENERIC, 0) < 0)
+        goto cleanup;
+
+    if (nlmsg_parse(resp, sizeof(struct genlmsghdr), tb, DEVLINK_ATTR_MAX, NULL) < 0) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("malformed netlink response message"));
+        goto cleanup;
+    }
+
+    /* DEVLINK_ATTR_ESWITCH_MODE carries a 16-bit value; reading it as an
+     * int would run past the payload and misread it on big-endian. */
+    if (tb[DEVLINK_ATTR_ESWITCH_MODE] &&
+        *(uint16_t *)RTA_DATA(tb[DEVLINK_ATTR_ESWITCH_MODE]) == DEVLINK_ESWITCH_MODE_SWITCHDEV) {
+        ignore_value(virBitmapSetBit(*out, VIR_NET_DEV_FEAT_SWITCHDEV));
+    }
+
+    ret = 0;
+
+ cleanup:
+    nlmsg_free(nl_msg);
+    virPCIDeviceFree(pci_device_ptr);
+    VIR_FREE(resp);
+    VIR_FREE(pfname);
+    return ret;
+}
+# else
+/* Fallback used when HAVE_DECL_DEVLINK is not set: nothing is probed,
+ * both arguments are ignored and success is returned. */
+static int
+virNetDevSwitchdevFeature(const char *ifname ATTRIBUTE_UNUSED,
+                          virBitmapPtr *out ATTRIBUTE_UNUSED)
+{
+    return 0;
+}
+# endif
+
+
 # if HAVE_DECL_ETHTOOL_GFEATURES
 /**
  * virNetDevGFeatureAvailable
@@ -3315,6 +3495,9 @@ virNetDevGetFeatures(const char *ifname,
     if (virNetDevRDMAFeature(ifname, out) < 0)
         goto cleanup;
 
+    if (virNetDevSwitchdevFeature(ifname, out) < 0)
+        goto cleanup;
+
     ret = 0;
  cleanup:
     VIR_FORCE_CLOSE(fd);
index 9205c0e86c467571ddb69e004b4c2361e8fe4386..71eaf45e30566e8baf877bdc7f90b85f6414509b 100644 (file)
@@ -112,6 +112,7 @@ typedef enum {
     VIR_NET_DEV_FEAT_RXHASH,
     VIR_NET_DEV_FEAT_RDMA,
     VIR_NET_DEV_FEAT_TXUDPTNL,
+    VIR_NET_DEV_FEAT_SWITCHDEV,
     VIR_NET_DEV_FEAT_LAST
 } virNetDevFeature;
 
index d4c96e8533b30c778f7498893e6467b2b0cb0568..88252e6a4e6546a8933f348de44ab92aa450e08d 100644 (file)
@@ -15,6 +15,7 @@
     <feature name='rxhash'/>
     <feature name='rdma'/>
     <feature name='txudptnl'/>
+    <feature name='switchdev'/>
     <capability type='80211'/>
   </capability>
 </device>
index 71bf90e20e531d599d23090727c74830f5fe1fad..f77dfcc3e00a6cb2dfb1e98a4c877a51df54e805 100644 (file)
@@ -15,6 +15,7 @@
     <feature name='rxhash'/>
     <feature name='rdma'/>
     <feature name='txudptnl'/>
+    <feature name='switchdev'/>
     <capability type='80203'/>
   </capability>
 </device>