From 63692df103e9c76b26c382ce8079283b7df9a99a Mon Sep 17 00:00:00 2001
From: Prarit Bhargava <prarit@redhat.com>
Date: Thu, 23 Oct 2014 14:22:12 -0400
Subject: [PATCH] PCI: Allow numa_node override via sysfs

NUMA systems with ACPI normally describe the physical topology via _PXM
methods.  But many BIOSes don't implement _PXM, which leaves the kernel
with no way to discover the device topology, which reduces performance
because we can't put memory and processes close to the device.

The NUMA node of a PCI device is already exported in the sysfs "numa_node"
file.  Make that file writable so users can work around the lack of _PXM
methods in the BIOS.  For example:

  echo 3 > /sys/devices/pci0000:ff/0000:03:1f.3/numa_node

sets the node for PCI device 0000:03:1f.3.

Writing the file emits a FW_BUG warning to encourage users to request
firmware updates.  It also taints the kernel with TAINT_FIRMWARE_WORKAROUND
because overriding the node incorrectly can cause performance issues.

[bhelgaas: changelog, documentation text]
Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
CC: Myron Stowe <mstowe@redhat.com>
CC: Alexander Duyck <alexander.h.duyck@redhat.com>
CC: Jiang Liu <jiang.liu@linux.intel.com>
---
 Documentation/ABI/testing/sysfs-bus-pci | 13 ++++++++++++
 drivers/pci/pci-sysfs.c                 | 27 ++++++++++++++++++++++++-
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index ee6c040364927..b3bc50f650ee4 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -281,3 +281,16 @@ Description:
 		opt-out of driver binding using a driver_override name such as
 		"none".  Only a single driver may be specified in the override,
 		there is no support for parsing delimiters.
+
+What:		/sys/bus/pci/devices/.../numa_node
+Date:		Oct 2014
+Contact:	Prarit Bhargava <prarit@redhat.com>
+Description:
+		This file contains the NUMA node to which the PCI device is
+		attached, or -1 if the node is unknown.  The initial value
+		comes from an ACPI _PXM method or a similar firmware
+		source.  If that is missing or incorrect, this file can be
+		written to override the node.  In that case, please report
+		a firmware bug to the system vendor.  Writing to this file
+		taints the kernel with TAINT_FIRMWARE_WORKAROUND, which
+		reduces the supportability of your system.
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 92b6d9ab00e49..91e760f9655ba 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -221,12 +221,37 @@ static ssize_t enabled_show(struct device *dev, struct device_attribute *attr,
 static DEVICE_ATTR_RW(enabled);
 
 #ifdef CONFIG_NUMA
+static ssize_t numa_node_store(struct device *dev,
+			       struct device_attribute *attr, const char *buf,
+			       size_t count)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int node, ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = kstrtoint(buf, 0, &node);
+	if (ret)
+		return ret;
+	/* node_online() is a bitmap test; reject out-of-range nodes first */
+	if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
+		return -EINVAL;
+
+	add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+	dev_alert(&pdev->dev, FW_BUG "Overriding NUMA node to %d.  Contact your vendor for updates.\n",
+		  node);
+
+	dev->numa_node = node;
+	return count;
+}
+
 static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr,
 			      char *buf)
 {
 	return sprintf(buf, "%d\n", dev->numa_node);
 }
-static DEVICE_ATTR_RO(numa_node);
+static DEVICE_ATTR_RW(numa_node);
 #endif
 
 static ssize_t dma_mask_bits_show(struct device *dev,
-- 
GitLab