From: Russ Anderson <rja@sgi.com>
Subject: ia64: Call migration code on correctable errors v8
References: 415829
Acked-by: schwab@suse.de

Migrate data off pages with correctable memory errors. This patch is the
ia64-specific piece. It connects the CPE handler to the page migration
code. It is implemented as a loadable kernel module, similar to the MCA
recovery code (mca_recovery.ko). This allows the feature to be turned off
by uninstalling the module.
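
A rough usage sketch, for context only (not part of the patch; it assumes the
module is built as cpe_migrate.ko per the Makefile change below, and that the
badram attribute registered on kernel_kobj shows up as /sys/kernel/badram):

  modprobe cpe_migrate              # register the CE handler; migration starts
  cat /sys/kernel/badram            # statistics and the list of bad physical pages
  echo 0 > /sys/kernel/badram       # put every page on the bad page list back on the LRU
  echo 4000000 > /sys/kernel/badram # put back only the page at hex physical address 4000000 (made-up address)
  rmmod cpe_migrate                 # unregister the CE handler; migration stops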

Signed-off-by: Russ Anderson <rja@sgi.com>

---
 arch/ia64/Kconfig              |    9
 arch/ia64/include/asm/mca.h    |    6
 arch/ia64/include/asm/page.h   |    1
 arch/ia64/kernel/Makefile      |    1
 arch/ia64/kernel/cpe_migrate.c |  434 +++++++++++++++++++++++++++++++++++++++++
 arch/ia64/kernel/mca.c         |   37 +++
 6 files changed, 487 insertions(+), 1 deletion(-)

--- a/arch/ia64/include/asm/mca.h
+++ b/arch/ia64/include/asm/mca.h
@@ -137,6 +137,7 @@ extern unsigned long __per_cpu_mca[NR_CP
 
 extern int cpe_vector;
 extern int ia64_cpe_irq;
+extern int cpe_poll_enabled;
 extern void ia64_mca_init(void);
 extern void ia64_mca_cpu_init(void *);
 extern void ia64_os_mca_dispatch(void);
@@ -150,10 +151,15 @@ extern void ia64_slave_init_handler(void
 extern void ia64_mca_cmc_vector_setup(void);
 extern int ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *));
 extern void ia64_unreg_MCA_extension(void);
+extern int ia64_reg_CE_extension(int (*fn)(void *));
+extern void ia64_unreg_CE_extension(void);
 extern u64 ia64_get_rnat(u64 *);
 extern void ia64_mca_printk(const char * fmt, ...)
         __attribute__ ((format (printf, 1, 2)));
 
+extern struct list_head badpagelist;
+extern unsigned int total_badpages;
+
 struct ia64_mca_notify_die {
         struct ia64_sal_os_state *sos;
         int *monarch_cpu;
--- a/arch/ia64/include/asm/page.h
+++ b/arch/ia64/include/asm/page.h
@@ -121,6 +121,7 @@ extern unsigned long max_low_pfn;
 #endif
 
 #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
+#define phys_to_page(kaddr) (pfn_to_page(kaddr >> PAGE_SHIFT))
 #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
 #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
 
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -470,6 +470,15 @@ config COMPAT_FOR_U64_ALIGNMENT
 config IA64_MCA_RECOVERY
         tristate "MCA recovery from errors other than TLB."
 
+config IA64_CPE_MIGRATE
+        tristate "Migrate data off pages with correctable errors"
+        default m
+        help
+          Migrate data off pages with correctable memory errors. Selecting
+          Y will build this functionality into the kernel. Selecting M will
+          build this functionality as a kernel loadable module. Installing
+          the module will turn on the functionality.
+
 config PERFMON
         bool "Performance monitor support"
         help
--- /dev/null
+++ b/arch/ia64/kernel/cpe_migrate.c
@@ -0,0 +1,434 @@
+/*
+ * File: cpe_migrate.c
+ * Purpose: Migrate data from physical pages with excessive correctable
+ *          errors to new physical pages. Keep the old pages on a discard
+ *          list.
+ *
+ * Copyright (C) 2008 SGI - Silicon Graphics Inc.
+ * Copyright (C) 2008 Russ Anderson <rja@sgi.com>
+ */
+
+#include <linux/sysdev.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/workqueue.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <linux/migrate.h>
+#include <linux/page-isolation.h>
+#include <linux/memcontrol.h>
+#include <linux/kobject.h>
+
+#include <asm/page.h>
+#include <asm/system.h>
+#include <asm/sn/sn_cpuid.h>
+#include <asm/mca.h>
+
+#define BADRAM_BASENAME "badram"
+#define CE_HISTORY_LENGTH 30
+
+struct cpe_info {
+        u64 paddr;
+        u16 node;
+};
+static struct cpe_info cpe[CE_HISTORY_LENGTH];
+
+static int cpe_polling_enabled = 1;
+static int cpe_head;
+static int cpe_tail;
+static int work_scheduled;
+static int mstat_cannot_isolate;
+static int mstat_failed_to_discard;
+static int mstat_already_marked;
+static int mstat_already_on_list;
+
+DEFINE_SPINLOCK(cpe_migrate_lock);
+
+static void
+get_physical_address(void *buffer, u64 *paddr, u16 *node)
+{
+        sal_log_record_header_t *rh;
+        sal_log_mem_dev_err_info_t *mdei;
+        ia64_err_rec_t *err_rec;
+        sal_log_platform_err_info_t *plat_err;
+        efi_guid_t guid;
+
+        err_rec = buffer;
+        rh = &err_rec->sal_elog_header;
+        *paddr = 0;
+        *node = 0;
+
+        /*
+         * Make sure it is a corrected error.
+         */
+        if (rh->severity != sal_log_severity_corrected)
+                return;
+
+        plat_err = (sal_log_platform_err_info_t *)&err_rec->proc_err;
+
+        guid = plat_err->mem_dev_err.header.guid;
+        if (efi_guidcmp(guid, SAL_PLAT_MEM_DEV_ERR_SECT_GUID) == 0) {
+                /*
+                 * Memory cpe
+                 */
+                mdei = &plat_err->mem_dev_err;
+                if (mdei->valid.oem_data) {
+                        if (mdei->valid.physical_addr)
+                                *paddr = mdei->physical_addr;
+
+                        if (mdei->valid.node) {
+                                if (ia64_platform_is("sn2"))
+                                        *node = nasid_to_cnodeid(mdei->node);
+                                else
+                                        *node = mdei->node;
+                        }
+                }
+        }
+}
+
+static struct page *
+alloc_migrate_page(struct page *ignored, unsigned long node, int **x)
+{
+
+        return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
+}
+
+static int
+validate_paddr_page(u64 paddr)
+{
+        struct page *page;
+
+        if (!paddr)
+                return -EINVAL;
+
+        if (!ia64_phys_addr_valid(paddr))
+                return -EINVAL;
+
+        if (!pfn_valid(paddr >> PAGE_SHIFT))
+                return -EINVAL;
+
+        page = phys_to_page(paddr);
+        if (PageMemError(page))
+                mstat_already_marked++;
+        return 0;
+}
+
+static int
+ia64_mca_cpe_move_page(u64 paddr, u32 node)
+{
+        LIST_HEAD(pagelist);
+        struct page *page;
+        int ret;
+
+        ret = validate_paddr_page(paddr);
+        if (ret < 0)
+                return ret;
+
+        /*
+         * convert physical address to page number
+         */
+        page = phys_to_page(paddr);
+
+        migrate_prep();
+        ret = isolate_lru_page(page, &pagelist);
+        if (ret) {
+                mstat_cannot_isolate++;
+                return ret;
+        }
+
+        SetPageMemError(page); /* Mark the page as bad */
+        ret = migrate_pages(&pagelist, alloc_migrate_page, node);
+        if (ret == 0) {
+                total_badpages++;
+                list_add_tail(&page->lru, &badpagelist);
+        } else {
+                mstat_failed_to_discard++;
+                /*
+                 * The page failed to migrate and is not on the bad page list.
+                 * Clearing the error bit will allow another attempt to migrate
+                 * if it gets another correctable error.
+                 */
+                ClearPageMemError(page);
+        }
+
+        return 0;
+}
+
+/*
+ * ia64_mca_cpe_migrate
+ *        The worker that does the actual migration. It pulls a
+ *        physical address off the list and calls the migration code.
+ */
+static void
+ia64_mca_cpe_migrate(struct work_struct *unused)
+{
+        int ret;
+        u64 paddr;
+        u16 node;
+
+        do {
+                paddr = cpe[cpe_tail].paddr;
+                if (paddr) {
+                        /*
+                         * There is a valid entry that needs processing.
+                         */
+                        node = cpe[cpe_tail].node;
+
+                        ret = ia64_mca_cpe_move_page(paddr, node);
+                        if (ret <= 0)
+                                /*
+                                 * Even though the return status is negative,
+                                 * clear the entry. If the same address has
+                                 * another CPE it will be re-added to the list.
+                                 */
+                                cpe[cpe_tail].paddr = 0;
+
+                }
+                if (++cpe_tail >= CE_HISTORY_LENGTH)
+                        cpe_tail = 0;
+
+        } while (cpe_tail != cpe_head);
+        work_scheduled = 0;
+}
+
+static DECLARE_WORK(cpe_enable_work, ia64_mca_cpe_migrate);
+DEFINE_SPINLOCK(cpe_list_lock);
+
+/*
+ * cpe_setup_migrate
+ *        Get the physical address out of the CPE record, add it
+ *        to the list of addresses to migrate (if not already on),
+ *        and schedule the back end worker task. This is called
+ *        in interrupt context so cannot directly call the migration
+ *        code.
+ *
+ *  Inputs
+ *        rec     The CPE record
+ *  Outputs
+ *        1 on Success, -1 on failure
+ */
+static int
+cpe_setup_migrate(void *rec)
+{
+        u64 paddr;
+        u16 node;
+        /* int head, tail; */
+        int i, ret;
+
+        if (!rec)
+                return -EINVAL;
+
+        get_physical_address(rec, &paddr, &node);
+        ret = validate_paddr_page(paddr);
+        if (ret < 0)
+                return -EINVAL;
+
+        if ((cpe_head != cpe_tail) || (cpe[cpe_head].paddr != 0))
+                /*
+                 * List not empty
+                 */
+                for (i = 0; i < CE_HISTORY_LENGTH; i++) {
+                        if (PAGE_ALIGN(cpe[i].paddr) == PAGE_ALIGN(paddr)) {
+                                mstat_already_on_list++;
+                                return 1; /* already on the list */
+                        }
+                }
+
+        if (!spin_trylock(&cpe_list_lock)) {
+                /*
+                 * Someone else has the lock. To avoid spinning in interrupt
+                 * handler context, bail.
+                 */
+                return 1;
+        }
+
+        if (cpe[cpe_head].paddr == 0) {
+                cpe[cpe_head].node = node;
+                cpe[cpe_head].paddr = paddr;
+
+                if (++cpe_head >= CE_HISTORY_LENGTH)
+                        cpe_head = 0;
+        }
+        spin_unlock(&cpe_list_lock);
+
+        if (!work_scheduled) {
+                work_scheduled = 1;
+                schedule_work(&cpe_enable_work);
+        }
+
+        return 1;
+}
+
+/*
+ * =============================================================================
+ */
+
+/*
+ * free_one_bad_page
+ *        Free one page from the list of bad pages.
+ */
+static int
+free_one_bad_page(unsigned long paddr)
+{
+        LIST_HEAD(pagelist);
+        struct page *page, *page2, *target;
+
+        /*
+         * Verify page address
+         */
+        target = phys_to_page(paddr);
+        list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+                if (page != target)
+                        continue;
+
+                ClearPageMemError(page); /* Mark the page as good */
+                total_badpages--;
+                list_move_tail(&page->lru, &pagelist);
+                putback_lru_pages(&pagelist);
+                break;
+        }
+        return 0;
+}
+
+/*
+ * free_all_bad_pages
+ *        Free all of the pages on the bad pages list.
+ */
+static int
+free_all_bad_pages(void)
+{
+        struct page *page, *page2;
+
+        list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+                ClearPageMemError(page); /* Mark the page as good */
+                total_badpages--;
+        }
+        putback_lru_pages(&badpagelist);
+        return 0;
+}
+
+#define OPT_LEN 16
+
+static ssize_t
+badpage_store(struct kobject *kobj,
+              struct kobj_attribute *attr, const char *buf, size_t count)
+{
+        char optstr[OPT_LEN];
+        unsigned long opt;
+        int len = OPT_LEN;
+        int err;
+
+        if (count < len)
+                len = count;
+
+        strlcpy(optstr, buf, len);
+
+        err = strict_strtoul(optstr, 16, &opt);
+        if (err)
+                return err;
+
+        if (opt == 0)
+                free_all_bad_pages();
+        else
+                free_one_bad_page(opt);
+
+        return count;
+}
+
+/*
+ * badpage_show
+ *        Display the number, size, and addresses of all the pages on the
+ *        bad page list.
+ *
+ *        Note that sysfs provides buf of PAGE_SIZE length. bufend tracks
+ *        the remaining space in buf to avoid overflowing.
+ */
+static ssize_t
+badpage_show(struct kobject *kobj,
+             struct kobj_attribute *attr, char *buf)
+
+{
+        struct page *page, *page2;
+        int i = 0, cnt = 0;
+        char *bufend = buf + PAGE_SIZE;
+
+        cnt = snprintf(buf, bufend - (buf + cnt),
+                        "Memory marked bad: %d kB\n"
+                        "Pages marked bad: %d\n"
+                        "Unable to isolate on LRU: %d\n"
+                        "Unable to migrate: %d\n"
+                        "Already marked bad: %d\n"
+                        "Already on list: %d\n"
+                        "List of bad physical pages\n",
+                        total_badpages << (PAGE_SHIFT - 10), total_badpages,
+                        mstat_cannot_isolate, mstat_failed_to_discard,
+                        mstat_already_marked, mstat_already_on_list
+                        );
+
+        list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+                if (bufend - (buf + cnt) < 20)
+                        break; /* Avoid overflowing the buffer */
+                cnt += snprintf(buf + cnt, bufend - (buf + cnt),
+                                " 0x%011lx", page_to_phys(page));
+                if (!(++i % 5))
+                        cnt += snprintf(buf + cnt, bufend - (buf + cnt), "\n");
+        }
+        cnt += snprintf(buf + cnt, bufend - (buf + cnt), "\n");
+
+        return cnt;
+}
+
+static struct kobj_attribute badram_attr = {
+        .attr = {
+                .name = "badram",
+                .mode = S_IWUSR | S_IRUGO,
+        },
+        .show = badpage_show,
+        .store = badpage_store,
+};
+
+static int __init
+cpe_migrate_external_handler_init(void)
+{
+        int error;
+
+        error = sysfs_create_file(kernel_kobj, &badram_attr.attr);
+        if (error)
+                return -EINVAL;
+
+        /*
+         * register external ce handler
+         */
+        if (ia64_reg_CE_extension(cpe_setup_migrate)) {
+                printk(KERN_ERR "ia64_reg_CE_extension failed.\n");
+                return -EFAULT;
+        }
+        cpe_poll_enabled = cpe_polling_enabled;
+
+        printk(KERN_INFO "Registered badram Driver\n");
+        return 0;
+}
+
+static void __exit
+cpe_migrate_external_handler_exit(void)
+{
+        /* unregister external mca handlers */
+        ia64_unreg_CE_extension();
+
+        sysfs_remove_file(kernel_kobj, &badram_attr.attr);
+}
+
+module_init(cpe_migrate_external_handler_init);
+module_exit(cpe_migrate_external_handler_exit);
+
+module_param(cpe_polling_enabled, int, 0644);
+MODULE_PARM_DESC(cpe_polling_enabled,
+                 "Enable polling with migration");
+
+MODULE_AUTHOR("Russ Anderson <rja@sgi.com>");
+MODULE_DESCRIPTION("ia64 Corrected Error page migration driver");
+MODULE_LICENSE("GPL");
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_PERFMON) += perfmon_defaul
 obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq/
 obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o
+obj-$(CONFIG_IA64_CPE_MIGRATE) += cpe_migrate.o
 obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o
 obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -68,6 +68,9 @@
  *
  * 2007-04-27 Russ Anderson <rja@sgi.com>
  *        Support multiple cpus going through OS_MCA in the same event.
+ *
+ * 2008-04-22 Russ Anderson <rja@sgi.com>
+ *        Migrate data off pages with correctable memory errors.
  */
 #include <linux/jiffies.h>
 #include <linux/types.h>
@@ -163,7 +166,14 @@ static int cmc_polling_enabled = 1;
  * but encounters problems retrieving CPE logs. This should only be
  * necessary for debugging.
  */
-static int cpe_poll_enabled = 1;
+int cpe_poll_enabled = 1;
+EXPORT_SYMBOL(cpe_poll_enabled);
+
+unsigned int total_badpages;
+EXPORT_SYMBOL(total_badpages);
+
+LIST_HEAD(badpagelist);
+EXPORT_SYMBOL(badpagelist);
 
 extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe);
 
@@ -523,6 +533,28 @@ int mca_recover_range(unsigned long addr
 }
 EXPORT_SYMBOL_GPL(mca_recover_range);
 
+/* Function pointer to Corrected Error memory migration driver */
+int (*ia64_mca_ce_extension)(void *);
+
+int
+ia64_reg_CE_extension(int (*fn)(void *))
+{
+        if (ia64_mca_ce_extension)
+                return 1;
+
+        ia64_mca_ce_extension = fn;
+        return 0;
+}
+EXPORT_SYMBOL(ia64_reg_CE_extension);
+
+void
+ia64_unreg_CE_extension(void)
+{
+        if (ia64_mca_ce_extension)
+                ia64_mca_ce_extension = NULL;
+}
+EXPORT_SYMBOL(ia64_unreg_CE_extension);
+
 #ifdef CONFIG_ACPI
 
 int cpe_vector = -1;
@@ -534,6 +566,7 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
         static unsigned long cpe_history[CPE_HISTORY_LENGTH];
         static int index;
         static DEFINE_SPINLOCK(cpe_history_lock);
+        int recover;
 
         IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
                        __func__, cpe_irq, smp_processor_id());
@@ -580,6 +613,8 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
 out:
         /* Get the CPE error record and log it */
         ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
+        recover = (ia64_mca_ce_extension && ia64_mca_ce_extension(
+                        IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_CPE)));
 
         return IRQ_HANDLED;
 }