1 From: Russ Anderson <rja@sgi.com>
2 Subject: ia64: Call migration code on correctable errors v8
4 Acked-by: schwab@suse.de
6 Migrate data off pages with correctable memory errors. This patch is the
7 ia64 specific piece. It connects the CPE handler to the page migration
8 code. It is implemented as a kernel loadable module, similar to the mca
9 recovery code (mca_recovery.ko). This allows the feature to be turned off
10 by uninstalling the module.
13 Signed-off-by: Russ Anderson <rja@sgi.com>
17 arch/ia64/include/asm/mca.h | 6
18 arch/ia64/include/asm/page.h | 1
19 arch/ia64/kernel/Makefile | 1
20 arch/ia64/kernel/cpe_migrate.c | 434 +++++++++++++++++++++++++++++++++++++++++
21 arch/ia64/kernel/mca.c | 37 +++
 arch/ia64/Kconfig | 9
22 6 files changed, 487 insertions(+), 1 deletion(-)
24 --- a/arch/ia64/include/asm/mca.h
25 +++ b/arch/ia64/include/asm/mca.h
26 @@ -137,6 +137,7 @@ extern unsigned long __per_cpu_mca[NR_CP
28 extern int cpe_vector;
29 extern int ia64_cpe_irq;
30 +extern int cpe_poll_enabled;
31 extern void ia64_mca_init(void);
32 extern void ia64_mca_cpu_init(void *);
33 extern void ia64_os_mca_dispatch(void);
34 @@ -150,10 +151,15 @@ extern void ia64_slave_init_handler(void
35 extern void ia64_mca_cmc_vector_setup(void);
36 extern int ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *));
37 extern void ia64_unreg_MCA_extension(void);
38 +extern int ia64_reg_CE_extension(int (*fn)(void *));
39 +extern void ia64_unreg_CE_extension(void);
40 extern u64 ia64_get_rnat(u64 *);
41 extern void ia64_mca_printk(const char * fmt, ...)
42 __attribute__ ((format (printf, 1, 2)));
44 +extern struct list_head badpagelist;
45 +extern unsigned int total_badpages;
47 struct ia64_mca_notify_die {
48 struct ia64_sal_os_state *sos;
50 --- a/arch/ia64/include/asm/page.h
51 +++ b/arch/ia64/include/asm/page.h
52 @@ -121,6 +121,7 @@ extern unsigned long max_low_pfn;
55 #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
56 +#define phys_to_page(kaddr) (pfn_to_page(kaddr >> PAGE_SHIFT))
57 #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
58 #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
60 --- a/arch/ia64/Kconfig
61 +++ b/arch/ia64/Kconfig
62 @@ -470,6 +470,15 @@ config COMPAT_FOR_U64_ALIGNMENT
63 config IA64_MCA_RECOVERY
64 tristate "MCA recovery from errors other than TLB."
66 +config IA64_CPE_MIGRATE
67 + tristate "Migrate data off pages with correctable errors"
70 + Migrate data off pages with correctable memory errors. Selecting
71 + Y will build this functionality into the kernel. Selecting M will
72 + build this functionality as a kernel loadable module. Installing
73 + the module will turn on the functionality.
76 bool "Performance monitor support"
79 +++ b/arch/ia64/kernel/cpe_migrate.c
82 + * File: cpe_migrate.c
83 + * Purpose: Migrate data from physical pages with excessive correctable
84 + * errors to new physical pages. Keep the old pages on a discard list.
87 + * Copyright (C) 2008 SGI - Silicon Graphics Inc.
88 + * Copyright (C) 2008 Russ Anderson <rja@sgi.com>
91 +#include <linux/sysdev.h>
92 +#include <linux/types.h>
93 +#include <linux/sched.h>
94 +#include <linux/module.h>
95 +#include <linux/kernel.h>
96 +#include <linux/smp.h>
97 +#include <linux/workqueue.h>
98 +#include <linux/mm.h>
99 +#include <linux/swap.h>
100 +#include <linux/vmalloc.h>
101 +#include <linux/migrate.h>
102 +#include <linux/page-isolation.h>
103 +#include <linux/memcontrol.h>
104 +#include <linux/kobject.h>
106 +#include <asm/page.h>
107 +#include <asm/system.h>
108 +#include <asm/sn/sn_cpuid.h>
109 +#include <asm/mca.h>
111 +#define BADRAM_BASENAME "badram"
112 +#define CE_HISTORY_LENGTH 30
118 +static struct cpe_info cpe[CE_HISTORY_LENGTH];
120 +static int cpe_polling_enabled = 1;
121 +static int cpe_head;
122 +static int cpe_tail;
123 +static int work_scheduled;
124 +static int mstat_cannot_isolate;
125 +static int mstat_failed_to_discard;
126 +static int mstat_already_marked;
127 +static int mstat_already_on_list;
129 +DEFINE_SPINLOCK(cpe_migrate_lock);
132 +get_physical_address(void *buffer, u64 *paddr, u16 *node)
134 + sal_log_record_header_t *rh;
135 + sal_log_mem_dev_err_info_t *mdei;
136 + ia64_err_rec_t *err_rec;
137 + sal_log_platform_err_info_t *plat_err;
141 + rh = &err_rec->sal_elog_header;
146 + * Make sure it is a corrected error.
148 + if (rh->severity != sal_log_severity_corrected)
151 + plat_err = (sal_log_platform_err_info_t *)&err_rec->proc_err;
153 + guid = plat_err->mem_dev_err.header.guid;
154 + if (efi_guidcmp(guid, SAL_PLAT_MEM_DEV_ERR_SECT_GUID) == 0) {
158 + mdei = &plat_err->mem_dev_err;
159 + if (mdei->valid.oem_data) {
160 + if (mdei->valid.physical_addr)
161 + *paddr = mdei->physical_addr;
163 + if (mdei->valid.node) {
164 + if (ia64_platform_is("sn2"))
165 + *node = nasid_to_cnodeid(mdei->node);
167 + *node = mdei->node;
173 +static struct page *
174 +alloc_migrate_page(struct page *ignored, unsigned long node, int **x)
177 + return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
181 +validate_paddr_page(u64 paddr)
188 + if (!ia64_phys_addr_valid(paddr))
191 + if (!pfn_valid(paddr >> PAGE_SHIFT))
194 + page = phys_to_page(paddr);
195 + if (PageMemError(page))
196 + mstat_already_marked++;
201 +ia64_mca_cpe_move_page(u64 paddr, u32 node)
203 + LIST_HEAD(pagelist);
207 + ret = validate_paddr_page(paddr);
212 + * convert the physical address to its struct page
214 + page = phys_to_page(paddr);
217 + ret = isolate_lru_page(page, &pagelist);
219 + mstat_cannot_isolate++;
223 + SetPageMemError(page); /* Mark the page as bad */
224 + ret = migrate_pages(&pagelist, alloc_migrate_page, node);
227 + list_add_tail(&page->lru, &badpagelist);
229 + mstat_failed_to_discard++;
231 + * The page failed to migrate and is not on the bad page list.
232 + * Clearing the error bit will allow another attempt to migrate
233 + * if it gets another correctable error.
235 + ClearPageMemError(page);
242 + * ia64_mca_cpe_migrate
243 + * The worker that does the actual migration. It pulls a
244 + * physical address off the list and calls the migration code.
247 +ia64_mca_cpe_migrate(struct work_struct *unused)
254 + paddr = cpe[cpe_tail].paddr;
257 + * There is a valid entry that needs processing.
259 + node = cpe[cpe_tail].node;
261 + ret = ia64_mca_cpe_move_page(paddr, node);
264 + * Even though the return status is negative,
265 + * clear the entry. If the same address has
266 + * another CPE it will be re-added to the list.
268 + cpe[cpe_tail].paddr = 0;
271 + if (++cpe_tail >= CE_HISTORY_LENGTH)
274 + } while (cpe_tail != cpe_head);
275 + work_scheduled = 0;
278 +static DECLARE_WORK(cpe_enable_work, ia64_mca_cpe_migrate);
279 +DEFINE_SPINLOCK(cpe_list_lock);
282 + * cpe_setup_migrate
283 + * Get the physical address out of the CPE record, add it
284 + * to the list of addresses to migrate (if not already on),
285 + * and schedule the back end worker task. This is called
286 + * in interrupt context so cannot directly call the migration
290 + * rec The CPE record
292 + * 1 on success, -1 on failure
295 +cpe_setup_migrate(void *rec)
299 + /* int head, tail; */
305 + get_physical_address(rec, &paddr, &node);
306 + ret = validate_paddr_page(paddr);
310 + if ((cpe_head != cpe_tail) || (cpe[cpe_head].paddr != 0))
314 + for (i = 0; i < CE_HISTORY_LENGTH; i++) {
315 + if (PAGE_ALIGN(cpe[i].paddr) == PAGE_ALIGN(paddr)) {
316 + mstat_already_on_list++;
317 + return 1; /* already on the list */
321 + if (!spin_trylock(&cpe_list_lock)) {
323 + * Someone else has the lock. To avoid spinning in interrupt
324 + * handler context, bail.
329 + if (cpe[cpe_head].paddr == 0) {
330 + cpe[cpe_head].node = node;
331 + cpe[cpe_head].paddr = paddr;
333 + if (++cpe_head >= CE_HISTORY_LENGTH)
336 + spin_unlock(&cpe_list_lock);
338 + if (!work_scheduled) {
339 + work_scheduled = 1;
340 + schedule_work(&cpe_enable_work);
347 + * =============================================================================
351 + * free_one_bad_page
352 + * Free one page from the list of bad pages.
355 +free_one_bad_page(unsigned long paddr)
357 + LIST_HEAD(pagelist);
358 + struct page *page, *page2, *target;
361 + * Verify page address
363 + target = phys_to_page(paddr);
364 + list_for_each_entry_safe(page, page2, &badpagelist, lru) {
365 + if (page != target)
368 + ClearPageMemError(page); /* Mark the page as good */
370 + list_move_tail(&page->lru, &pagelist);
371 + putback_lru_pages(&pagelist);
378 + * free_all_bad_pages
379 + * Free all of the pages on the bad pages list.
382 +free_all_bad_pages(void)
384 + struct page *page, *page2;
386 + list_for_each_entry_safe(page, page2, &badpagelist, lru) {
387 + ClearPageMemError(page); /* Mark the page as good */
390 + putback_lru_pages(&badpagelist);
397 +badpage_store(struct kobject *kobj,
398 + struct kobj_attribute *attr, const char *buf, size_t count)
400 + char optstr[OPT_LEN];
408 + strlcpy(optstr, buf, len);
410 + err = strict_strtoul(optstr, 16, &opt);
415 + free_all_bad_pages();
417 + free_one_bad_page(opt);
424 + * Display the number, size, and addresses of all the pages on the
427 + * Note that sysfs provides buf of PAGE_SIZE length. bufend tracks
428 + * the remaining space in buf to avoid overflowing.
431 +badpage_show(struct kobject *kobj,
432 + struct kobj_attribute *attr, char *buf)
435 + struct page *page, *page2;
436 + int i = 0, cnt = 0;
437 + char *bufend = buf + PAGE_SIZE;
439 + cnt = snprintf(buf, bufend - (buf + cnt),
440 + "Memory marked bad: %d kB\n"
441 + "Pages marked bad: %d\n"
442 + "Unable to isolate on LRU: %d\n"
443 + "Unable to migrate: %d\n"
444 + "Already marked bad: %d\n"
445 + "Already on list: %d\n"
446 + "List of bad physical pages\n",
447 + total_badpages << (PAGE_SHIFT - 10), total_badpages,
448 + mstat_cannot_isolate, mstat_failed_to_discard,
449 + mstat_already_marked, mstat_already_on_list
452 + list_for_each_entry_safe(page, page2, &badpagelist, lru) {
453 + if (bufend - (buf + cnt) < 20)
454 + break; /* Avoid overflowing the buffer */
455 + cnt += snprintf(buf + cnt, bufend - (buf + cnt),
456 + " 0x%011lx", page_to_phys(page));
458 + cnt += snprintf(buf + cnt, bufend - (buf + cnt), "\n");
460 + cnt += snprintf(buf + cnt, bufend - (buf + cnt), "\n");
465 +static struct kobj_attribute badram_attr = {
468 + .mode = S_IWUSR | S_IRUGO,
470 + .show = badpage_show,
471 + .store = badpage_store,
475 +cpe_migrate_external_handler_init(void)
479 + error = sysfs_create_file(kernel_kobj, &badram_attr.attr);
484 + * Register the external CE handler
486 + if (ia64_reg_CE_extension(cpe_setup_migrate)) {
487 + printk(KERN_ERR "ia64_reg_CE_extension failed.\n");
490 + cpe_poll_enabled = cpe_polling_enabled;
492 + printk(KERN_INFO "Registered badram Driver\n");
497 +cpe_migrate_external_handler_exit(void)
499 + /* Unregister the external CE handler */
500 + ia64_unreg_CE_extension();
502 + sysfs_remove_file(kernel_kobj, &badram_attr.attr);
505 +module_init(cpe_migrate_external_handler_init);
506 +module_exit(cpe_migrate_external_handler_exit);
508 +module_param(cpe_polling_enabled, int, 0644);
509 +MODULE_PARM_DESC(cpe_polling_enabled,
510 + "Enable polling with migration");
512 +MODULE_AUTHOR("Russ Anderson <rja@sgi.com>");
513 +MODULE_DESCRIPTION("ia64 Corrected Error page migration driver");
514 +MODULE_LICENSE("GPL");
515 --- a/arch/ia64/kernel/Makefile
516 +++ b/arch/ia64/kernel/Makefile
517 @@ -27,6 +27,7 @@ obj-$(CONFIG_PERFMON) += perfmon_defaul
518 obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
519 obj-$(CONFIG_CPU_FREQ) += cpufreq/
520 obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o
521 +obj-$(CONFIG_IA64_CPE_MIGRATE) += cpe_migrate.o
522 obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o
523 obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
524 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
525 --- a/arch/ia64/kernel/mca.c
526 +++ b/arch/ia64/kernel/mca.c
529 * 2007-04-27 Russ Anderson <rja@sgi.com>
530 * Support multiple cpus going through OS_MCA in the same event.
532 + * 2008-04-22 Russ Anderson <rja@sgi.com>
533 + * Migrate data off pages with correctable memory errors.
535 #include <linux/jiffies.h>
536 #include <linux/types.h>
537 @@ -163,7 +166,14 @@ static int cmc_polling_enabled = 1;
538 * but encounters problems retrieving CPE logs. This should only be
539 * necessary for debugging.
541 -static int cpe_poll_enabled = 1;
542 +int cpe_poll_enabled = 1;
543 +EXPORT_SYMBOL(cpe_poll_enabled);
545 +unsigned int total_badpages;
546 +EXPORT_SYMBOL(total_badpages);
548 +LIST_HEAD(badpagelist);
549 +EXPORT_SYMBOL(badpagelist);
551 extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe);
553 @@ -523,6 +533,28 @@ int mca_recover_range(unsigned long addr
555 EXPORT_SYMBOL_GPL(mca_recover_range);
557 +/* Function pointer to Corrected Error memory migration driver */
558 +int (*ia64_mca_ce_extension)(void *);
561 +ia64_reg_CE_extension(int (*fn)(void *))
563 + if (ia64_mca_ce_extension)
566 + ia64_mca_ce_extension = fn;
569 +EXPORT_SYMBOL(ia64_reg_CE_extension);
572 +ia64_unreg_CE_extension(void)
574 + if (ia64_mca_ce_extension)
575 + ia64_mca_ce_extension = NULL;
577 +EXPORT_SYMBOL(ia64_unreg_CE_extension);
582 @@ -534,6 +566,7 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
583 static unsigned long cpe_history[CPE_HISTORY_LENGTH];
585 static DEFINE_SPINLOCK(cpe_history_lock);
588 IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
589 __func__, cpe_irq, smp_processor_id());
590 @@ -580,6 +613,8 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
592 /* Get the CPE error record and log it */
593 ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
594 + recover = (ia64_mca_ce_extension && ia64_mca_ce_extension(
595 + IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_CPE)));