From: Russ Anderson
Subject: ia64: Call migration code on correctable errors v8
References: 415829
Acked-by: schwab@suse.de

Migrate data off pages with correctable memory errors.  This patch is
the ia64-specific piece.  It connects the CPE handler to the page
migration code.  It is implemented as a kernel loadable module, similar
to the MCA recovery code (mca_recovery.ko), so the feature can be
turned off by uninstalling the module.  (A sketch of how a client
registers with the new CE extension interface follows the patch.)

Signed-off-by: Russ Anderson

---
 arch/ia64/Kconfig              |    9 
 arch/ia64/include/asm/mca.h    |    6 
 arch/ia64/include/asm/page.h   |    1 
 arch/ia64/kernel/Makefile      |    1 
 arch/ia64/kernel/cpe_migrate.c |  434 +++++++++++++++++++++++++++++++++++++++++
 arch/ia64/kernel/mca.c         |   37 +++
 6 files changed, 487 insertions(+), 1 deletion(-)

--- a/arch/ia64/include/asm/mca.h
+++ b/arch/ia64/include/asm/mca.h
@@ -137,6 +137,7 @@ extern unsigned long __per_cpu_mca[NR_CP
 extern int cpe_vector;
 extern int ia64_cpe_irq;
+extern int cpe_poll_enabled;
 extern void ia64_mca_init(void);
 extern void ia64_mca_cpu_init(void *);
 extern void ia64_os_mca_dispatch(void);
@@ -150,10 +151,15 @@ extern void ia64_slave_init_handler(void
 extern void ia64_mca_cmc_vector_setup(void);
 extern int  ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *));
 extern void ia64_unreg_MCA_extension(void);
+extern int  ia64_reg_CE_extension(int (*fn)(void *));
+extern void ia64_unreg_CE_extension(void);
 extern u64 ia64_get_rnat(u64 *);
 extern void ia64_mca_printk(const char * fmt, ...)
	 __attribute__ ((format (printf, 1, 2)));
 
+extern struct list_head badpagelist;
+extern unsigned int total_badpages;
+
 struct ia64_mca_notify_die {
 	struct ia64_sal_os_state *sos;
 	int *monarch_cpu;
--- a/arch/ia64/include/asm/page.h
+++ b/arch/ia64/include/asm/page.h
@@ -121,6 +121,7 @@ extern unsigned long max_low_pfn;
 #endif
 
 #define page_to_phys(page)	(page_to_pfn(page) << PAGE_SHIFT)
+#define phys_to_page(paddr)	(pfn_to_page((paddr) >> PAGE_SHIFT))
 #define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
 #define pfn_to_kaddr(pfn)	__va((pfn) << PAGE_SHIFT)
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -470,6 +470,15 @@ config COMPAT_FOR_U64_ALIGNMENT
 config IA64_MCA_RECOVERY
 	tristate "MCA recovery from errors other than TLB."
 
+config IA64_CPE_MIGRATE
+	tristate "Migrate data off pages with correctable errors"
+	default m
+	help
+	  Migrate data off pages with correctable memory errors.  Selecting
+	  Y will build this functionality into the kernel.  Selecting M will
+	  build this functionality as a kernel loadable module.  Installing
+	  the module will turn on the functionality.
+
 config PERFMON
 	bool "Performance monitor support"
 	help
--- /dev/null
+++ b/arch/ia64/kernel/cpe_migrate.c
@@ -0,0 +1,434 @@
+/*
+ * File:	cpe_migrate.c
+ * Purpose:	Migrate data from physical pages with excessive correctable
+ *		errors to new physical pages.  Keep the old pages on a
+ *		discard list.
+ *
+ * Copyright (C) 2008 SGI - Silicon Graphics Inc.
+ * Copyright (C) 2008 Russ Anderson
+ */
+
+#include <linux/sysdev.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/workqueue.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <linux/migrate.h>
+#include <linux/page-isolation.h>
+#include <linux/memcontrol.h>
+#include <linux/kobject.h>
+
+#include <asm/page.h>
+#include <asm/system.h>
+#include <asm/sn/sn_cpuid.h>
+#include <asm/mca.h>
+
+#define BADRAM_BASENAME		"badram"
+#define CE_HISTORY_LENGTH	30
+
+struct cpe_info {
+	u64	paddr;
+	u16	node;
+};
+static struct cpe_info cpe[CE_HISTORY_LENGTH];
+
+static int cpe_polling_enabled = 1;
+static int cpe_head;
+static int cpe_tail;
+static int work_scheduled;
+static int mstat_cannot_isolate;
+static int mstat_failed_to_discard;
+static int mstat_already_marked;
+static int mstat_already_on_list;
+
+DEFINE_SPINLOCK(cpe_migrate_lock);
+
+static void
+get_physical_address(void *buffer, u64 *paddr, u16 *node)
+{
+	sal_log_record_header_t *rh;
+	sal_log_mem_dev_err_info_t *mdei;
+	ia64_err_rec_t *err_rec;
+	sal_log_platform_err_info_t *plat_err;
+	efi_guid_t guid;
+
+	err_rec = buffer;
+	rh = &err_rec->sal_elog_header;
+	*paddr = 0;
+	*node = 0;
+
+	/*
+	 * Make sure it is a corrected error.
+	 */
+	if (rh->severity != sal_log_severity_corrected)
+		return;
+
+	plat_err = (sal_log_platform_err_info_t *)&err_rec->proc_err;
+
+	guid = plat_err->mem_dev_err.header.guid;
+	if (efi_guidcmp(guid, SAL_PLAT_MEM_DEV_ERR_SECT_GUID) == 0) {
+		/*
+		 * Memory CPE
+		 */
+		mdei = &plat_err->mem_dev_err;
+		if (mdei->valid.oem_data) {
+			if (mdei->valid.physical_addr)
+				*paddr = mdei->physical_addr;
+
+			if (mdei->valid.node) {
+				if (ia64_platform_is("sn2"))
+					*node = nasid_to_cnodeid(mdei->node);
+				else
+					*node = mdei->node;
+			}
+		}
+	}
+}
+
+static struct page *
+alloc_migrate_page(struct page *ignored, unsigned long node, int **x)
+{
+	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
+}
+
+static int
+validate_paddr_page(u64 paddr)
+{
+	struct page *page;
+
+	if (!paddr)
+		return -EINVAL;
+
+	if (!ia64_phys_addr_valid(paddr))
+		return -EINVAL;
+
+	if (!pfn_valid(paddr >> PAGE_SHIFT))
+		return -EINVAL;
+
+	page = phys_to_page(paddr);
+	if (PageMemError(page))
+		mstat_already_marked++;
+	return 0;
+}
+
+static int
+ia64_mca_cpe_move_page(u64 paddr, u32 node)
+{
+	LIST_HEAD(pagelist);
+	struct page *page;
+	int ret;
+
+	ret = validate_paddr_page(paddr);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Convert the physical address to its struct page.
+	 */
+	page = phys_to_page(paddr);
+
+	migrate_prep();
+	ret = isolate_lru_page(page, &pagelist);
+	if (ret) {
+		mstat_cannot_isolate++;
+		return ret;
+	}
+
+	SetPageMemError(page);		/* Mark the page as bad */
+	ret = migrate_pages(&pagelist, alloc_migrate_page, node);
+	if (ret == 0) {
+		total_badpages++;
+		list_add_tail(&page->lru, &badpagelist);
+	} else {
+		mstat_failed_to_discard++;
+		/*
+		 * The page failed to migrate and is not on the bad page list.
+		 * Clearing the error bit will allow another attempt to migrate
+		 * if it gets another correctable error.
+		 */
+		ClearPageMemError(page);
+	}
+
+	return 0;
+}
+
+/*
+ * ia64_mca_cpe_migrate
+ *	The worker that does the actual migration.  It pulls a
+ *	physical address off the list and calls the migration code.
+ */
+static void
+ia64_mca_cpe_migrate(struct work_struct *unused)
+{
+	int ret;
+	u64 paddr;
+	u16 node;
+
+	do {
+		paddr = cpe[cpe_tail].paddr;
+		if (paddr) {
+			/*
+			 * There is a valid entry that needs processing.
+			 */
+			node = cpe[cpe_tail].node;
+
+			ret = ia64_mca_cpe_move_page(paddr, node);
+			if (ret <= 0) {
+				/*
+				 * Even though the return status is negative,
+				 * clear the entry.  If the same address has
+				 * another CPE it will be re-added to the list.
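+				 * (State is carried by the page's PageMemError
+				 * flag, not by this ring, so dropping the
+				 * entry loses nothing.)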
+				 */
+				cpe[cpe_tail].paddr = 0;
+			}
+		}
+		if (++cpe_tail >= CE_HISTORY_LENGTH)
+			cpe_tail = 0;
+
+	} while (cpe_tail != cpe_head);
+	work_scheduled = 0;
+}
+
+static DECLARE_WORK(cpe_enable_work, ia64_mca_cpe_migrate);
+DEFINE_SPINLOCK(cpe_list_lock);
+
+/*
+ * cpe_setup_migrate
+ *	Get the physical address out of the CPE record, add it
+ *	to the list of addresses to migrate (if not already on it),
+ *	and schedule the back end worker task.  This is called
+ *	in interrupt context, so it cannot directly call the
+ *	migration code.
+ *
+ * Inputs
+ *	rec	The CPE record
+ * Outputs
+ *	1 on success, -EINVAL on failure
+ */
+static int
+cpe_setup_migrate(void *rec)
+{
+	u64 paddr;
+	u16 node;
+	int i, ret;
+
+	if (!rec)
+		return -EINVAL;
+
+	get_physical_address(rec, &paddr, &node);
+	ret = validate_paddr_page(paddr);
+	if (ret < 0)
+		return -EINVAL;
+
+	if ((cpe_head != cpe_tail) || (cpe[cpe_head].paddr != 0)) {
+		/*
+		 * List not empty
+		 */
+		for (i = 0; i < CE_HISTORY_LENGTH; i++) {
+			if (PAGE_ALIGN(cpe[i].paddr) == PAGE_ALIGN(paddr)) {
+				mstat_already_on_list++;
+				return 1;	/* already on the list */
+			}
+		}
+	}
+
+	if (!spin_trylock(&cpe_list_lock)) {
+		/*
+		 * Someone else has the lock.  To avoid spinning in interrupt
+		 * handler context, bail.
+		 */
+		return 1;
+	}
+
+	if (cpe[cpe_head].paddr == 0) {
+		cpe[cpe_head].node = node;
+		cpe[cpe_head].paddr = paddr;
+
+		if (++cpe_head >= CE_HISTORY_LENGTH)
+			cpe_head = 0;
+	}
+	spin_unlock(&cpe_list_lock);
+
+	if (!work_scheduled) {
+		work_scheduled = 1;
+		schedule_work(&cpe_enable_work);
+	}
+
+	return 1;
+}
+
+/*
+ * =============================================================================
+ */
+
+/*
+ * free_one_bad_page
+ *	Free one page from the list of bad pages.
+ */
+static int
+free_one_bad_page(unsigned long paddr)
+{
+	LIST_HEAD(pagelist);
+	struct page *page, *page2, *target;
+
+	/*
+	 * Find the matching page on the bad page list.
+	 */
+	target = phys_to_page(paddr);
+	list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+		if (page != target)
+			continue;
+
+		ClearPageMemError(page);	/* Mark the page as good */
+		total_badpages--;
+		list_move_tail(&page->lru, &pagelist);
+		putback_lru_pages(&pagelist);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * free_all_bad_pages
+ *	Free all of the pages on the bad pages list.
+ */
+static int
+free_all_bad_pages(void)
+{
+	struct page *page, *page2;
+
+	list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+		ClearPageMemError(page);	/* Mark the page as good */
+		total_badpages--;
+	}
+	putback_lru_pages(&badpagelist);
+	return 0;
+}
+
+#define OPT_LEN	16
+
+static ssize_t
+badpage_store(struct kobject *kobj,
+	      struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	char optstr[OPT_LEN];
+	unsigned long opt;
+	int len = OPT_LEN;
+	int err;
+
+	if (count < len)
+		len = count;
+
+	strlcpy(optstr, buf, len);
+
+	err = strict_strtoul(optstr, 16, &opt);
+	if (err)
+		return err;
+
+	if (opt == 0)
+		free_all_bad_pages();
+	else
+		free_one_bad_page(opt);
+
+	return count;
+}
+
+/*
+ * badpage_show
+ *	Display the number, size, and addresses of all the pages on the
+ *	bad page list.
+ *
+ *	Note that sysfs provides buf of PAGE_SIZE length.  bufend tracks
+ *	the remaining space in buf to avoid overflowing.
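+ *
+ *	Example of the resulting output (illustrative values only,
+ *	assuming 16K pages and four entries on the bad page list):
+ *
+ *	    Memory marked bad: 64 kB
+ *	    Pages marked bad: 4
+ *	    Unable to isolate on LRU: 0
+ *	    Unable to migrate: 0
+ *	    Already marked bad: 0
+ *	    Already on list: 0
+ *	    List of bad physical pages
+ *	     0x30604fc0000 0x30604fc4000 0x30604fc8000 0x30604fcc000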
+ */
+static ssize_t
+badpage_show(struct kobject *kobj,
+	     struct kobj_attribute *attr, char *buf)
+{
+	struct page *page, *page2;
+	int i = 0, cnt = 0;
+	char *bufend = buf + PAGE_SIZE;
+
+	cnt = snprintf(buf, bufend - (buf + cnt),
+			"Memory marked bad: %d kB\n"
+			"Pages marked bad: %d\n"
+			"Unable to isolate on LRU: %d\n"
+			"Unable to migrate: %d\n"
+			"Already marked bad: %d\n"
+			"Already on list: %d\n"
+			"List of bad physical pages\n",
+			total_badpages << (PAGE_SHIFT - 10), total_badpages,
+			mstat_cannot_isolate, mstat_failed_to_discard,
+			mstat_already_marked, mstat_already_on_list);
+
+	list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+		if (bufend - (buf + cnt) < 20)
+			break;		/* Avoid overflowing the buffer */
+		cnt += snprintf(buf + cnt, bufend - (buf + cnt),
+				" 0x%011lx", page_to_phys(page));
+		if (!(++i % 5))
+			cnt += snprintf(buf + cnt, bufend - (buf + cnt), "\n");
+	}
+	cnt += snprintf(buf + cnt, bufend - (buf + cnt), "\n");
+
+	return cnt;
+}
+
+static struct kobj_attribute badram_attr = {
+	.attr	= {
+		.name = "badram",
+		.mode = S_IWUSR | S_IRUGO,
+	},
+	.show = badpage_show,
+	.store = badpage_store,
+};
+
+static int __init
+cpe_migrate_external_handler_init(void)
+{
+	int error;
+
+	error = sysfs_create_file(kernel_kobj, &badram_attr.attr);
+	if (error)
+		return -EINVAL;
+
+	/*
+	 * Register the external CE handler.
+	 */
+	if (ia64_reg_CE_extension(cpe_setup_migrate)) {
+		printk(KERN_ERR "ia64_reg_CE_extension failed.\n");
+		return -EFAULT;
+	}
+	cpe_poll_enabled = cpe_polling_enabled;
+
+	printk(KERN_INFO "Registered badram driver\n");
+	return 0;
+}
+
+static void __exit
+cpe_migrate_external_handler_exit(void)
+{
+	/* Unregister the external CE handler */
+	ia64_unreg_CE_extension();
+
+	sysfs_remove_file(kernel_kobj, &badram_attr.attr);
+}
+
+module_init(cpe_migrate_external_handler_init);
+module_exit(cpe_migrate_external_handler_exit);
+
+module_param(cpe_polling_enabled, int, 0644);
+MODULE_PARM_DESC(cpe_polling_enabled,
+		"Enable polling with migration");
+
+MODULE_AUTHOR("Russ Anderson");
+MODULE_DESCRIPTION("ia64 Corrected Error page migration driver");
+MODULE_LICENSE("GPL");
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_PERFMON)		+= perfmon_defaul
 obj-$(CONFIG_IA64_CYCLONE)	+= cyclone.o
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_IA64_MCA_RECOVERY)	+= mca_recovery.o
+obj-$(CONFIG_IA64_CPE_MIGRATE)	+= cpe_migrate.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o jprobes.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -68,6 +68,9 @@
  *
  * 2007-04-27 Russ Anderson
  *	      Support multiple cpus going through OS_MCA in the same event.
+ *
+ * 2008-04-22 Russ Anderson
+ *	      Migrate data off pages with correctable memory errors.
  */
 #include <linux/jiffies.h>
 #include <linux/types.h>
@@ -163,7 +166,14 @@ static int cmc_polling_enabled = 1;
  * but encounters problems retrieving CPE logs.  This should only be
  * necessary for debugging.
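+ *
+ * cpe_migrate.ko sets this flag when it loads (from its
+ * cpe_polling_enabled module parameter), so that CPEs found by
+ * polling are also handed to the migration code.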
 */
-static int cpe_poll_enabled = 1;
+int cpe_poll_enabled = 1;
+EXPORT_SYMBOL(cpe_poll_enabled);
+
+unsigned int total_badpages;
+EXPORT_SYMBOL(total_badpages);
+
+LIST_HEAD(badpagelist);
+EXPORT_SYMBOL(badpagelist);
 
 extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe);
 
@@ -523,6 +533,28 @@ int mca_recover_range(unsigned long addr
 }
 EXPORT_SYMBOL_GPL(mca_recover_range);
 
+/* Function pointer to the Corrected Error memory migration driver */
+int (*ia64_mca_ce_extension)(void *);
+
+int
+ia64_reg_CE_extension(int (*fn)(void *))
+{
+	if (ia64_mca_ce_extension)
+		return 1;
+
+	ia64_mca_ce_extension = fn;
+	return 0;
+}
+EXPORT_SYMBOL(ia64_reg_CE_extension);
+
+void
+ia64_unreg_CE_extension(void)
+{
+	if (ia64_mca_ce_extension)
+		ia64_mca_ce_extension = NULL;
+}
+EXPORT_SYMBOL(ia64_unreg_CE_extension);
+
 #ifdef CONFIG_ACPI
 
 int cpe_vector = -1;
@@ -534,6 +566,7 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
 	static unsigned long	cpe_history[CPE_HISTORY_LENGTH];
 	static int		index;
 	static DEFINE_SPINLOCK(cpe_history_lock);
+	int recover;
 
 	IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
 		       __func__, cpe_irq, smp_processor_id());
@@ -580,6 +613,8 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
 out:
 	/* Get the CPE error record and log it */
 	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
+	recover = (ia64_mca_ce_extension && ia64_mca_ce_extension(
+			IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_CPE)));
 
 	return IRQ_HANDLED;
 }
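---

For reference, a minimal sketch of a client of the CE extension point
added above (the handler name and body are illustrative only, not part
of the patch; ia64_reg_CE_extension() accepts a single extension at a
time and returns nonzero if one is already registered):

	#include <linux/module.h>
	#include <linux/kernel.h>
	#include <asm/mca.h>

	/* Hypothetical consumer: log each corrected-error record. */
	static int sample_ce_handler(void *rec)
	{
		/*
		 * rec points at the raw SAL CPE record
		 * (IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_CPE)).  This runs
		 * in interrupt context; return 1 to report it handled.
		 */
		printk(KERN_INFO "sample_ce_handler: CPE record at %p\n", rec);
		return 1;
	}

	static int __init sample_ce_init(void)
	{
		if (ia64_reg_CE_extension(sample_ce_handler))
			return -EBUSY;	/* another extension beat us to it */
		return 0;
	}

	static void __exit sample_ce_exit(void)
	{
		ia64_unreg_CE_extension();
	}

	module_init(sample_ce_init);
	module_exit(sample_ce_exit);
	MODULE_LICENSE("GPL");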