From: Russ Anderson <rja@sgi.com>
Subject: ia64: Call migration code on correctable errors v8
References: 415829
Acked-by: schwab@suse.de

Migrate data off pages with correctable memory errors. This patch is
the ia64-specific piece: it connects the CPE handler to the page
migration code. The feature is implemented as a kernel loadable
module, similar to the MCA recovery code (mca_recovery.ko), so it can
be turned off by uninstalling the module.

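Once built and installed, the module can be loaded with "modprobe
cpe_migrate" (the module name follows from the Makefile entry below).
It exposes the bad-page list through sysfs. As an illustrative sketch
only (not part of this patch), a userspace program could inspect and
clear the list through that file, assumed here to appear at
/sys/kernel/badram per the sysfs_create_file(kernel_kobj, ...) call in
cpe_migrate.c:

	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/sys/kernel/badram", "r");

		if (!f) {
			perror("/sys/kernel/badram");
			return 1;
		}
		/* Dump the counters and the list of bad physical pages */
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);

		/* Writing 0 frees every page on the bad page list;
		 * writing a hex physical address frees just that page. */
		f = fopen("/sys/kernel/badram", "w");
		if (!f) {
			perror("/sys/kernel/badram");
			return 1;
		}
		fprintf(f, "0\n");
		return fclose(f) ? 1 : 0;
	}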

Signed-off-by: Russ Anderson <rja@sgi.com>

---
 arch/ia64/Kconfig              |    9 
 arch/ia64/include/asm/mca.h    |    6 
 arch/ia64/include/asm/page.h   |    1 
 arch/ia64/kernel/Makefile      |    1 
 arch/ia64/kernel/cpe_migrate.c |  434 +++++++++++++++++++++++++++++++++++++++++
 arch/ia64/kernel/mca.c         |   37 +++
 6 files changed, 487 insertions(+), 1 deletion(-)

--- a/arch/ia64/include/asm/mca.h
+++ b/arch/ia64/include/asm/mca.h
@@ -137,6 +137,7 @@ extern unsigned long __per_cpu_mca[NR_CP
 
 extern int cpe_vector;
 extern int ia64_cpe_irq;
+extern int cpe_poll_enabled;
 extern void ia64_mca_init(void);
 extern void ia64_mca_cpu_init(void *);
 extern void ia64_os_mca_dispatch(void);
@@ -150,10 +151,15 @@ extern void ia64_slave_init_handler(void
 extern void ia64_mca_cmc_vector_setup(void);
 extern int ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *));
 extern void ia64_unreg_MCA_extension(void);
+extern int ia64_reg_CE_extension(int (*fn)(void *));
+extern void ia64_unreg_CE_extension(void);
 extern u64 ia64_get_rnat(u64 *);
 extern void ia64_mca_printk(const char * fmt, ...)
	 __attribute__ ((format (printf, 1, 2)));
 
+extern struct list_head badpagelist;
+extern unsigned int total_badpages;
+
 struct ia64_mca_notify_die {
	struct ia64_sal_os_state *sos;
	int *monarch_cpu;
--- a/arch/ia64/include/asm/page.h
+++ b/arch/ia64/include/asm/page.h
@@ -121,6 +121,7 @@ extern unsigned long max_low_pfn;
 #endif
 
 #define page_to_phys(page)	(page_to_pfn(page) << PAGE_SHIFT)
+#define phys_to_page(kaddr)	(pfn_to_page((kaddr) >> PAGE_SHIFT))
 #define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
 #define pfn_to_kaddr(pfn)	__va((pfn) << PAGE_SHIFT)
 
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -470,6 +470,15 @@ config COMPAT_FOR_U64_ALIGNMENT
 config IA64_MCA_RECOVERY
	tristate "MCA recovery from errors other than TLB."
 
+config IA64_CPE_MIGRATE
+	tristate "Migrate data off pages with correctable errors"
+	default m
+	help
+	  Migrate data off pages with correctable memory errors. Selecting
+	  Y will build this functionality into the kernel. Selecting M will
+	  build this functionality as a kernel loadable module. Installing
+	  the module will turn on the functionality.
+
 config PERFMON
	bool "Performance monitor support"
	help
--- /dev/null
+++ b/arch/ia64/kernel/cpe_migrate.c
@@ -0,0 +1,434 @@
+/*
+ * File:	cpe_migrate.c
+ * Purpose:	Migrate data from physical pages with excessive correctable
+ *		errors to new physical pages. Keep the old pages on a discard
+ *		list.
+ *
+ * Copyright (C) 2008 SGI - Silicon Graphics Inc.
+ * Copyright (C) 2008 Russ Anderson <rja@sgi.com>
+ */
+
+#include <linux/sysdev.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/workqueue.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <linux/migrate.h>
+#include <linux/page-isolation.h>
+#include <linux/memcontrol.h>
+#include <linux/kobject.h>
+
+#include <asm/page.h>
+#include <asm/system.h>
+#include <asm/sn/sn_cpuid.h>
+#include <asm/mca.h>
+
+#define BADRAM_BASENAME		"badram"
+#define CE_HISTORY_LENGTH	30
+
+struct cpe_info {
+	u64	paddr;
+	u16	node;
+};
+static struct cpe_info cpe[CE_HISTORY_LENGTH];
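+/*
+ * Ring buffer of pending CPE addresses: cpe_setup_migrate() fills
+ * entries at cpe_head from interrupt context and the worker drains
+ * them from cpe_tail.
+ */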
+
+static int cpe_polling_enabled = 1;
+static int cpe_head;
+static int cpe_tail;
+static int work_scheduled;
+static int mstat_cannot_isolate;
+static int mstat_failed_to_discard;
+static int mstat_already_marked;
+static int mstat_already_on_list;
+
+DEFINE_SPINLOCK(cpe_migrate_lock);
+
+static void
+get_physical_address(void *buffer, u64 *paddr, u16 *node)
+{
+	sal_log_record_header_t *rh;
+	sal_log_mem_dev_err_info_t *mdei;
+	ia64_err_rec_t *err_rec;
+	sal_log_platform_err_info_t *plat_err;
+	efi_guid_t guid;
+
+	err_rec = buffer;
+	rh = &err_rec->sal_elog_header;
+	*paddr = 0;
+	*node = 0;
+
+	/*
+	 * Make sure it is a corrected error.
+	 */
+	if (rh->severity != sal_log_severity_corrected)
+		return;
+
+	plat_err = (sal_log_platform_err_info_t *)&err_rec->proc_err;
+
+	guid = plat_err->mem_dev_err.header.guid;
+	if (efi_guidcmp(guid, SAL_PLAT_MEM_DEV_ERR_SECT_GUID) == 0) {
+		/*
+		 * Memory cpe
+		 */
+		mdei = &plat_err->mem_dev_err;
+		if (mdei->valid.oem_data) {
+			if (mdei->valid.physical_addr)
+				*paddr = mdei->physical_addr;
+
+			if (mdei->valid.node) {
+				if (ia64_platform_is("sn2"))
+					*node = nasid_to_cnodeid(mdei->node);
+				else
+					*node = mdei->node;
+			}
+		}
+	}
+}
+
+static struct page *
+alloc_migrate_page(struct page *ignored, unsigned long node, int **x)
+{
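+	/* Allocate the migration target page on the node that saw the error */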
+	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
+}
+
+static int
+validate_paddr_page(u64 paddr)
+{
+	struct page *page;
+
+	if (!paddr)
+		return -EINVAL;
+
+	if (!ia64_phys_addr_valid(paddr))
+		return -EINVAL;
+
+	if (!pfn_valid(paddr >> PAGE_SHIFT))
+		return -EINVAL;
+
+	page = phys_to_page(paddr);
+	if (PageMemError(page))
+		mstat_already_marked++;
+	return 0;
+}
+
+static int
+ia64_mca_cpe_move_page(u64 paddr, u32 node)
+{
+	LIST_HEAD(pagelist);
+	struct page *page;
+	int ret;
+
+	ret = validate_paddr_page(paddr);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * convert physical address to page number
+	 */
+	page = phys_to_page(paddr);
+
+	migrate_prep();
+	ret = isolate_lru_page(page, &pagelist);
+	if (ret) {
+		mstat_cannot_isolate++;
+		return ret;
+	}
+
+	SetPageMemError(page);		/* Mark the page as bad */
+	ret = migrate_pages(&pagelist, alloc_migrate_page, node);
+	if (ret == 0) {
+		total_badpages++;
+		list_add_tail(&page->lru, &badpagelist);
+	} else {
+		mstat_failed_to_discard++;
+		/*
+		 * The page failed to migrate and is not on the bad page list.
+		 * Clearing the error bit will allow another attempt to migrate
+		 * if it gets another correctable error.
+		 */
+		ClearPageMemError(page);
+	}
+
+	return 0;
+}
+
+/*
+ * ia64_mca_cpe_migrate
+ *	The worker that does the actual migration. It pulls a
+ *	physical address off the list and calls the migration code.
+ */
+static void
+ia64_mca_cpe_migrate(struct work_struct *unused)
+{
+	int ret;
+	u64 paddr;
+	u16 node;
+
+	do {
+		paddr = cpe[cpe_tail].paddr;
+		if (paddr) {
+			/*
+			 * There is a valid entry that needs processing.
+			 */
+			node = cpe[cpe_tail].node;
+
+			ret = ia64_mca_cpe_move_page(paddr, node);
+			if (ret <= 0)
+				/*
+				 * Even though the return status is negative,
+				 * clear the entry. If the same address has
+				 * another CPE it will be re-added to the list.
+				 */
+				cpe[cpe_tail].paddr = 0;
+
+		}
+		if (++cpe_tail >= CE_HISTORY_LENGTH)
+			cpe_tail = 0;
+
+	} while (cpe_tail != cpe_head);
+	work_scheduled = 0;
+}
+
+static DECLARE_WORK(cpe_enable_work, ia64_mca_cpe_migrate);
+DEFINE_SPINLOCK(cpe_list_lock);
+
+/*
+ * cpe_setup_migrate
+ *	Get the physical address out of the CPE record, add it
+ *	to the list of addresses to migrate (if not already on),
+ *	and schedule the back-end worker task. This is called
+ *	in interrupt context so it cannot directly call the
+ *	migration code.
+ *
+ *  Inputs
+ *	rec	The CPE record
+ *  Outputs
+ *	1 on success, -EINVAL on failure
+ */
+static int
+cpe_setup_migrate(void *rec)
+{
+	u64 paddr;
+	u16 node;
+	/* int head, tail; */
+	int i, ret;
+
+	if (!rec)
+		return -EINVAL;
+
+	get_physical_address(rec, &paddr, &node);
+	ret = validate_paddr_page(paddr);
+	if (ret < 0)
+		return -EINVAL;
+
+	if ((cpe_head != cpe_tail) || (cpe[cpe_head].paddr != 0))
+		/*
+		 * List not empty
+		 */
+		for (i = 0; i < CE_HISTORY_LENGTH; i++) {
+			if ((cpe[i].paddr & PAGE_MASK) == (paddr & PAGE_MASK)) {
+				mstat_already_on_list++;
+				return 1;	/* already on the list */
+			}
+		}
+
+	if (!spin_trylock(&cpe_list_lock)) {
+		/*
+		 * Someone else has the lock. To avoid spinning in interrupt
+		 * handler context, bail.
+		 */
+		return 1;
+	}
+
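+	/*
+	 * If the slot at cpe_head is still occupied the ring is full:
+	 * the event is dropped, and the page will be queued again if it
+	 * takes another correctable error.
+	 */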
+	if (cpe[cpe_head].paddr == 0) {
+		cpe[cpe_head].node = node;
+		cpe[cpe_head].paddr = paddr;
+
+		if (++cpe_head >= CE_HISTORY_LENGTH)
+			cpe_head = 0;
+	}
+	spin_unlock(&cpe_list_lock);
+
+	if (!work_scheduled) {
+		work_scheduled = 1;
+		schedule_work(&cpe_enable_work);
+	}
+
+	return 1;
+}
+
+/*
+ * =============================================================================
+ */
+
+/*
+ * free_one_bad_page
+ *	Free one page from the list of bad pages.
+ */
+static int
+free_one_bad_page(unsigned long paddr)
+{
+	LIST_HEAD(pagelist);
+	struct page *page, *page2, *target;
+
+	/*
+	 * Verify page address
+	 */
+	target = phys_to_page(paddr);
+	list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+		if (page != target)
+			continue;
+
+		ClearPageMemError(page);	/* Mark the page as good */
+		total_badpages--;
+		list_move_tail(&page->lru, &pagelist);
+		putback_lru_pages(&pagelist);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * free_all_bad_pages
+ *	Free all of the pages on the bad pages list.
+ */
+static int
+free_all_bad_pages(void)
+{
+	struct page *page, *page2;
+
+	list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+		ClearPageMemError(page);	/* Mark the page as good */
+		total_badpages--;
+	}
+	putback_lru_pages(&badpagelist);
+	return 0;
+}
+
+#define OPT_LEN	16
+
+static ssize_t
+badpage_store(struct kobject *kobj,
+	      struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	char optstr[OPT_LEN];
+	unsigned long opt;
+	int len = OPT_LEN;
+	int err;
+
+	if (count < len)
+		len = count;
+
+	strlcpy(optstr, buf, len);
+
+	err = strict_strtoul(optstr, 16, &opt);
+	if (err)
+		return err;
+
+	if (opt == 0)
+		free_all_bad_pages();
+	else
+		free_one_bad_page(opt);
+
+	return count;
+}
+
+/*
+ * badpage_show
+ *	Display the number, size, and addresses of all the pages on the
+ *	bad page list.
+ *
+ *	Note that sysfs provides buf of PAGE_SIZE length. bufend tracks
+ *	the remaining space in buf to avoid overflowing.
+ */
+static ssize_t
+badpage_show(struct kobject *kobj,
+	     struct kobj_attribute *attr, char *buf)
+
+{
+	struct page *page, *page2;
+	int i = 0, cnt = 0;
+	char *bufend = buf + PAGE_SIZE;
+
+	cnt = snprintf(buf, bufend - (buf + cnt),
+			"Memory marked bad: %d kB\n"
+			"Pages marked bad: %d\n"
+			"Unable to isolate on LRU: %d\n"
+			"Unable to migrate: %d\n"
+			"Already marked bad: %d\n"
+			"Already on list: %d\n"
+			"List of bad physical pages\n",
+			total_badpages << (PAGE_SHIFT - 10), total_badpages,
+			mstat_cannot_isolate, mstat_failed_to_discard,
+			mstat_already_marked, mstat_already_on_list
+			);
+
+	list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+		if (bufend - (buf + cnt) < 20)
+			break;	/* Avoid overflowing the buffer */
+		cnt += snprintf(buf + cnt, bufend - (buf + cnt),
+				" 0x%011lx", page_to_phys(page));
+		if (!(++i % 5))
+			cnt += snprintf(buf + cnt, bufend - (buf + cnt), "\n");
+	}
+	cnt += snprintf(buf + cnt, bufend - (buf + cnt), "\n");
+
+	return cnt;
+}
+
+static struct kobj_attribute badram_attr = {
+	.attr = {
+		.name = "badram",
+		.mode = S_IWUSR | S_IRUGO,
+	},
+	.show = badpage_show,
+	.store = badpage_store,
+};
+
+static int __init
+cpe_migrate_external_handler_init(void)
+{
+	int error;
+
+	error = sysfs_create_file(kernel_kobj, &badram_attr.attr);
+	if (error)
+		return -EINVAL;
+
+	/*
+	 * register external ce handler
+	 */
+	if (ia64_reg_CE_extension(cpe_setup_migrate)) {
+		printk(KERN_ERR "ia64_reg_CE_extension failed.\n");
+		return -EFAULT;
+	}
+	cpe_poll_enabled = cpe_polling_enabled;
+
+	printk(KERN_INFO "Registered badram Driver\n");
+	return 0;
+}
+
+static void __exit
+cpe_migrate_external_handler_exit(void)
+{
+	/* unregister external mca handlers */
+	ia64_unreg_CE_extension();
+
+	sysfs_remove_file(kernel_kobj, &badram_attr.attr);
+}
+
+module_init(cpe_migrate_external_handler_init);
+module_exit(cpe_migrate_external_handler_exit);
+
+module_param(cpe_polling_enabled, int, 0644);
+MODULE_PARM_DESC(cpe_polling_enabled,
+		"Enable polling with migration");
+
+MODULE_AUTHOR("Russ Anderson <rja@sgi.com>");
+MODULE_DESCRIPTION("ia64 Corrected Error page migration driver");
+MODULE_LICENSE("GPL");
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_PERFMON)		+= perfmon_defaul
 obj-$(CONFIG_IA64_CYCLONE)	+= cyclone.o
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_IA64_MCA_RECOVERY)	+= mca_recovery.o
+obj-$(CONFIG_IA64_CPE_MIGRATE)	+= cpe_migrate.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o jprobes.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -68,6 +68,9 @@
  *
  * 2007-04-27 Russ Anderson <rja@sgi.com>
  *	       Support multiple cpus going through OS_MCA in the same event.
+ *
+ * 2008-04-22 Russ Anderson <rja@sgi.com>
+ *	       Migrate data off pages with correctable memory errors.
  */
 #include <linux/jiffies.h>
 #include <linux/types.h>
@@ -163,7 +166,14 @@ static int cmc_polling_enabled = 1;
  * but encounters problems retrieving CPE logs. This should only be
  * necessary for debugging.
  */
-static int cpe_poll_enabled = 1;
+int cpe_poll_enabled = 1;
+EXPORT_SYMBOL(cpe_poll_enabled);
+
+unsigned int total_badpages;
+EXPORT_SYMBOL(total_badpages);
+
+LIST_HEAD(badpagelist);
+EXPORT_SYMBOL(badpagelist);
 
 extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe);
 
@@ -523,6 +533,28 @@ int mca_recover_range(unsigned long addr
 }
 EXPORT_SYMBOL_GPL(mca_recover_range);
 
+/* Function pointer to Corrected Error memory migration driver */
+int (*ia64_mca_ce_extension)(void *);
+
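+/*
+ * Only one CE extension may be registered at a time:
+ * ia64_reg_CE_extension() returns 1 if a handler is already installed.
+ */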
+int
+ia64_reg_CE_extension(int (*fn)(void *))
+{
+	if (ia64_mca_ce_extension)
+		return 1;
+
+	ia64_mca_ce_extension = fn;
+	return 0;
+}
+EXPORT_SYMBOL(ia64_reg_CE_extension);
+
+void
+ia64_unreg_CE_extension(void)
+{
+	if (ia64_mca_ce_extension)
+		ia64_mca_ce_extension = NULL;
+}
+EXPORT_SYMBOL(ia64_unreg_CE_extension);
+
 #ifdef CONFIG_ACPI
 
 int cpe_vector = -1;
@@ -534,6 +566,7 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
	static unsigned long	cpe_history[CPE_HISTORY_LENGTH];
	static int		index;
	static DEFINE_SPINLOCK(cpe_history_lock);
+	int			recover;
 
	IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
		       __func__, cpe_irq, smp_processor_id());
@@ -580,6 +613,8 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
 out:
	/* Get the CPE error record and log it */
	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
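+	/* Hand the freshly logged record to the CE migration extension */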
+	recover = (ia64_mca_ce_extension && ia64_mca_ce_extension(
+		IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_CPE)));
 
	return IRQ_HANDLED;
 }