]>
Commit | Line | Data |
---|---|---|
00e5a55c BS |
1 | Subject: Fix partition migration hang under load |
2 | From: Brian King <brking@linux.vnet.ibm.com> | |
3 | References: 470563 - LTC51153 | |
4 | ||
5 | While testing partition migration with heavy CPU load using | |
6 | shared processors, it was observed that sometimes the migration | |
7 | would never complete and would appear to hang. Currently, the | |
8 | migration code assumes that if H_SUCCESS is returned from the H_JOIN | |
9 | then the migration is complete and the processor is waking up on | |
10 | the target system. If there was an outstanding PROD to the processor | |
11 | when the H_JOIN is called, however, it will return H_SUCCESS on the source | |
12 | system, causing the migration to hang or, in some scenarios, causing | |
13 | the kernel to crash on the complete call waking the caller | |
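14 | of rtas_percpu_suspend_me. Fix this by adding a "done" flag to rtas_suspend_me_data and repeating the H_JOIN while it returns H_SUCCESS and the flag is not yet set. | |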
15 | ||
16 | Signed-off-by: Brian King <brking@linux.vnet.ibm.com> | |
17 | Signed-off-by: Olaf Hering <olh@suse.de> | |
18 | --- | |
19 | ||
20 | arch/powerpc/kernel/rtas.c | 10 ++++++++-- | |
21 | 1 file changed, 8 insertions(+), 2 deletions(-) | |
22 | ||
23 | --- a/arch/powerpc/kernel/rtas.c | |
24 | +++ b/arch/powerpc/kernel/rtas.c | |
25 | @@ -46,6 +46,7 @@ EXPORT_SYMBOL(rtas); | |
26 | ||
27 | struct rtas_suspend_me_data { | |
28 | atomic_t working; /* number of cpus accessing this struct */ | |
29 | + atomic_t done; | |
30 | int token; /* ibm,suspend-me */ | |
31 | int error; | |
32 | struct completion *complete; /* wait on this until working == 0 */ | |
33 | @@ -663,7 +664,7 @@ static int ibm_suspend_me_token = RTAS_U | |
34 | #ifdef CONFIG_PPC_PSERIES | |
35 | static void rtas_percpu_suspend_me(void *info) | |
36 | { | |
37 | - long rc; | |
38 | + long rc = H_SUCCESS; | |
39 | unsigned long msr_save; | |
40 | int cpu; | |
41 | struct rtas_suspend_me_data *data = | |
42 | @@ -675,7 +676,8 @@ static void rtas_percpu_suspend_me(void | |
43 | msr_save = mfmsr(); | |
44 | mtmsr(msr_save & ~(MSR_EE)); | |
45 | ||
46 | - rc = plpar_hcall_norets(H_JOIN); | |
47 | + while (rc == H_SUCCESS && !atomic_read(&data->done)) | |
48 | + rc = plpar_hcall_norets(H_JOIN); | |
49 | ||
50 | mtmsr(msr_save); | |
51 | ||
52 | @@ -698,6 +700,9 @@ static void rtas_percpu_suspend_me(void | |
53 | smp_processor_id(), rc); | |
54 | data->error = rc; | |
55 | } | |
56 | + | |
57 | + atomic_set(&data->done, 1); | |
58 | + | |
59 | /* This cpu did the suspend or got an error; in either case, | |
60 | * we need to prod all other other cpus out of join state. | |
61 | * Extra prods are harmless. | |
62 | @@ -740,6 +745,7 @@ static int rtas_ibm_suspend_me(struct rt | |
63 | } | |
64 | ||
65 | atomic_set(&data.working, 0); | |
66 | + atomic_set(&data.done, 0); | |
67 | data.token = rtas_token("ibm,suspend-me"); | |
68 | data.error = 0; | |
69 | data.complete = &done; |