1 From e7ee2c089e94067d68475990bdeed211c8852917 Mon Sep 17 00:00:00 2001
2 From: Eric Ren <zren@suse.com>
3 Date: Tue, 10 Jan 2017 16:57:33 -0800
4 Subject: ocfs2: fix crash caused by stale lvb with fsdlm plugin
6 From: Eric Ren <zren@suse.com>
8 commit e7ee2c089e94067d68475990bdeed211c8852917 upstream.
10 The crash happens rather often when we reset some cluster nodes while
11 nodes contend fiercely to do truncate and append.
13 The crash backtrace is below:
15 dlm: C21CBDA5E0774F4BA5A9D4F317717495: dlm_recover_grant 1 locks on 971 resources
16 dlm: C21CBDA5E0774F4BA5A9D4F317717495: dlm_recover 9 generation 5 done: 4 ms
17 ocfs2: Begin replay journal (node 318952601, slot 2) on device (253,18)
18 ocfs2: End replay journal (node 318952601, slot 2) on device (253,18)
19 ocfs2: Beginning quota recovery on device (253,18) for slot 2
20 ocfs2: Finishing quota recovery on device (253,18) for slot 2
21 (truncate,30154,1):ocfs2_truncate_file:470 ERROR: bug expression: le64_to_cpu(fe->i_size) != i_size_read(inode)
22 (truncate,30154,1):ocfs2_truncate_file:470 ERROR: Inode 290321, inode i_size = 732 != di i_size = 937, i_flags = 0x1
23 ------------[ cut here ]------------
24 kernel BUG at /usr/src/linux/fs/ocfs2/file.c:470!
25 invalid opcode: 0000 [#1] SMP
26 Modules linked in: ocfs2_stack_user(OEN) ocfs2(OEN) ocfs2_nodemanager ocfs2_stackglue(OEN) quota_tree dlm(OEN) configfs fuse sd_mod iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi af_packet iscsi_ibft iscsi_boot_sysfs softdog xfs libcrc32c ppdev parport_pc pcspkr parport joydev virtio_balloon virtio_net i2c_piix4 acpi_cpufreq button processor ext4 crc16 jbd2 mbcache ata_generic cirrus virtio_blk ata_piix drm_kms_helper ahci syscopyarea libahci sysfillrect sysimgblt fb_sys_fops ttm floppy libata drm virtio_pci virtio_ring uhci_hcd virtio ehci_hcd usbcore serio_raw usb_common sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua scsi_mod autofs4
27 Supported: No, Unsupported modules are loaded
28 CPU: 1 PID: 30154 Comm: truncate Tainted: G OE N 4.4.21-69-default #1
29 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20151112_172657-sheep25 04/01/2014
30 task: ffff88004ff6d240 ti: ffff880074e68000 task.ti: ffff880074e68000
31 RIP: 0010:[<ffffffffa05c8c30>] [<ffffffffa05c8c30>] ocfs2_truncate_file+0x640/0x6c0 [ocfs2]
32 RSP: 0018:ffff880074e6bd50 EFLAGS: 00010282
33 RAX: 0000000000000074 RBX: 000000000000029e RCX: 0000000000000000
34 RDX: 0000000000000001 RSI: 0000000000000246 RDI: 0000000000000246
35 RBP: ffff880074e6bda8 R08: 000000003675dc7a R09: ffffffff82013414
36 R10: 0000000000034c50 R11: 0000000000000000 R12: ffff88003aab3448
37 R13: 00000000000002dc R14: 0000000000046e11 R15: 0000000000000020
38 FS: 00007f839f965700(0000) GS:ffff88007fc80000(0000) knlGS:0000000000000000
39 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
40 CR2: 00007f839f97e000 CR3: 0000000036723000 CR4: 00000000000006e0
42 ocfs2_setattr+0x698/0xa90 [ocfs2]
43 notify_change+0x1ae/0x380
45 do_sys_ftruncate.constprop.11+0x108/0x160
46 entry_SYSCALL_64_fastpath+0x12/0x6d
47 Code: 24 28 ba d6 01 00 00 48 c7 c6 30 43 62 a0 8b 41 2c 89 44 24 08 48 8b 41 20 48 c7 c1 78 a3 62 a0 48 89 04 24 31 c0 e8 a0 97 f9 ff <0f> 0b 3d 00 fe ff ff 0f 84 ab fd ff ff 83 f8 fc 0f 84 a2 fd ff
48 RIP [<ffffffffa05c8c30>] ocfs2_truncate_file+0x640/0x6c0 [ocfs2]
50 It's because ocfs2_inode_lock() gets us a stale LVB in which the i_size is
51 not equal to the disk i_size. We mistakenly trust the LVB because the
52 underlying fsdlm dlm_lock() doesn't set lkb_sbflags with
53 DLM_SBF_VALNOTVALID properly for us. But why?
55 The current code tries to downconvert the lock without the DLM_LKF_VALBLK flag
56 to tell o2cb not to update the RSB's LVB if it's a PR->NULL conversion, even
57 if the lock resource type needs LVB. This is not the right way for fsdlm.
60 The fsdlm plugin behaves differently on DLM_LKF_VALBLK: it depends on
61 DLM_LKF_VALBLK to decide if we care about the LVB in the LKB. If
62 DLM_LKF_VALBLK is not set, fsdlm will skip recovering the RSB's LVB from
63 this lkb and setting the right DLM_SBF_VALNOTVALID appropriately when a node fails.
66 The following diagram briefly illustrates how this crash happens:
68 RSB1 is inode metadata lock resource with LOCK_TYPE_USES_LVB;
74 RSB1(master): NULL->EX
75 ocfs2_downconvert_lock(PR->NULL, set_lvb==0)
76 ocfs2_dlm_lock(no DLM_LKF_VALBLK)
78 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
80 dlm_lock(no DLM_LKF_VALBLK)
81 convert_lock(overwrite lkb->lkb_exflags
82 with no DLM_LKF_VALBLK)
89 /* The LVB is not trustable if the node with EX fails and
90 * no lock >= PR is left. We should set RSB_VALNOTVALID for RSB1. */
93 if (!(lkb_exflags & DLM_LKF_VALBLK)) /* This means we miss the chance to
94 return; * invalidate the LVB here. */
100 RSB1(become master from recovery)
103 ocfs2_inode_lock(NULL->EX)
104 /* dlm_lock() returns the stale lvb without setting DLM_SBF_VALNOTVALID */
105 ocfs2_meta_lvb_is_trustable() returns 1 /* so we don't refresh inode from disk */
106 ocfs2_truncate_file()
107 mlog_bug_on_msg(disk isize != i_size_read(inode)) /* crash! */
109 The fix is quite straightforward. We keep setting the DLM_LKF_VALBLK flag
110 for dlm_lock() if the lock resource type needs LVB and the fsdlm plugin is used.
113 Link: http://lkml.kernel.org/r/1481275846-6604-1-git-send-email-zren@suse.com
114 Signed-off-by: Eric Ren <zren@suse.com>
115 Reviewed-by: Joseph Qi <jiangqi903@gmail.com>
116 Cc: Mark Fasheh <mfasheh@versity.com>
117 Cc: Joel Becker <jlbec@evilplan.org>
118 Cc: Junxiao Bi <junxiao.bi@oracle.com>
119 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
120 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
121 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
124 fs/ocfs2/dlmglue.c | 10 ++++++++++
125 fs/ocfs2/stackglue.c | 6 ++++++
126 fs/ocfs2/stackglue.h | 3 +++
127 3 files changed, 19 insertions(+)
129 --- a/fs/ocfs2/dlmglue.c
130 +++ b/fs/ocfs2/dlmglue.c
131 @@ -3321,6 +3321,16 @@ static int ocfs2_downconvert_lock(struct
132 mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
133 lockres->l_level, new_level);
136 + * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always
137 + * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that
138 + * we can recover correctly from node failure. Otherwise, we may get
139 + * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
141 + if (!ocfs2_is_o2cb_active() &&
142 + lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
146 dlm_flags |= DLM_LKF_VALBLK;
148 --- a/fs/ocfs2/stackglue.c
149 +++ b/fs/ocfs2/stackglue.c
150 @@ -48,6 +48,12 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_
152 static struct ocfs2_stack_plugin *active_stack;
154 +inline int ocfs2_is_o2cb_active(void)
156 + return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB);
158 +EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active);
160 static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
162 struct ocfs2_stack_plugin *p;
163 --- a/fs/ocfs2/stackglue.h
164 +++ b/fs/ocfs2/stackglue.h
165 @@ -298,4 +298,7 @@ void ocfs2_stack_glue_set_max_proto_vers
166 int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
167 void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
169 +/* In ocfs2_downconvert_lock(), we need to know which stack we are using */
170 +int ocfs2_is_o2cb_active(void);
172 #endif /* STACKGLUE_H */