]>
Commit | Line | Data |
---|---|---|
59d2274f DW |
1 | #!/usr/bin/env python3 |
2 | ||
3 | # Read ftrace input, looking for XFS buffer deadlocks. | |
4 | # | |
5 | # Copyright (C) 2016 Oracle. All Rights Reserved. | |
6 | # | |
7 | # Author: Darrick J. Wong <darrick.wong@oracle.com> | |
8 | # | |
9 | # This program is free software; you can redistribute it and/or | |
10 | # modify it under the terms of the GNU General Public License | |
11 | # as published by the Free Software Foundation; either version 2 | |
12 | # of the License, or (at your option) any later version. | |
13 | # | |
14 | # This program is distributed in the hope that it would be useful, | |
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 | # GNU General Public License for more details. | |
18 | # | |
19 | # You should have received a copy of the GNU General Public License | |
20 | # along with this program; if not, write the Free Software Foundation, | |
21 | # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. | |
22 | # | |
23 | # Rough guide to using this script: | |
24 | # Collect ftrace data from a deadlock: | |
25 | # | |
26 | # # trace-cmd record -e 'xfs_buf_*lock*' <other traces> & | |
27 | # <run command, hang system>^Z | |
28 | # # killall -INT trace-cmd | |
29 | # <wait for trace-cmd to spit out trace.dat> | |
30 | # | |
31 | # Now analyze the captured trace data: | |
32 | # | |
33 | # # trace-cmd report | xfsbuflock.py | |
34 | # === fsx-14956 === | |
35 | # <trace data> | |
36 | # 3732.005575: xfs_buf_trylock_fail: dev 8:16 bno 0x1 nblks 0x1 hold 4 \ | |
37 | # pincount 1 lock 0 flags DONE|KMEM caller 0xc009af36s | |
38 | # Locked buffers: | |
39 | # dev 8:16 bno 0x64c371 nblks 0x1 lock 1 owner fsx-14956@3732.005567 | |
40 | # waiting: fsx-14954 | |
41 | # dev 8:16 bno 0x64c380 nblks 0x8 lock 1 owner fsx-14956@3732.005571 | |
42 | # dev 8:16 bno 0x64c378 nblks 0x8 lock 1 owner fsx-14956@3732.005570 | |
43 | # === fsx-14954 === | |
44 | # <trace data> | |
45 | # 3732.005592: xfs_buf_trylock_fail: dev 8:16 bno 0x64c371 nblks 0x1 hold 4 \ | |
46 | # pincount 1 lock 0 flags ASYNC|DONE|KMEM caller 0xc009af36s | |
47 | # Locked buffers: | |
48 | # dev 8:16 bno 0x8 nblks 0x8 lock 1 owner fsx-14954@3732.005583 | |
49 | # dev 8:16 bno 0x1 nblks 0x1 lock 1 owner fsx-14954@3732.005574 | |
50 | # waiting: fsx-14956 | |
51 | # waiting: fsx-14957 | |
52 | # waiting: fsx-14958 | |
53 | # dev 8:16 bno 0x10 nblks 0x8 lock 1 owner fsx-14954@3732.005585 | |
54 | # | |
55 | # As you can see, fsx-14596 is locking AGFs in violation of the locking | |
56 | # order rules. | |
57 | ||
58 | import sys | |
59 | import fileinput | |
60 | from collections import namedtuple | |
61 | ||
62 | NR_BACKTRACE = 50 | |
63 | ||
64 | class Process: | |
65 | def __init__(self, pid): | |
66 | self.pid = pid; | |
67 | self.bufs = set() | |
68 | self.locked_bufs = set() | |
69 | self.backtrace = [] | |
70 | ||
71 | def dump(self): | |
72 | print('=== %s ===' % self.pid) | |
73 | for bt in self.backtrace: | |
74 | print('%f: %s' % (bt.time, bt.descr)) | |
75 | print('Locked buffers:') | |
76 | for buf in self.locked_bufs: | |
77 | buf.dump() | |
78 | ||
79 | class Buffer: | |
80 | def __init__(self, dev, bno, blen): | |
81 | self.dev = dev | |
82 | self.bno = int(bno, 0) | |
83 | self.blen = int(blen, 0) | |
84 | self.locked = False | |
85 | self.locktime = None | |
86 | self.owner = None | |
87 | self.waiters = set() | |
88 | ||
89 | def trylock(self, process, time): | |
90 | self.lockdone(process, time) | |
91 | ||
92 | def lockdone(self, process, time): | |
93 | if self.locked: | |
94 | print('Buffer already locked on line %d?!' % nr) | |
95 | # process.dump() | |
96 | # self.dump() | |
97 | # assert False | |
98 | if process in self.waiters: | |
99 | self.waiters.remove(process) | |
100 | self.locked = True | |
101 | self.owner = process | |
102 | self.locktime = time | |
103 | process.locked_bufs.add(self) | |
104 | process.bufs.add(self) | |
105 | locked_buffers.add(self) | |
106 | ||
107 | def waitlock(self, process): | |
108 | self.waiters.add(process) | |
109 | ||
110 | def unlock(self): | |
111 | self.locked = False | |
112 | if self in locked_buffers: | |
113 | locked_buffers.remove(self) | |
114 | if self.owner is not None and \ | |
115 | self in self.owner.locked_bufs: | |
116 | self.owner.locked_bufs.remove(self) | |
117 | ||
118 | def dump(self): | |
119 | if self.owner is not None: | |
120 | pid = '%s@%f' % (self.owner.pid, self.locktime) | |
121 | else: | |
122 | pid = '' | |
123 | print('dev %s bno 0x%x nblks 0x%x lock %d owner %s' % \ | |
124 | (self.dev, self.bno, self.blen, self.locked, \ | |
125 | pid)) | |
126 | for proc in self.waiters: | |
127 | print(' waiting: %s' % proc.pid) | |
128 | ||
129 | Event = namedtuple('Event', 'time descr') | |
130 | ||
131 | # Read ftrace input, looking for events and for buffer lock info | |
132 | processes = {} | |
133 | buffers = {} | |
134 | locked_buffers = set() | |
135 | ||
136 | def getbuf(toks): | |
137 | if int(toks[7], 0) == 18446744073709551615: | |
138 | return None | |
139 | bufkey = ' '.join(toks[4:10]) | |
140 | if bufkey in buffers: | |
141 | return buffers[bufkey] | |
142 | buf = Buffer(toks[5], toks[7], toks[9]) | |
143 | buffers[bufkey] = buf | |
144 | return buf | |
145 | ||
146 | nr = 0 | |
147 | for line in fileinput.input(): | |
148 | nr += 1 | |
149 | toks = line.split() | |
150 | if len(toks) < 4: | |
151 | continue | |
152 | pid = toks[0] | |
153 | time = float(toks[2][:-1]) | |
154 | fn = toks[3][:-1] | |
155 | ||
156 | if pid in processes: | |
157 | proc = processes[pid] | |
158 | else: | |
159 | proc = Process(pid) | |
160 | processes[pid] = proc | |
161 | ||
162 | if fn == 'xfs_buf_unlock' or fn == 'xfs_buf_item_unlock_stale': | |
163 | buf = getbuf(toks) | |
164 | if buf is not None: | |
165 | buf.unlock() | |
166 | elif fn == 'xfs_buf_lock_done': | |
167 | buf = getbuf(toks) | |
168 | if buf is not None: | |
169 | buf.lockdone(proc, time) | |
170 | elif fn == 'xfs_buf_lock': | |
171 | buf = getbuf(toks) | |
172 | if buf is not None: | |
173 | buf.waitlock(proc) | |
174 | elif fn == 'xfs_buf_trylock': | |
175 | buf = getbuf(toks) | |
176 | if buf is not None: | |
177 | buf.trylock(proc, time) | |
178 | elif fn == 'xfs_buf_item_unlock': | |
179 | pass | |
180 | else: | |
181 | e = Event(time, ' '.join(toks[3:])) | |
182 | proc.backtrace.append(e) | |
183 | if len(proc.backtrace) > NR_BACKTRACE: | |
184 | proc.backtrace.pop(0) | |
185 | ||
186 | deadlocked = set() | |
187 | for buf in locked_buffers: | |
188 | deadlocked.add(buf.owner) | |
189 | ||
190 | for proc in deadlocked: | |
191 | proc.dump() | |
192 | ||
193 | sys.exit(0) | |
194 | ||
195 | for key in buffers: | |
196 | buf = buffers[key] | |
197 | if buf.locked: | |
198 | print('dev %s bno 0x%x len 0x%x owner %s' % (buf.dev, buf.bno, buf.blen, buf.owner.pid)) | |
199 | else: | |
200 | print('dev %s bno 0x%x len 0x%x' % (buf.dev, buf.bno, buf.blen)) | |
201 | ||
202 | sys.exit(0) | |
203 | ||
204 | for pid in processes: | |
205 | proc = processes[pid] |