]>
Commit | Line | Data |
---|---|---|
59d2274f DW |
1 | #!/usr/bin/env python3 |
2 | ||
3 | # Read ftrace input, looking for XFS buffer deadlocks. | |
4 | # | |
5 | # Copyright (C) 2016 Oracle. All Rights Reserved. | |
6 | # | |
7 | # Author: Darrick J. Wong <darrick.wong@oracle.com> | |
8 | # | |
9 | # This program is free software; you can redistribute it and/or | |
10 | # modify it under the terms of the GNU General Public License | |
11 | # as published by the Free Software Foundation; either version 2 | |
12 | # of the License, or (at your option) any later version. | |
13 | # | |
14 | # This program is distributed in the hope that it would be useful, | |
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 | # GNU General Public License for more details. | |
18 | # | |
19 | # You should have received a copy of the GNU General Public License | |
20 | # along with this program; if not, write the Free Software Foundation, | |
21 | # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. | |
22 | # | |
23 | # Rough guide to using this script: | |
24 | # Collect ftrace data from a deadlock: | |
25 | # | |
26 | # # trace-cmd record -e 'xfs_buf_*lock*' <other traces> & | |
27 | # <run command, hang system>^Z | |
28 | # # killall -INT trace-cmd | |
29 | # <wait for trace-cmd to spit out trace.dat> | |
30 | # | |
31 | # Now analyze the captured trace data: | |
32 | # | |
33 | # # trace-cmd report | xfsbuflock.py | |
34 | # === fsx-14956 === | |
35 | # <trace data> | |
36 | # 3732.005575: xfs_buf_trylock_fail: dev 8:16 bno 0x1 nblks 0x1 hold 4 \ | |
37 | # pincount 1 lock 0 flags DONE|KMEM caller 0xc009af36s | |
38 | # Locked buffers: | |
39 | # dev 8:16 bno 0x64c371 nblks 0x1 lock 1 owner fsx-14956@3732.005567 | |
40 | # waiting: fsx-14954 | |
41 | # dev 8:16 bno 0x64c380 nblks 0x8 lock 1 owner fsx-14956@3732.005571 | |
42 | # dev 8:16 bno 0x64c378 nblks 0x8 lock 1 owner fsx-14956@3732.005570 | |
43 | # === fsx-14954 === | |
44 | # <trace data> | |
45 | # 3732.005592: xfs_buf_trylock_fail: dev 8:16 bno 0x64c371 nblks 0x1 hold 4 \ | |
46 | # pincount 1 lock 0 flags ASYNC|DONE|KMEM caller 0xc009af36s | |
47 | # Locked buffers: | |
48 | # dev 8:16 bno 0x8 nblks 0x8 lock 1 owner fsx-14954@3732.005583 | |
49 | # dev 8:16 bno 0x1 nblks 0x1 lock 1 owner fsx-14954@3732.005574 | |
50 | # waiting: fsx-14956 | |
51 | # waiting: fsx-14957 | |
52 | # waiting: fsx-14958 | |
53 | # dev 8:16 bno 0x10 nblks 0x8 lock 1 owner fsx-14954@3732.005585 | |
54 | # | |
55 | # As you can see, fsx-14596 is locking AGFs in violation of the locking | |
56 | # order rules. | |
57 | ||
58 | import sys | |
59 | import fileinput | |
60 | from collections import namedtuple | |
61 | ||
62 | NR_BACKTRACE = 50 | |
63 | ||
64 | class Process: | |
65 | def __init__(self, pid): | |
66 | self.pid = pid; | |
67 | self.bufs = set() | |
68 | self.locked_bufs = set() | |
69 | self.backtrace = [] | |
70 | ||
71 | def dump(self): | |
72 | print('=== %s ===' % self.pid) | |
73 | for bt in self.backtrace: | |
74 | print('%f: %s' % (bt.time, bt.descr)) | |
75 | print('Locked buffers:') | |
76 | for buf in self.locked_bufs: | |
77 | buf.dump() | |
78 | ||
79 | class Buffer: | |
80 | def __init__(self, dev, bno, blen): | |
81 | self.dev = dev | |
82 | self.bno = int(bno, 0) | |
83 | self.blen = int(blen, 0) | |
84 | self.locked = False | |
85 | self.locktime = None | |
86 | self.owner = None | |
87 | self.waiters = set() | |
ece9fa6e | 88 | self.lockline = 0 |
59d2274f DW |
89 | |
90 | def trylock(self, process, time): | |
c2b571f1 DW |
91 | if not self.locked: |
92 | self.lockdone(process, time) | |
59d2274f DW |
93 | |
94 | def lockdone(self, process, time): | |
95 | if self.locked: | |
ece9fa6e DW |
96 | print('Buffer 0x%x already locked at line %d? (line %d)' % \ |
97 | (self.bno, self.lockline, nr)) | |
59d2274f DW |
98 | # process.dump() |
99 | # self.dump() | |
100 | # assert False | |
101 | if process in self.waiters: | |
102 | self.waiters.remove(process) | |
103 | self.locked = True | |
104 | self.owner = process | |
105 | self.locktime = time | |
ece9fa6e | 106 | self.lockline = nr |
59d2274f DW |
107 | process.locked_bufs.add(self) |
108 | process.bufs.add(self) | |
109 | locked_buffers.add(self) | |
110 | ||
111 | def waitlock(self, process): | |
112 | self.waiters.add(process) | |
113 | ||
114 | def unlock(self): | |
115 | self.locked = False | |
116 | if self in locked_buffers: | |
117 | locked_buffers.remove(self) | |
118 | if self.owner is not None and \ | |
119 | self in self.owner.locked_bufs: | |
120 | self.owner.locked_bufs.remove(self) | |
121 | ||
122 | def dump(self): | |
123 | if self.owner is not None: | |
ece9fa6e DW |
124 | pid = '%s@%f (line %d)' % \ |
125 | (self.owner.pid, self.locktime, self.lockline) | |
59d2274f DW |
126 | else: |
127 | pid = '' | |
128 | print('dev %s bno 0x%x nblks 0x%x lock %d owner %s' % \ | |
129 | (self.dev, self.bno, self.blen, self.locked, \ | |
130 | pid)) | |
131 | for proc in self.waiters: | |
132 | print(' waiting: %s' % proc.pid) | |
133 | ||
134 | Event = namedtuple('Event', 'time descr') | |
135 | ||
136 | # Read ftrace input, looking for events and for buffer lock info | |
137 | processes = {} | |
138 | buffers = {} | |
139 | locked_buffers = set() | |
140 | ||
141 | def getbuf(toks): | |
142 | if int(toks[7], 0) == 18446744073709551615: | |
143 | return None | |
144 | bufkey = ' '.join(toks[4:10]) | |
145 | if bufkey in buffers: | |
146 | return buffers[bufkey] | |
147 | buf = Buffer(toks[5], toks[7], toks[9]) | |
148 | buffers[bufkey] = buf | |
149 | return buf | |
150 | ||
151 | nr = 0 | |
152 | for line in fileinput.input(): | |
153 | nr += 1 | |
154 | toks = line.split() | |
155 | if len(toks) < 4: | |
156 | continue | |
157 | pid = toks[0] | |
f231685e DW |
158 | try: |
159 | time = float(toks[2][:-1]) | |
160 | except: | |
161 | continue | |
59d2274f DW |
162 | fn = toks[3][:-1] |
163 | ||
164 | if pid in processes: | |
165 | proc = processes[pid] | |
166 | else: | |
167 | proc = Process(pid) | |
168 | processes[pid] = proc | |
169 | ||
170 | if fn == 'xfs_buf_unlock' or fn == 'xfs_buf_item_unlock_stale': | |
171 | buf = getbuf(toks) | |
172 | if buf is not None: | |
173 | buf.unlock() | |
174 | elif fn == 'xfs_buf_lock_done': | |
175 | buf = getbuf(toks) | |
176 | if buf is not None: | |
177 | buf.lockdone(proc, time) | |
178 | elif fn == 'xfs_buf_lock': | |
179 | buf = getbuf(toks) | |
180 | if buf is not None: | |
181 | buf.waitlock(proc) | |
182 | elif fn == 'xfs_buf_trylock': | |
183 | buf = getbuf(toks) | |
184 | if buf is not None: | |
185 | buf.trylock(proc, time) | |
186 | elif fn == 'xfs_buf_item_unlock': | |
187 | pass | |
188 | else: | |
189 | e = Event(time, ' '.join(toks[3:])) | |
190 | proc.backtrace.append(e) | |
191 | if len(proc.backtrace) > NR_BACKTRACE: | |
192 | proc.backtrace.pop(0) | |
193 | ||
194 | deadlocked = set() | |
195 | for buf in locked_buffers: | |
196 | deadlocked.add(buf.owner) | |
197 | ||
198 | for proc in deadlocked: | |
199 | proc.dump() | |
200 | ||
201 | sys.exit(0) | |
202 | ||
203 | for key in buffers: | |
204 | buf = buffers[key] | |
205 | if buf.locked: | |
206 | print('dev %s bno 0x%x len 0x%x owner %s' % (buf.dev, buf.bno, buf.blen, buf.owner.pid)) | |
207 | else: | |
208 | print('dev %s bno 0x%x len 0x%x' % (buf.dev, buf.bno, buf.blen)) | |
209 | ||
210 | sys.exit(0) | |
211 | ||
212 | for pid in processes: | |
213 | proc = processes[pid] |