1 From: Tao Ma <tao.ma@oracle.com>
2 Subject: [PATCH 11/16] ocfs2: Add xattr bucket iteration for large numbers of EAs
3 Patch-mainline: 2.6.28?
6 Ocfs2 breaks up xattr index tree leaves into 4k regions, called buckets.
7 Attributes are stored within a given bucket, depending on hash value.
9 After a discussion with Mark, we decided that the per-bucket index
10 (xe_entry[]) would only exist in the 1st block of a bucket. Likewise,
11 name/value pairs will not straddle more than one block. This allows the
12 majority of operations to work directly on the buffer heads in a leaf block.
14 This patch adds code to iterate the buckets in an EA. A new abstraction,
15 ocfs2_xattr_bucket, is added. It records the bhs in this bucket and its
16 ocfs2_xattr_header. This keeps the code neat, improving readability.
18 Signed-off-by: Tao Ma <tao.ma@oracle.com>
19 Signed-off-by: Mark Fasheh <mfasheh@suse.com>
21 fs/ocfs2/ocfs2_fs.h | 35 +++++++-
22 fs/ocfs2/xattr.c | 255 ++++++++++++++++++++++++++++++++++++++++++++++++++-
23 fs/ocfs2/xattr.h | 9 ++
24 3 files changed, 293 insertions(+), 6 deletions(-)
26 diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
27 index 98e1f8b..8d5e72f 100644
28 --- a/fs/ocfs2/ocfs2_fs.h
29 +++ b/fs/ocfs2/ocfs2_fs.h
30 @@ -755,8 +755,13 @@ struct ocfs2_xattr_header {
31 __le16 xh_count; /* contains the count of how
32 many records are in the
33 local xattr storage. */
34 - __le16 xh_reserved1;
35 - __le32 xh_reserved2;
36 + __le16 xh_free_start; /* current offset for storing
38 + __le16 xh_name_value_len; /* total length of name/value
39 + pairs in this bucket. */
40 + __le16 xh_num_buckets; /* bucket nums in one extent
41 + record, only valid in the
44 struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
46 @@ -793,6 +798,10 @@ struct ocfs2_xattr_tree_root {
47 #define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \
50 +#define OCFS2_XATTR_BUCKET_SIZE 4096
51 +#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \
52 + / OCFS2_MIN_BLOCKSIZE)
55 * On disk structure for xattr block.
57 @@ -963,6 +972,17 @@ static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index)
62 +static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
66 + size = sb->s_blocksize -
67 + offsetof(struct ocfs2_xattr_block,
68 + xb_attrs.xb_root.xt_list.l_recs);
70 + return size / sizeof(struct ocfs2_extent_rec);
73 static inline int ocfs2_fast_symlink_chars(int blocksize)
75 @@ -1046,6 +1066,17 @@ static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index)
80 +static inline int ocfs2_xattr_recs_per_xb(int blocksize)
85 + offsetof(struct ocfs2_xattr_block,
86 + xb_attrs.xb_root.xt_list.l_recs);
88 + return size / sizeof(struct ocfs2_extent_rec);
90 #endif /* __KERNEL__ */
93 diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
94 index 3685cc6..ed41c15 100644
95 --- a/fs/ocfs2/xattr.c
96 +++ b/fs/ocfs2/xattr.c
100 #include "buffer_head_io.h"
105 @@ -60,6 +61,11 @@ struct ocfs2_xattr_def_value_root {
106 struct ocfs2_extent_rec er;
109 +struct ocfs2_xattr_bucket {
110 + struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
111 + struct ocfs2_xattr_header *xh;
114 #define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
115 #define OCFS2_XATTR_INLINE_SIZE 80
117 @@ -115,6 +121,11 @@ struct ocfs2_xattr_search {
121 +static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
122 + struct ocfs2_xattr_tree_root *xt,
124 + size_t buffer_size);
126 static inline struct xattr_handler *ocfs2_xattr_handler(int name_index)
128 struct xattr_handler *handler = NULL;
129 @@ -499,7 +510,7 @@ static int ocfs2_xattr_block_list(struct inode *inode,
132 struct buffer_head *blk_bh = NULL;
133 - struct ocfs2_xattr_header *header = NULL;
134 + struct ocfs2_xattr_block *xb;
137 if (!di->i_xattr_loc)
138 @@ -519,10 +530,17 @@ static int ocfs2_xattr_block_list(struct inode *inode,
142 - header = &((struct ocfs2_xattr_block *)blk_bh->b_data)->
143 - xb_attrs.xb_header;
144 + xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
146 - ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size);
147 + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
148 + struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
149 + ret = ocfs2_xattr_list_entries(inode, header,
150 + buffer, buffer_size);
152 + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
153 + ret = ocfs2_xattr_tree_list_index_block(inode, xt,
154 + buffer, buffer_size);
159 @@ -1939,3 +1957,232 @@ cleanup:
164 + * Find the xattr extent rec which may contain name_hash.
165 + * e_cpos will be the first name hash of the xattr rec.
166 + * el must be the ocfs2_xattr_block.xb_attrs.xb_root.xt_list.
168 +static int ocfs2_xattr_get_rec(struct inode *inode,
173 + struct ocfs2_extent_list *el)
176 + struct buffer_head *eb_bh = NULL;
177 + struct ocfs2_extent_block *eb;
178 + struct ocfs2_extent_rec *rec = NULL;
181 + if (el->l_tree_depth) {
182 + ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh);
188 + eb = (struct ocfs2_extent_block *) eb_bh->b_data;
191 + if (el->l_tree_depth) {
192 + ocfs2_error(inode->i_sb,
193 + "Inode %lu has non zero tree depth in "
194 + "xattr tree block %llu\n", inode->i_ino,
195 + (unsigned long long)eb_bh->b_blocknr);
201 + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
202 + rec = &el->l_recs[i];
204 + if (le32_to_cpu(rec->e_cpos) <= name_hash) {
205 + e_blkno = le64_to_cpu(rec->e_blkno);
211 + ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
212 + "record (%u, %u, 0) in xattr", inode->i_ino,
213 + le32_to_cpu(rec->e_cpos),
214 + ocfs2_rec_clusters(el, rec));
219 + *p_blkno = le64_to_cpu(rec->e_blkno);
220 + *num_clusters = le16_to_cpu(rec->e_leaf_clusters);
222 + *e_cpos = le32_to_cpu(rec->e_cpos);
228 +typedef int (xattr_bucket_func)(struct inode *inode,
229 + struct ocfs2_xattr_bucket *bucket,
232 +static int ocfs2_iterate_xattr_buckets(struct inode *inode,
235 + xattr_bucket_func *func,
239 + int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
240 + u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
241 + u32 num_buckets = clusters * bpc;
242 + struct ocfs2_xattr_bucket bucket;
244 + memset(&bucket, 0, sizeof(bucket));
246 + mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
249 + for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
250 + ret = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
251 + blkno, blk_per_bucket,
252 + bucket.bhs, OCFS2_BH_CACHED, inode);
258 + bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
260 + * The real bucket num in this series of blocks is stored
261 + * in the 1st bucket.
264 + num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
266 + mlog(0, "iterating xattr bucket %llu\n", blkno);
268 + ret = func(inode, &bucket, para);
275 + for (j = 0; j < blk_per_bucket; j++)
276 + brelse(bucket.bhs[j]);
277 + memset(&bucket, 0, sizeof(bucket));
281 + for (j = 0; j < blk_per_bucket; j++)
282 + brelse(bucket.bhs[j]);
287 +struct ocfs2_xattr_tree_list {
289 + size_t buffer_size;
292 +static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
293 + struct ocfs2_xattr_header *xh,
300 + if (index < 0 || index >= le16_to_cpu(xh->xh_count))
303 + name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
305 + *block_off = name_offset >> inode->i_sb->s_blocksize_bits;
306 + *new_offset = name_offset % inode->i_sb->s_blocksize;
311 +static int ocfs2_list_xattr_bucket(struct inode *inode,
312 + struct ocfs2_xattr_bucket *bucket,
316 + struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
318 + int i, block_off, new_offset;
320 + for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
321 + struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
322 + struct xattr_handler *handler =
323 + ocfs2_xattr_handler(ocfs2_xattr_get_type(entry));
326 + ret = ocfs2_xattr_bucket_get_name_value(inode,
333 + size = handler->list(inode, xl->buffer, xl->buffer_size,
334 + bucket->bhs[block_off]->b_data +
336 + entry->xe_name_len);
338 + if (size > xl->buffer_size)
340 + xl->buffer += size;
342 + xl->buffer_size -= size;
349 +static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
350 + struct ocfs2_xattr_tree_root *xt,
352 + size_t buffer_size)
354 + struct ocfs2_extent_list *el = &xt->xt_list;
356 + u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
358 + struct ocfs2_xattr_tree_list xl = {
360 + .buffer_size = buffer_size,
363 + if (le16_to_cpu(el->l_next_free_rec) == 0)
366 + while (name_hash > 0) {
367 + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
368 + &e_cpos, &num_clusters, el);
374 + ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
375 + ocfs2_list_xattr_bucket,
385 + name_hash = e_cpos - 1;
388 + ret = buffer_size - xl.buffer_size;
392 diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
393 index f565c64..a69c8aa 100644
394 --- a/fs/ocfs2/xattr.h
395 +++ b/fs/ocfs2/xattr.h
396 @@ -55,4 +55,13 @@ extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
397 extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
398 extern struct xattr_handler *ocfs2_xattr_handlers[];
400 +static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
402 + return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
405 +static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
407 + return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
409 #endif /* OCFS2_XATTR_H */