ext4/mballoc.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
4  * Written by Alex Tomas <alex@clusterfs.com>
5  */
6
7
8 /*
9  * mballoc.c contains the multiblocks allocation routines
10  */
11
12 #include "ext4_jbd2.h"
13 #include "mballoc.h"
14 #include <linux/log2.h>
15 #include <linux/module.h>
16 #include <linux/slab.h>
17 #include <linux/nospec.h>
18 #include <linux/backing-dev.h>
19 #include <linux/freezer.h>
20 #include <trace/events/ext4.h>
21
22 /*
23  * MUSTDO:
24  *   - test ext4_ext_search_left() and ext4_ext_search_right()
25  *   - search for metadata in few groups
26  *
27  * TODO v4:
28  *   - normalization should take into account whether file is still open
29  *   - discard preallocations if no free space left (policy?)
30  *   - don't normalize tails
31  *   - quota
32  *   - reservation for superuser
33  *
34  * TODO v3:
35  *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
36  *   - track min/max extents in each group for better group selection
37  *   - mb_mark_used() may allocate chunk right after splitting buddy
38  *   - tree of groups sorted by number of free blocks
39  *   - error handling
40  */
41
42 /*
43  * An allocation request asks for multiple blocks near the specified
44  * goal (block) value.
45  *
46  * During the initialization phase of the allocator we decide to use
47  * group preallocation or inode preallocation depending on the size of
48  * the file. The size of the file is either the resulting file size we
49  * would have after allocation or the current file size, whichever
50  * is larger. If the size is less than sbi->s_mb_stream_request we
51  * use the group preallocation. The default value of
52  * s_mb_stream_request is 16 blocks. This can also be tuned via
53  * /sys/fs/ext4/<partition>/mb_stream_req. The value is expressed in
54  * terms of number of blocks.
55  *
56  * The main motivation for having small files use group preallocation is to
57  * ensure that we keep small files close together on the disk.
58  *
59  * In the first stage the allocator looks at the inode prealloc list,
60  * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
61  * spaces for this particular inode. The inode prealloc space is
62  * represented as:
63  *
64  * pa_lstart -> the logical start block for this prealloc space
65  * pa_pstart -> the physical start block for this prealloc space
66  * pa_len    -> length for this prealloc space (in clusters)
67  * pa_free   ->  free space available in this prealloc space (in clusters)
68  *
69  * The inode preallocation space is selected by the _logical_ start
70  * block: only if the logical file block falls within the range of a
71  * prealloc space do we consume that prealloc space. This makes sure
72  * that we have contiguous physical blocks representing the file blocks.
73  *
74  * The important thing to note about inode prealloc space is that
75  * we don't modify the values associated with it except
76  * pa_free.
77  *
78  * If we are not able to find blocks in the inode prealloc space and if we
79  * have the group allocation flag set then we look at the locality group
80  * prealloc space. This is a per-CPU prealloc list, represented as
81  *
82  * ext4_sb_info.s_locality_groups[smp_processor_id()]
83  *
84  * The reason for having a per cpu locality group is to reduce the contention
85  * between CPUs. It is possible to get scheduled at this point.
86  *
87  * The locality group prealloc space is used looking at whether we have
88  * enough free space (pa_free) within the prealloc space.
89  *
90  * If we can't allocate blocks via inode prealloc and/or locality group
91  * prealloc then we look at the buddy cache. The buddy cache is represented
92  * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
93  * mapped to the buddy and bitmap information regarding different
94  * groups. The buddy information is attached to buddy cache inode so that
95  * we can access them through the page cache. The information for
96  * each group is loaded via ext4_mb_load_buddy.  It consists of the
97  * block bitmap and the buddy information, and is stored in the
98  * inode as:
99  *
100  *  {                        page                        }
101  *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
102  *
103  *
104  * one block each for bitmap and buddy information.  So for each group we
105  * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
106  * blocksize) blocks.  So it can hold information for groups_per_page
107  * groups, which is blocks_per_page/2.
108  *
109  * The buddy cache inode is not stored on disk. The inode is thrown
110  * away when the filesystem is unmounted.
111  *
112  * We look for the requested number of blocks in the buddy cache. If we were
113  * able to locate that many free blocks we return with additional information
114  * regarding the rest of the contiguous physical blocks available
115  *
116  * Before allocating blocks via the buddy cache we normalize the request.
117  * This ensures we ask for more blocks than we need. The extra blocks
118  * that we get after allocation are added to the respective prealloc
119  * list. In case of inode preallocation we follow a list of heuristics
120  * based on file size. This can be found in ext4_mb_normalize_request. If
121  * we are doing a group prealloc we try to normalize the request to
122  * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
123  * dependent on the cluster size; for non-bigalloc file systems, it is
124  * 512 blocks. This can be tuned via
125  * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
126  * terms of number of blocks. If we have mounted the file system with -O
127  * stripe=<value> option the group prealloc request is normalized to the
128  * smallest multiple of the stripe value (sbi->s_stripe) which is
129  * greater than the default mb_group_prealloc.
130  *
131  * The regular allocator (using the buddy cache) supports a few tunables.
132  *
133  * /sys/fs/ext4/<partition>/mb_min_to_scan
134  * /sys/fs/ext4/<partition>/mb_max_to_scan
135  * /sys/fs/ext4/<partition>/mb_order2_req
136  *
137  * The regular allocator uses the buddy scan only if the request length is a
138  * power of 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs.
139  * The value of s_mb_order2_reqs can be tuned via
140  * /sys/fs/ext4/<partition>/mb_order2_req.  If the request length is equal to
141  * the stripe size (sbi->s_stripe), we try to search for contiguous blocks in
142  * stripe-size units. This should result in better allocation on RAID setups.
143  * If not, we search the specific group's bitmap for the best extents. The
144  * tunables min_to_scan and max_to_scan control the behaviour here.
145  * min_to_scan indicates how long mballoc __must__ look for a best
146  * extent and max_to_scan indicates how long mballoc __can__ look for a
147  * best extent among the found extents. Searching for the blocks starts with
148  * the group specified as the goal value in allocation context via
149  * ac_g_ex. Each group is first checked based on the criteria whether it
150  * can be used for allocation. ext4_mb_good_group explains how the groups are
151  * checked.
152  *
153  * Both preallocation spaces are populated as described above. So the first
154  * request will hit the buddy cache, which results in the prealloc
155  * space getting filled. The prealloc space is then used for
156  * subsequent requests.
157  */
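/*
 * Illustration (editor's sketch, not part of mballoc.c): a small userspace
 * model of the policy described above.  Files below the stream-request
 * threshold go to the per-CPU group preallocation, larger files use
 * per-inode preallocation, and with -o stripe= the group prealloc size is
 * rounded up to a multiple of the stripe.  The constants mirror the
 * defaults quoted above; the helper names are hypothetical.
 */
#include <stdbool.h>

#define MB_STREAM_REQ_DEFAULT     16    /* blocks */
#define MB_GROUP_PREALLOC_DEFAULT 512   /* blocks, non-bigalloc */

static bool use_group_prealloc(unsigned long size_blocks)
{
	/* size is max(current size, size after allocation), in blocks */
	return size_blocks < MB_STREAM_REQ_DEFAULT;
}

static unsigned long group_prealloc_blocks(unsigned long stripe)
{
	unsigned long prealloc = MB_GROUP_PREALLOC_DEFAULT;

	if (stripe)     /* round up to a multiple of the stripe */
		prealloc = ((prealloc + stripe - 1) / stripe) * stripe;
	return prealloc;
}

/* e.g. use_group_prealloc(8) is true; group_prealloc_blocks(48) is 528 */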
158
159 /*
160  * mballoc operates on the following data:
161  *  - on-disk bitmap
162  *  - in-core buddy (actually includes buddy and bitmap)
163  *  - preallocation descriptors (PAs)
164  *
165  * there are two types of preallocations:
166  *  - inode
167  *    assigned to a specific inode and can be used for this inode only.
168  *    it describes part of the inode's space preallocated to specific
169  *    physical blocks. any block from that preallocation can be used
170  *    independently. the descriptor just tracks the number of blocks left
171  *    unused. so, before taking a block from the descriptor, one must
172  *    make sure the corresponding logical block isn't allocated yet. this
173  *    also means that freeing any block within the descriptor's range
174  *    must discard all preallocated blocks.
175  *  - locality group
176  *    assigned to a specific locality group, which does not translate to
177  *    a permanent set of inodes: an inode can join and leave the group. space
178  *    from this type of preallocation can be used for any inode. thus
179  *    it's consumed from the beginning to the end.
180  *
181  * relation between them can be expressed as:
182  *    in-core buddy = on-disk bitmap + preallocation descriptors
183  *
184  * this means the blocks mballoc considers used are:
185  *  - allocated blocks (persistent)
186  *  - preallocated blocks (non-persistent)
187  *
188  * consistency in mballoc world means that at any time a block is either
189  * free or used in ALL structures. notice: "any time" should not be read
190  * literally -- time is discrete and delimited by locks.
191  *
192  *  to keep it simple, we don't use block numbers, instead we count number of
193  *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
194  *
195  * all operations can be expressed as:
196  *  - init buddy:                       buddy = on-disk + PAs
197  *  - new PA:                           buddy += N; PA = N
198  *  - use inode PA:                     on-disk += N; PA -= N
199  *  - discard inode PA                  buddy -= on-disk - PA; PA = 0
200  *  - use locality group PA             on-disk += N; PA -= N
201  *  - discard locality group PA         buddy -= PA; PA = 0
202  *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
203  *        is used in real operation because we can't know actual used
204  *        bits from PA, only from on-disk bitmap
205  *
206  * if we follow this strict logic, then all operations above should be atomic.
207  * given some of them can block, we'd have to use something like semaphores
208  * killing performance on high-end SMP hardware. let's try to relax it using
209  * the following knowledge:
210  *  1) if buddy is referenced, it's already initialized
211  *  2) while block is used in buddy and the buddy is referenced,
212  *     nobody can re-allocate that block
213  *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
214  *     bit set and PA claims same block, it's OK. IOW, one can set bit in
215  *     on-disk bitmap if buddy has same bit set or/and PA covers corresponded
216  *     block
217  *
218  * so, now we're building a concurrency table:
219  *  - init buddy vs.
220  *    - new PA
221  *      blocks for PA are allocated in the buddy, buddy must be referenced
222  *      until PA is linked to allocation group to avoid concurrent buddy init
223  *    - use inode PA
224  *      we need to make sure that either on-disk bitmap or PA has uptodate data
225  *      given (3) we care that PA-=N operation doesn't interfere with init
226  *    - discard inode PA
227  *      the simplest way would be to have buddy initialized by the discard
228  *    - use locality group PA
229  *      again PA-=N must be serialized with init
230  *    - discard locality group PA
231  *      the simplest way would be to have buddy initialized by the discard
232  *  - new PA vs.
233  *    - use inode PA
234  *      i_data_sem serializes them
235  *    - discard inode PA
236  *      discard process must wait until PA isn't used by another process
237  *    - use locality group PA
238  *      some mutex should serialize them
239  *    - discard locality group PA
240  *      discard process must wait until PA isn't used by another process
241  *  - use inode PA
242  *    - use inode PA
243  *      i_data_sem or another mutex should serializes them
244  *    - discard inode PA
245  *      discard process must wait until PA isn't used by another process
246  *    - use locality group PA
247  *      nothing wrong here -- they're different PAs covering different blocks
248  *    - discard locality group PA
249  *      discard process must wait until PA isn't used by another process
250  *
251  * now we're ready to draw a few conclusions:
252  *  - while a PA is referenced, no discard of it is possible
253  *  - a PA stays referenced until its blocks are marked in the on-disk bitmap
254  *  - a PA changes only after the on-disk bitmap does
255  *  - discard must not compete with init. either init is done before
256  *    any discard or they're serialized somehow
257  *  - buddy init as sum of on-disk bitmap and PAs is done atomically
258  *
259  * a special case is when we've used a PA down to emptiness. no need to modify
260  * the buddy in this case, but we should care about concurrent init
261  *
262  */
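/*
 * Illustration (editor's sketch, not part of mballoc.c): the consistency
 * rule above, "in-core buddy = on-disk bitmap + preallocation descriptors",
 * expressed as an assertion over three used-block counters.  This models
 * the invariant only, not the real bitmaps or locking; start from a
 * consistent state such as all zeroes.
 */
#include <assert.h>

struct mb_model { int buddy_used, ondisk_used, pa_reserved; };

static void check(const struct mb_model *m)
{
	assert(m->buddy_used == m->ondisk_used + m->pa_reserved);
}

static void new_pa(struct mb_model *m, int n)        /* buddy += N; PA = N    */
{ m->buddy_used += n; m->pa_reserved += n; check(m); }

static void use_inode_pa(struct mb_model *m, int n)  /* on-disk += N; PA -= N */
{ m->ondisk_used += n; m->pa_reserved -= n; check(m); }

static void discard_pa(struct mb_model *m, int left) /* free what is still reserved */
{ m->buddy_used -= left; m->pa_reserved -= left; check(m); }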
263
264 /*
265  * Logic in a few words:
266  *
267  *  - allocation:
268  *    load group
269  *    find blocks
270  *    mark bits in on-disk bitmap
271  *    release group
272  *
273  *  - use preallocation:
274  *    find proper PA (per-inode or group)
275  *    load group
276  *    mark bits in on-disk bitmap
277  *    release group
278  *    release PA
279  *
280  *  - free:
281  *    load group
282  *    mark bits in on-disk bitmap
283  *    release group
284  *
285  *  - discard preallocations in group:
286  *    mark PAs deleted
287  *    move them onto local list
288  *    load on-disk bitmap
289  *    load group
290  *    remove PA from object (inode or locality group)
291  *    mark free blocks in-core
292  *
293  *  - discard inode's preallocations:
294  */
295
296 /*
297  * Locking rules
298  *
299  * Locks:
300  *  - bitlock on a group        (group)
301  *  - object (inode/locality)   (object)
302  *  - per-pa lock               (pa)
303  *
304  * Paths:
305  *  - new pa
306  *    object
307  *    group
308  *
309  *  - find and use pa:
310  *    pa
311  *
312  *  - release consumed pa:
313  *    pa
314  *    group
315  *    object
316  *
317  *  - generate in-core bitmap:
318  *    group
319  *        pa
320  *
321  *  - discard all for given object (inode, locality group):
322  *    object
323  *        pa
324  *    group
325  *
326  *  - discard all for given group:
327  *    group
328  *        pa
329  *    group
330  *        object
331  *
332  */
333 static struct kmem_cache *ext4_pspace_cachep;
334 static struct kmem_cache *ext4_ac_cachep;
335 static struct kmem_cache *ext4_free_data_cachep;
336
337 /* We create slab caches for groupinfo data structures based on the
338  * superblock block size.  There will be one per mounted filesystem for
339  * each unique s_blocksize_bits */
340 #define NR_GRPINFO_CACHES 8
341 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
342
343 static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
344         "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
345         "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
346         "ext4_groupinfo_64k", "ext4_groupinfo_128k"
347 };
348
349 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
350                                         ext4_group_t group);
351 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
352                                                 ext4_group_t group);
353 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
354
355 /*
356  * The algorithm using this percpu seq counter goes below:
357  * 1. We sample the percpu discard_pa_seq counter before trying for block
358  *    allocation in ext4_mb_new_blocks().
359  * 2. We increment this percpu discard_pa_seq counter when we either allocate
360  *    or free these blocks i.e. while marking those blocks as used/free in
361  *    mb_mark_used()/mb_free_blocks().
362  * 3. We also increment this percpu seq counter when we successfully identify
363  *    that the bb_prealloc_list is not empty and hence proceed for discarding
364  *    of those PAs inside ext4_mb_discard_group_preallocations().
365  *
366  * Now, to make sure that the regular fast path of block allocation is not
367  * affected, as a small optimization we only sample the percpu seq counter
368  * on the current cpu. Only when block allocation fails and no freed blocks
369  * were found do we sample the percpu seq counter for all cpus using the
370  * function ext4_get_discard_pa_seq_sum() below. This happens after making
371  * sure that all the PAs on grp->bb_prealloc_list got freed or the list is empty.
372  */
373 static DEFINE_PER_CPU(u64, discard_pa_seq);
374 static inline u64 ext4_get_discard_pa_seq_sum(void)
375 {
376         int __cpu;
377         u64 __seq = 0;
378
379         for_each_possible_cpu(__cpu)
380                 __seq += per_cpu(discard_pa_seq, __cpu);
381         return __seq;
382 }
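/*
 * Illustration (editor's sketch, not part of mballoc.c): the retry decision
 * described above, in userspace form.  seq_this_cpu()/seq_sum_all() stand in
 * for reading this CPU's discard_pa_seq and ext4_get_discard_pa_seq_sum();
 * the loop structure is an assumption for illustration only, not the exact
 * kernel control flow.
 */
#include <stdbool.h>

typedef unsigned long long u64;

static u64 pcpu_seq[4];                          /* pretend there are 4 CPUs */
static u64 seq_this_cpu(void) { return pcpu_seq[0]; }
static u64 seq_sum_all(void)
{
	u64 sum = 0;
	for (int i = 0; i < 4; i++)
		sum += pcpu_seq[i];
	return sum;
}
static bool try_allocate(void) { return false; }  /* always fails, for demo */

static bool allocate_with_retry(void)
{
	u64 seq = seq_this_cpu();          /* 1: cheap single-CPU sample */

	while (!try_allocate()) {
		/* 2/3: any discard or free elsewhere bumps some CPU's counter */
		if (seq_sum_all() == seq)      /* nothing changed anywhere: give up */
			return false;
		seq = seq_sum_all();           /* something changed, try again */
	}
	return true;
}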
383
384 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
385 {
386 #if BITS_PER_LONG == 64
387         *bit += ((unsigned long) addr & 7UL) << 3;
388         addr = (void *) ((unsigned long) addr & ~7UL);
389 #elif BITS_PER_LONG == 32
390         *bit += ((unsigned long) addr & 3UL) << 3;
391         addr = (void *) ((unsigned long) addr & ~3UL);
392 #else
393 #error "how many bits you are?!"
394 #endif
395         return addr;
396 }
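/*
 * Illustration (editor's sketch, not part of mballoc.c): what
 * mb_correct_addr_and_bit() does on a 64-bit build.  An address 3 bytes
 * past an 8-byte boundary contributes 3 * 8 = 24 extra bits, and the
 * address itself is rounded down to the boundary.
 */
#include <assert.h>
#include <stdint.h>

static void demo_correct_addr(void)
{
	unsigned char buf[16] __attribute__((aligned(8)));
	void *addr = buf + 3;              /* misaligned by 3 bytes          */
	int bit = 5;                       /* bit 5 relative to that address */

	bit += ((uintptr_t)addr & 7) << 3;                 /* 5 + 24 = 29     */
	addr = (void *)((uintptr_t)addr & ~(uintptr_t)7);  /* back to &buf[0] */

	assert(bit == 29 && addr == (void *)buf);
}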
397
398 static inline int mb_test_bit(int bit, void *addr)
399 {
400         /*
401          * ext4_test_bit on architecture like powerpc
402          * needs unsigned long aligned address
403          */
404         addr = mb_correct_addr_and_bit(&bit, addr);
405         return ext4_test_bit(bit, addr);
406 }
407
408 static inline void mb_set_bit(int bit, void *addr)
409 {
410         addr = mb_correct_addr_and_bit(&bit, addr);
411         ext4_set_bit(bit, addr);
412 }
413
414 static inline void mb_clear_bit(int bit, void *addr)
415 {
416         addr = mb_correct_addr_and_bit(&bit, addr);
417         ext4_clear_bit(bit, addr);
418 }
419
420 static inline int mb_test_and_clear_bit(int bit, void *addr)
421 {
422         addr = mb_correct_addr_and_bit(&bit, addr);
423         return ext4_test_and_clear_bit(bit, addr);
424 }
425
426 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
427 {
428         int fix = 0, ret, tmpmax;
429         addr = mb_correct_addr_and_bit(&fix, addr);
430         tmpmax = max + fix;
431         start += fix;
432
433         ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
434         if (ret > max)
435                 return max;
436         return ret;
437 }
438
439 static inline int mb_find_next_bit(void *addr, int max, int start)
440 {
441         int fix = 0, ret, tmpmax;
442         addr = mb_correct_addr_and_bit(&fix, addr);
443         tmpmax = max + fix;
444         start += fix;
445
446         ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
447         if (ret > max)
448                 return max;
449         return ret;
450 }
451
452 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
453 {
454         char *bb;
455
456         BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
457         BUG_ON(max == NULL);
458
459         if (order > e4b->bd_blkbits + 1) {
460                 *max = 0;
461                 return NULL;
462         }
463
464         /* at order 0 we see each particular block */
465         if (order == 0) {
466                 *max = 1 << (e4b->bd_blkbits + 3);
467                 return e4b->bd_bitmap;
468         }
469
470         bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
471         *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
472
473         return bb;
474 }
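/*
 * Illustration (editor's sketch, not part of mballoc.c): how the per-order
 * buddy bitmaps are laid out.  Order 0 lives in the bitmap block itself
 * (one bit per cluster); order k >= 1 needs (8 * blocksize) >> k bits and
 * the orders are packed back to back inside the single buddy block, which
 * is what s_mb_offsets[] and s_mb_maxs[] describe.  This mirrors the layout
 * only, not the kernel's exact initialization code.
 */
#include <stdio.h>

static void print_buddy_layout(unsigned int blocksize)
{
	unsigned int blkbits = __builtin_ctz(blocksize);  /* e.g. 12 for 4096 */
	unsigned int bits = blocksize * 8;     /* order-0 bits, bitmap block  */
	unsigned int offset = 0;               /* byte offset in buddy block  */

	printf("order 0: %u bits (in the bitmap block)\n", bits);
	for (unsigned int order = 1; order <= blkbits + 1; order++) {
		printf("order %u: offset %u bytes, %u bits\n",
		       order, offset, bits >> order);
		offset += (bits >> order) / 8;
	}
}

/* print_buddy_layout(4096): order 1 at offset 0 (16384 bits),
 * order 2 at 2048, order 3 at 3072, ... order 13 at 4095 (4 bits). */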
475
476 #ifdef DOUBLE_CHECK
477 static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
478                            int first, int count)
479 {
480         int i;
481         struct super_block *sb = e4b->bd_sb;
482
483         if (unlikely(e4b->bd_info->bb_bitmap == NULL))
484                 return;
485         assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
486         for (i = 0; i < count; i++) {
487                 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
488                         ext4_fsblk_t blocknr;
489
490                         blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
491                         blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
492                         ext4_grp_locked_error(sb, e4b->bd_group,
493                                               inode ? inode->i_ino : 0,
494                                               blocknr,
495                                               "freeing block already freed "
496                                               "(bit %u)",
497                                               first + i);
498                         ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
499                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
500                 }
501                 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
502         }
503 }
504
505 static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
506 {
507         int i;
508
509         if (unlikely(e4b->bd_info->bb_bitmap == NULL))
510                 return;
511         assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
512         for (i = 0; i < count; i++) {
513                 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
514                 mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
515         }
516 }
517
518 static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
519 {
520         if (unlikely(e4b->bd_info->bb_bitmap == NULL))
521                 return;
522         if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
523                 unsigned char *b1, *b2;
524                 int i;
525                 b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
526                 b2 = (unsigned char *) bitmap;
527                 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
528                         if (b1[i] != b2[i]) {
529                                 ext4_msg(e4b->bd_sb, KERN_ERR,
530                                          "corruption in group %u "
531                                          "at byte %u(%u): %x in copy != %x "
532                                          "on disk/prealloc",
533                                          e4b->bd_group, i, i * 8, b1[i], b2[i]);
534                                 BUG();
535                         }
536                 }
537         }
538 }
539
540 static void mb_group_bb_bitmap_alloc(struct super_block *sb,
541                         struct ext4_group_info *grp, ext4_group_t group)
542 {
543         struct buffer_head *bh;
544
545         grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
546         if (!grp->bb_bitmap)
547                 return;
548
549         bh = ext4_read_block_bitmap(sb, group);
550         if (IS_ERR_OR_NULL(bh)) {
551                 kfree(grp->bb_bitmap);
552                 grp->bb_bitmap = NULL;
553                 return;
554         }
555
556         memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
557         put_bh(bh);
558 }
559
560 static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
561 {
562         kfree(grp->bb_bitmap);
563 }
564
565 #else
566 static inline void mb_free_blocks_double(struct inode *inode,
567                                 struct ext4_buddy *e4b, int first, int count)
568 {
569         return;
570 }
571 static inline void mb_mark_used_double(struct ext4_buddy *e4b,
572                                                 int first, int count)
573 {
574         return;
575 }
576 static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
577 {
578         return;
579 }
580
581 static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
582                         struct ext4_group_info *grp, ext4_group_t group)
583 {
584         return;
585 }
586
587 static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
588 {
589         return;
590 }
591 #endif
592
593 #ifdef AGGRESSIVE_CHECK
594
595 #define MB_CHECK_ASSERT(assert)                                         \
596 do {                                                                    \
597         if (!(assert)) {                                                \
598                 printk(KERN_EMERG                                       \
599                         "Assertion failure in %s() at %s:%d: \"%s\"\n", \
600                         function, file, line, # assert);                \
601                 BUG();                                                  \
602         }                                                               \
603 } while (0)
604
605 static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
606                                 const char *function, int line)
607 {
608         struct super_block *sb = e4b->bd_sb;
609         int order = e4b->bd_blkbits + 1;
610         int max;
611         int max2;
612         int i;
613         int j;
614         int k;
615         int count;
616         struct ext4_group_info *grp;
617         int fragments = 0;
618         int fstart;
619         struct list_head *cur;
620         void *buddy;
621         void *buddy2;
622
623         if (e4b->bd_info->bb_check_counter++ % 10)
624                 return 0;
625
626         while (order > 1) {
627                 buddy = mb_find_buddy(e4b, order, &max);
628                 MB_CHECK_ASSERT(buddy);
629                 buddy2 = mb_find_buddy(e4b, order - 1, &max2);
630                 MB_CHECK_ASSERT(buddy2);
631                 MB_CHECK_ASSERT(buddy != buddy2);
632                 MB_CHECK_ASSERT(max * 2 == max2);
633
634                 count = 0;
635                 for (i = 0; i < max; i++) {
636
637                         if (mb_test_bit(i, buddy)) {
638                                 /* only single bit in buddy2 may be 1 */
639                                 if (!mb_test_bit(i << 1, buddy2)) {
640                                         MB_CHECK_ASSERT(
641                                                 mb_test_bit((i<<1)+1, buddy2));
642                                 } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
643                                         MB_CHECK_ASSERT(
644                                                 mb_test_bit(i << 1, buddy2));
645                                 }
646                                 continue;
647                         }
648
649                         /* both bits in buddy2 must be 1 */
650                         MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
651                         MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
652
653                         for (j = 0; j < (1 << order); j++) {
654                                 k = (i * (1 << order)) + j;
655                                 MB_CHECK_ASSERT(
656                                         !mb_test_bit(k, e4b->bd_bitmap));
657                         }
658                         count++;
659                 }
660                 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
661                 order--;
662         }
663
664         fstart = -1;
665         buddy = mb_find_buddy(e4b, 0, &max);
666         for (i = 0; i < max; i++) {
667                 if (!mb_test_bit(i, buddy)) {
668                         MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
669                         if (fstart == -1) {
670                                 fragments++;
671                                 fstart = i;
672                         }
673                         continue;
674                 }
675                 fstart = -1;
676                 /* check used bits only */
677                 for (j = 0; j < e4b->bd_blkbits + 1; j++) {
678                         buddy2 = mb_find_buddy(e4b, j, &max2);
679                         k = i >> j;
680                         MB_CHECK_ASSERT(k < max2);
681                         MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
682                 }
683         }
684         MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
685         MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
686
687         grp = ext4_get_group_info(sb, e4b->bd_group);
688         if (!grp)
689                 return 0;
690         list_for_each(cur, &grp->bb_prealloc_list) {
691                 ext4_group_t groupnr;
692                 struct ext4_prealloc_space *pa;
693                 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
694                 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
695                 MB_CHECK_ASSERT(groupnr == e4b->bd_group);
696                 for (i = 0; i < pa->pa_len; i++)
697                         MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
698         }
699         return 0;
700 }
701 #undef MB_CHECK_ASSERT
702 #define mb_check_buddy(e4b) __mb_check_buddy(e4b,       \
703                                         __FILE__, __func__, __LINE__)
704 #else
705 #define mb_check_buddy(e4b)
706 #endif
707
708 /*
709  * Divide the blocks starting at @first with length @len into
710  * smaller chunks, each a power-of-2 number of blocks.
711  * Clear the bits in the buddy bitmap that the blocks of the chunk(s) cover,
712  * then increment bb_counters[] for the corresponding chunk size.
713  */
714 static void ext4_mb_mark_free_simple(struct super_block *sb,
715                                 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
716                                         struct ext4_group_info *grp)
717 {
718         struct ext4_sb_info *sbi = EXT4_SB(sb);
719         ext4_grpblk_t min;
720         ext4_grpblk_t max;
721         ext4_grpblk_t chunk;
722         unsigned int border;
723
724         BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
725
726         border = 2 << sb->s_blocksize_bits;
727
728         while (len > 0) {
729                 /* find how many blocks can be covered since this position */
730                 max = ffs(first | border) - 1;
731
732                 /* find how many blocks of power 2 we need to mark */
733                 min = fls(len) - 1;
734
735                 if (max < min)
736                         min = max;
737                 chunk = 1 << min;
738
739                 /* mark multiblock chunks only */
740                 grp->bb_counters[min]++;
741                 if (min > 0)
742                         mb_clear_bit(first >> min,
743                                      buddy + sbi->s_mb_offsets[min]);
744
745                 len -= chunk;
746                 first += chunk;
747         }
748 }
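/*
 * Illustration (editor's sketch, not part of mballoc.c): how the ffs()/fls()
 * arithmetic above cuts a free run into power-of-2 chunks.  A run of 11 free
 * clusters starting at cluster 5 becomes 1 @ 5, 2 @ 6 and 8 @ 8 -- each chunk
 * is aligned to, and no larger than, its own size.  The border term (which
 * caps the order) is omitted here for brevity.
 */
#include <stdio.h>
#include <strings.h>            /* ffs() */

static int fls_u(unsigned int x)   /* userspace stand-in for the kernel's fls() */
{
	return x ? 32 - __builtin_clz(x) : 0;
}

static void split_free_run(unsigned int first, unsigned int len)
{
	while (len > 0) {
		int max = ffs(first) - 1;   /* largest order 'first' is aligned to */
		int min = fls_u(len) - 1;   /* largest order that fits in 'len'    */

		if (max >= 0 && max < min)
			min = max;
		printf("chunk of %u at %u (order %d)\n", 1u << min, first, min);
		first += 1u << min;
		len   -= 1u << min;
	}
}

/* split_free_run(5, 11) prints: 1 at 5, 2 at 6, 8 at 8 */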
749
750 /*
751  * Cache the order of the largest free extent we have available in this block
752  * group.
753  */
754 static void
755 mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
756 {
757         int i;
758         int bits;
759
760         grp->bb_largest_free_order = -1; /* uninit */
761
762         bits = sb->s_blocksize_bits + 1;
763         for (i = bits; i >= 0; i--) {
764                 if (grp->bb_counters[i] > 0) {
765                         grp->bb_largest_free_order = i;
766                         break;
767                 }
768         }
769 }
770
771 static noinline_for_stack
772 void ext4_mb_generate_buddy(struct super_block *sb,
773                             void *buddy, void *bitmap, ext4_group_t group,
774                             struct ext4_group_info *grp)
775 {
776         struct ext4_sb_info *sbi = EXT4_SB(sb);
777         ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
778         ext4_grpblk_t i = 0;
779         ext4_grpblk_t first;
780         ext4_grpblk_t len;
781         unsigned free = 0;
782         unsigned fragments = 0;
783         unsigned long long period = get_cycles();
784
785         /* initialize buddy from the bitmap which is the aggregation
786          * of the on-disk bitmap and the preallocations */
787         i = mb_find_next_zero_bit(bitmap, max, 0);
788         grp->bb_first_free = i;
789         while (i < max) {
790                 fragments++;
791                 first = i;
792                 i = mb_find_next_bit(bitmap, max, i);
793                 len = i - first;
794                 free += len;
795                 if (len > 1)
796                         ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
797                 else
798                         grp->bb_counters[0]++;
799                 if (i < max)
800                         i = mb_find_next_zero_bit(bitmap, max, i);
801         }
802         grp->bb_fragments = fragments;
803
804         if (free != grp->bb_free) {
805                 ext4_grp_locked_error(sb, group, 0, 0,
806                                       "block bitmap and bg descriptor "
807                                       "inconsistent: %u vs %u free clusters",
808                                       free, grp->bb_free);
809                 /*
810                  * If we intend to continue, we consider group descriptor
811                  * corrupt and update bb_free using bitmap value
812                  */
813                 grp->bb_free = free;
814                 ext4_mark_group_bitmap_corrupted(sb, group,
815                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
816         }
817         mb_set_largest_free_order(sb, grp);
818
819         clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
820
821         period = get_cycles() - period;
822         atomic_inc(&sbi->s_mb_buddies_generated);
823         atomic64_add(period, &sbi->s_mb_generation_time);
824 }
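/*
 * Illustration (editor's sketch, not part of mballoc.c): the scan that
 * ext4_mb_generate_buddy() performs over the (bitmap + preallocations)
 * aggregate -- walk alternating free/used runs, counting free clusters and
 * fragments.  find_next() is a simple byte-wise stand-in for the kernel's
 * mb_find_next_bit()/mb_find_next_zero_bit() helpers.
 */
#include <stdbool.h>
#include <stdio.h>

static int find_next(const unsigned char *bm, int max, int start, bool set)
{
	for (int i = start; i < max; i++)
		if ((((bm[i >> 3] >> (i & 7)) & 1) != 0) == set)
			return i;
	return max;
}

static void scan_free_runs(const unsigned char *bm, int max)
{
	int free_clusters = 0, fragments = 0;
	int i = find_next(bm, max, 0, false);       /* first free cluster   */

	while (i < max) {
		int first = i;

		i = find_next(bm, max, i, true);        /* end of this free run */
		fragments++;
		free_clusters += i - first;
		if (i < max)
			i = find_next(bm, max, i, false);   /* next free run        */
	}
	printf("%d free clusters in %d fragments\n", free_clusters, fragments);
}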
825
826 static void mb_regenerate_buddy(struct ext4_buddy *e4b)
827 {
828         int count;
829         int order = 1;
830         void *buddy;
831
832         while ((buddy = mb_find_buddy(e4b, order++, &count)))
833                 ext4_set_bits(buddy, 0, count);
834
835         e4b->bd_info->bb_fragments = 0;
836         memset(e4b->bd_info->bb_counters, 0,
837                 sizeof(*e4b->bd_info->bb_counters) *
838                 (e4b->bd_sb->s_blocksize_bits + 2));
839
840         ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
841                 e4b->bd_bitmap, e4b->bd_group, e4b->bd_info);
842 }
843
844 /* The buddy information is attached to the buddy cache inode
845  * for convenience. The information regarding each group
846  * is loaded via ext4_mb_load_buddy. It consists of the
847  * block bitmap and the buddy information, and is
848  * stored in the inode as
849  *
850  * {                        page                        }
851  * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
852  *
853  *
854  * one block each for bitmap and buddy information.
855  * So for each group we take up 2 blocks. A page can
856  * contain blocks_per_page (PAGE_SIZE / blocksize)  blocks.
857  * So it can hold information for groups_per_page groups, which
858  * is blocks_per_page/2.
859  *
860  * Locking note:  This routine takes the block group lock of all groups
861  * for this page; do not hold this lock when calling this routine!
862  */
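/*
 * Illustration (editor's sketch, not part of mballoc.c): locating a group's
 * bitmap and buddy blocks in the buddy cache, following the two-blocks-per-
 * group layout above.  With 4K pages and a 1K block size there are 4 blocks
 * (two groups) per page: group 5's bitmap is block 10, i.e. page 2, offset 2.
 */
#include <stdio.h>

static void locate_group(unsigned int group, unsigned int blocksize,
			 unsigned int page_size)
{
	unsigned int blocks_per_page = page_size / blocksize;
	unsigned int bitmap_blk = group * 2;       /* bitmap first, then buddy */
	unsigned int buddy_blk  = bitmap_blk + 1;

	printf("group %u: bitmap page %u offset %u, buddy page %u offset %u\n",
	       group,
	       bitmap_blk / blocks_per_page, bitmap_blk % blocks_per_page,
	       buddy_blk / blocks_per_page, buddy_blk % blocks_per_page);
}

/* locate_group(5, 1024, 4096): bitmap on page 2 offset 2, buddy on page 2 offset 3 */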
863
864 static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
865 {
866         ext4_group_t ngroups;
867         int blocksize;
868         int blocks_per_page;
869         int groups_per_page;
870         int err = 0;
871         int i;
872         ext4_group_t first_group, group;
873         int first_block;
874         struct super_block *sb;
875         struct buffer_head *bhs;
876         struct buffer_head **bh = NULL;
877         struct inode *inode;
878         char *data;
879         char *bitmap;
880         struct ext4_group_info *grinfo;
881
882         inode = page->mapping->host;
883         sb = inode->i_sb;
884         ngroups = ext4_get_groups_count(sb);
885         blocksize = i_blocksize(inode);
886         blocks_per_page = PAGE_SIZE / blocksize;
887
888         mb_debug(sb, "init page %lu\n", page->index);
889
890         groups_per_page = blocks_per_page >> 1;
891         if (groups_per_page == 0)
892                 groups_per_page = 1;
893
894         /* allocate buffer_heads to read bitmaps */
895         if (groups_per_page > 1) {
896                 i = sizeof(struct buffer_head *) * groups_per_page;
897                 bh = kzalloc(i, gfp);
898                 if (bh == NULL) {
899                         err = -ENOMEM;
900                         goto out;
901                 }
902         } else
903                 bh = &bhs;
904
905         first_group = page->index * blocks_per_page / 2;
906
907         /* read all groups the page covers into the cache */
908         for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
909                 if (group >= ngroups)
910                         break;
911
912                 grinfo = ext4_get_group_info(sb, group);
913                 if (!grinfo)
914                         continue;
915                 /*
916                  * If page is uptodate then we came here after online resize
917                  * which added some new uninitialized group info structs, so
918                  * we must skip all initialized uptodate buddies on the page,
919                  * which may be currently in use by an allocating task.
920                  */
921                 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
922                         bh[i] = NULL;
923                         continue;
924                 }
925                 bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
926                 if (IS_ERR(bh[i])) {
927                         err = PTR_ERR(bh[i]);
928                         bh[i] = NULL;
929                         goto out;
930                 }
931                 mb_debug(sb, "read bitmap for group %u\n", group);
932         }
933
934         /* wait for I/O completion */
935         for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
936                 int err2;
937
938                 if (!bh[i])
939                         continue;
940                 err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
941                 if (!err)
942                         err = err2;
943         }
944
945         first_block = page->index * blocks_per_page;
946         for (i = 0; i < blocks_per_page; i++) {
947                 group = (first_block + i) >> 1;
948                 if (group >= ngroups)
949                         break;
950
951                 if (!bh[group - first_group])
952                         /* skip initialized uptodate buddy */
953                         continue;
954
955                 if (!buffer_verified(bh[group - first_group]))
956                         /* Skip faulty bitmaps */
957                         continue;
958                 err = 0;
959
960                 /*
961                  * data carries information regarding this
962                  * particular group in the format specified
963                  * above
964                  *
965                  */
966                 data = page_address(page) + (i * blocksize);
967                 bitmap = bh[group - first_group]->b_data;
968
969                 /*
970                  * We place the buddy block and bitmap block
971                  * close together
972                  */
973                 if ((first_block + i) & 1) {
974                         /* this is block of buddy */
975                         BUG_ON(incore == NULL);
976                         mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
977                                 group, page->index, i * blocksize);
978                         trace_ext4_mb_buddy_bitmap_load(sb, group);
979                         grinfo = ext4_get_group_info(sb, group);
980                         if (!grinfo) {
981                                 err = -EFSCORRUPTED;
982                                 goto out;
983                         }
984                         grinfo->bb_fragments = 0;
985                         memset(grinfo->bb_counters, 0,
986                                sizeof(*grinfo->bb_counters) *
987                                 (sb->s_blocksize_bits+2));
988                         /*
989                          * incore got set to the group block bitmap below
990                          */
991                         ext4_lock_group(sb, group);
992                         /* init the buddy */
993                         memset(data, 0xff, blocksize);
994                         ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
995                         ext4_unlock_group(sb, group);
996                         incore = NULL;
997                 } else {
998                         /* this is block of bitmap */
999                         BUG_ON(incore != NULL);
1000                         mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
1001                                 group, page->index, i * blocksize);
1002                         trace_ext4_mb_bitmap_load(sb, group);
1003
1004                         /* see comments in ext4_mb_put_pa() */
1005                         ext4_lock_group(sb, group);
1006                         memcpy(data, bitmap, blocksize);
1007
1008                         /* mark all preallocated blks used in in-core bitmap */
1009                         ext4_mb_generate_from_pa(sb, data, group);
1010                         ext4_mb_generate_from_freelist(sb, data, group);
1011                         ext4_unlock_group(sb, group);
1012
1013                         /* set incore so that the buddy information can be
1014                          * generated using this
1015                          */
1016                         incore = data;
1017                 }
1018         }
1019         SetPageUptodate(page);
1020
1021 out:
1022         if (bh) {
1023                 for (i = 0; i < groups_per_page; i++)
1024                         brelse(bh[i]);
1025                 if (bh != &bhs)
1026                         kfree(bh);
1027         }
1028         return err;
1029 }
1030
1031 /*
1032  * Lock the buddy and bitmap pages. This makes sure other parallel init_group
1033  * calls on the same buddy page don't happen while we hold the buddy page lock.
1034  * Return the locked buddy and bitmap pages in the e4b struct. If the buddy and
1035  * bitmap are on the same page, e4b->bd_buddy_page is NULL and the return value is 0.
1036  */
1037 static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
1038                 ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
1039 {
1040         struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
1041         int block, pnum, poff;
1042         int blocks_per_page;
1043         struct page *page;
1044
1045         e4b->bd_buddy_page = NULL;
1046         e4b->bd_bitmap_page = NULL;
1047
1048         blocks_per_page = PAGE_SIZE / sb->s_blocksize;
1049         /*
1050          * the buddy cache inode stores the block bitmap
1051          * and buddy information in consecutive blocks.
1052          * So for each group we need two blocks.
1053          */
1054         block = group * 2;
1055         pnum = block / blocks_per_page;
1056         poff = block % blocks_per_page;
1057         page = find_or_create_page(inode->i_mapping, pnum, gfp);
1058         if (!page)
1059                 return -ENOMEM;
1060         BUG_ON(page->mapping != inode->i_mapping);
1061         e4b->bd_bitmap_page = page;
1062         e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1063
1064         if (blocks_per_page >= 2) {
1065                 /* buddy and bitmap are on the same page */
1066                 return 0;
1067         }
1068
1069         block++;
1070         pnum = block / blocks_per_page;
1071         page = find_or_create_page(inode->i_mapping, pnum, gfp);
1072         if (!page)
1073                 return -ENOMEM;
1074         BUG_ON(page->mapping != inode->i_mapping);
1075         e4b->bd_buddy_page = page;
1076         return 0;
1077 }
1078
1079 static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1080 {
1081         if (e4b->bd_bitmap_page) {
1082                 unlock_page(e4b->bd_bitmap_page);
1083                 put_page(e4b->bd_bitmap_page);
1084         }
1085         if (e4b->bd_buddy_page) {
1086                 unlock_page(e4b->bd_buddy_page);
1087                 put_page(e4b->bd_buddy_page);
1088         }
1089 }
1090
1091 /*
1092  * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1093  * block group lock of all groups for this page; do not hold the BG lock when
1094  * calling this routine!
1095  */
1096 static noinline_for_stack
1097 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
1098 {
1099
1100         struct ext4_group_info *this_grp;
1101         struct ext4_buddy e4b;
1102         struct page *page;
1103         int ret = 0;
1104
1105         might_sleep();
1106         mb_debug(sb, "init group %u\n", group);
1107         this_grp = ext4_get_group_info(sb, group);
1108         if (!this_grp)
1109                 return -EFSCORRUPTED;
1110
1111         /*
1112          * This ensures that we don't reinit the buddy cache
1113          * page which maps to the group from which we are already
1114          * allocating. If we are looking at the buddy cache we would
1115          * have taken a reference using ext4_mb_load_buddy and that
1116          * would have pinned the buddy page to the page cache.
1117          * The call to ext4_mb_get_buddy_page_lock will mark the
1118          * page accessed.
1119          */
1120         ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
1121         if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1122                 /*
1123                  * somebody initialized the group
1124                  * return without doing anything
1125                  */
1126                 goto err;
1127         }
1128
1129         page = e4b.bd_bitmap_page;
1130         ret = ext4_mb_init_cache(page, NULL, gfp);
1131         if (ret)
1132                 goto err;
1133         if (!PageUptodate(page)) {
1134                 ret = -EIO;
1135                 goto err;
1136         }
1137
1138         if (e4b.bd_buddy_page == NULL) {
1139                 /*
1140                  * If both the bitmap and buddy are in
1141                  * the same page we don't need to force
1142                  * init the buddy
1143                  */
1144                 ret = 0;
1145                 goto err;
1146         }
1147         /* init buddy cache */
1148         page = e4b.bd_buddy_page;
1149         ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
1150         if (ret)
1151                 goto err;
1152         if (!PageUptodate(page)) {
1153                 ret = -EIO;
1154                 goto err;
1155         }
1156 err:
1157         ext4_mb_put_buddy_page_lock(&e4b);
1158         return ret;
1159 }
1160
1161 /*
1162  * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1163  * block group lock of all groups for this page; do not hold the BG lock when
1164  * calling this routine!
1165  */
1166 static noinline_for_stack int
1167 ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
1168                        struct ext4_buddy *e4b, gfp_t gfp)
1169 {
1170         int blocks_per_page;
1171         int block;
1172         int pnum;
1173         int poff;
1174         struct page *page;
1175         int ret;
1176         struct ext4_group_info *grp;
1177         struct ext4_sb_info *sbi = EXT4_SB(sb);
1178         struct inode *inode = sbi->s_buddy_cache;
1179
1180         might_sleep();
1181         mb_debug(sb, "load group %u\n", group);
1182
1183         blocks_per_page = PAGE_SIZE / sb->s_blocksize;
1184         grp = ext4_get_group_info(sb, group);
1185         if (!grp)
1186                 return -EFSCORRUPTED;
1187
1188         e4b->bd_blkbits = sb->s_blocksize_bits;
1189         e4b->bd_info = grp;
1190         e4b->bd_sb = sb;
1191         e4b->bd_group = group;
1192         e4b->bd_buddy_page = NULL;
1193         e4b->bd_bitmap_page = NULL;
1194
1195         if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1196                 /*
1197                  * we need full data about the group
1198                  * to make a good selection
1199                  */
1200                 ret = ext4_mb_init_group(sb, group, gfp);
1201                 if (ret)
1202                         return ret;
1203         }
1204
1205         /*
1206          * the buddy cache inode stores the block bitmap
1207          * and buddy information in consecutive blocks.
1208          * So for each group we need two blocks.
1209          */
1210         block = group * 2;
1211         pnum = block / blocks_per_page;
1212         poff = block % blocks_per_page;
1213
1214         /* we could use find_or_create_page(), but it locks the page,
1215          * which we'd like to avoid in the fast path ... */
1216         page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1217         if (page == NULL || !PageUptodate(page)) {
1218                 if (page)
1219                         /*
1220                          * drop the page reference and try
1221                          * to get the page with the lock. If it is
1222                          * not uptodate, that implies that
1223                          * somebody just created the page but
1224                          * has not yet initialized it. So
1225                          * wait for it to initialize.
1226                          */
1227                         put_page(page);
1228                 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1229                 if (page) {
1230                         BUG_ON(page->mapping != inode->i_mapping);
1231                         if (!PageUptodate(page)) {
1232                                 ret = ext4_mb_init_cache(page, NULL, gfp);
1233                                 if (ret) {
1234                                         unlock_page(page);
1235                                         goto err;
1236                                 }
1237                                 mb_cmp_bitmaps(e4b, page_address(page) +
1238                                                (poff * sb->s_blocksize));
1239                         }
1240                         unlock_page(page);
1241                 }
1242         }
1243         if (page == NULL) {
1244                 ret = -ENOMEM;
1245                 goto err;
1246         }
1247         if (!PageUptodate(page)) {
1248                 ret = -EIO;
1249                 goto err;
1250         }
1251
1252         /* Pages marked accessed already */
1253         e4b->bd_bitmap_page = page;
1254         e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1255
1256         block++;
1257         pnum = block / blocks_per_page;
1258         poff = block % blocks_per_page;
1259
1260         page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1261         if (page == NULL || !PageUptodate(page)) {
1262                 if (page)
1263                         put_page(page);
1264                 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1265                 if (page) {
1266                         BUG_ON(page->mapping != inode->i_mapping);
1267                         if (!PageUptodate(page)) {
1268                                 ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
1269                                                          gfp);
1270                                 if (ret) {
1271                                         unlock_page(page);
1272                                         goto err;
1273                                 }
1274                         }
1275                         unlock_page(page);
1276                 }
1277         }
1278         if (page == NULL) {
1279                 ret = -ENOMEM;
1280                 goto err;
1281         }
1282         if (!PageUptodate(page)) {
1283                 ret = -EIO;
1284                 goto err;
1285         }
1286
1287         /* Pages marked accessed already */
1288         e4b->bd_buddy_page = page;
1289         e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1290
1291         return 0;
1292
1293 err:
1294         if (page)
1295                 put_page(page);
1296         if (e4b->bd_bitmap_page)
1297                 put_page(e4b->bd_bitmap_page);
1298         if (e4b->bd_buddy_page)
1299                 put_page(e4b->bd_buddy_page);
1300         e4b->bd_buddy = NULL;
1301         e4b->bd_bitmap = NULL;
1302         return ret;
1303 }
1304
1305 static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1306                               struct ext4_buddy *e4b)
1307 {
1308         return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
1309 }
1310
1311 static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1312 {
1313         if (e4b->bd_bitmap_page)
1314                 put_page(e4b->bd_bitmap_page);
1315         if (e4b->bd_buddy_page)
1316                 put_page(e4b->bd_buddy_page);
1317 }
1318
1319
1320 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1321 {
1322         int order = 1;
1323         int bb_incr = 1 << (e4b->bd_blkbits - 1);
1324         void *bb;
1325
1326         BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1327         BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1328
1329         bb = e4b->bd_buddy;
1330         while (order <= e4b->bd_blkbits + 1) {
1331                 block = block >> 1;
1332                 if (!mb_test_bit(block, bb)) {
1333                         /* this block is part of buddy of order 'order' */
1334                         return order;
1335                 }
1336                 bb += bb_incr;
1337                 bb_incr >>= 1;
1338                 order++;
1339         }
1340         return 0;
1341 }
1342
1343 static void mb_clear_bits(void *bm, int cur, int len)
1344 {
1345         __u32 *addr;
1346
1347         len = cur + len;
1348         while (cur < len) {
1349                 if ((cur & 31) == 0 && (len - cur) >= 32) {
1350                         /* fast path: clear whole word at once */
1351                         addr = bm + (cur >> 3);
1352                         *addr = 0;
1353                         cur += 32;
1354                         continue;
1355                 }
1356                 mb_clear_bit(cur, bm);
1357                 cur++;
1358         }
1359 }
1360
1361 /* clear bits in the given range;
1362  * returns the first bit found that was already zero, or -1 if none
1363  */
1364 static int mb_test_and_clear_bits(void *bm, int cur, int len)
1365 {
1366         __u32 *addr;
1367         int zero_bit = -1;
1368
1369         len = cur + len;
1370         while (cur < len) {
1371                 if ((cur & 31) == 0 && (len - cur) >= 32) {
1372                         /* fast path: clear whole word at once */
1373                         addr = bm + (cur >> 3);
1374                         if (*addr != (__u32)(-1) && zero_bit == -1)
1375                                 zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1376                         *addr = 0;
1377                         cur += 32;
1378                         continue;
1379                 }
1380                 if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1381                         zero_bit = cur;
1382                 cur++;
1383         }
1384
1385         return zero_bit;
1386 }
1387
1388 void ext4_set_bits(void *bm, int cur, int len)
1389 {
1390         __u32 *addr;
1391
1392         len = cur + len;
1393         while (cur < len) {
1394                 if ((cur & 31) == 0 && (len - cur) >= 32) {
1395                         /* fast path: set whole word at once */
1396                         addr = bm + (cur >> 3);
1397                         *addr = 0xffffffff;
1398                         cur += 32;
1399                         continue;
1400                 }
1401                 mb_set_bit(cur, bm);
1402                 cur++;
1403         }
1404 }
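/*
 * Illustration (editor's sketch, not part of mballoc.c): the word-at-a-time
 * fast path used by mb_clear_bits()/ext4_set_bits() above.  Whole 32-bit
 * words are written directly and only the unaligned head and tail fall back
 * to per-bit operations.  The bitmap is assumed to be 32-bit aligned, as the
 * buddy-cache pages are.
 */
#include <stdint.h>

static void set_bits_range(uint8_t *bm, int cur, int len)
{
	len = cur + len;
	while (cur < len) {
		if ((cur & 31) == 0 && (len - cur) >= 32) {
			/* fast path: set a whole 32-bit word at once */
			*(uint32_t *)(bm + (cur >> 3)) = 0xffffffff;
			cur += 32;
			continue;
		}
		bm[cur >> 3] |= 1u << (cur & 7);        /* slow path: one bit */
		cur++;
	}
}

/* e.g. set_bits_range(bitmap, 30, 40): bits 30..31 and 64..69 are set one at
 * a time, bits 32..63 with a single 32-bit store. */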
1405
1406 static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1407 {
1408         if (mb_test_bit(*bit + side, bitmap)) {
1409                 mb_clear_bit(*bit, bitmap);
1410                 (*bit) -= side;
1411                 return 1;
1412         }
1413         else {
1414                 (*bit) += side;
1415                 mb_set_bit(*bit, bitmap);
1416                 return -1;
1417         }
1418 }
1419
1420 static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1421 {
1422         int max;
1423         int order = 1;
1424         void *buddy = mb_find_buddy(e4b, order, &max);
1425
1426         while (buddy) {
1427                 void *buddy2;
1428
1429                 /* Bits in range [first; last] are known to be set since
1430                  * corresponding blocks were allocated. Bits in range
1431                  * (first; last) will stay set because they form buddies on
1432                  * upper layer. We just deal with borders if they don't
1433                  * align with upper layer and then go up.
1434                  * Releasing entire group is all about clearing
1435                  * single bit of highest order buddy.
1436                  */
1437
1438                 /* Example:
1439                  * ---------------------------------
1440                  * |   1   |   1   |   1   |   1   |
1441                  * ---------------------------------
1442                  * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1443                  * ---------------------------------
1444                  *   0   1   2   3   4   5   6   7
1445                  *      \_____________________/
1446                  *
1447                  * Neither [1] nor [6] is aligned to above layer.
1448                  * Left neighbour [0] is free, so mark it busy,
1449                  * decrease bb_counters and extend range to
1450                  * [0; 6]
1451                  * Right neighbour [7] is busy. It can't be coalesced with [6], so
1452                  * mark [6] free, increase bb_counters and shrink range to
1453                  * [0; 5].
1454                  * Then shift range to [0; 2], go up and do the same.
1455                  */
1456
1457
1458                 if (first & 1)
1459                         e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1460                 if (!(last & 1))
1461                         e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1462                 if (first > last)
1463                         break;
1464                 order++;
1465
1466                 if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
1467                         mb_clear_bits(buddy, first, last - first + 1);
1468                         e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1469                         break;
1470                 }
1471                 first >>= 1;
1472                 last >>= 1;
1473                 buddy = buddy2;
1474         }
1475 }
1476
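/*
 * Free [first, first + count) within the group: clear the bits in the
 * block bitmap, update bb_free/bb_first_free/bb_fragments, handle the
 * order-0 borders here and let mb_buddy_mark_free() fold the rest of the
 * range into the higher-order buddy bitmaps.  Must be called with the
 * group lock held.
 */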
1477 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1478                            int first, int count)
1479 {
1480         int left_is_free = 0;
1481         int right_is_free = 0;
1482         int block;
1483         int last = first + count - 1;
1484         struct super_block *sb = e4b->bd_sb;
1485
1486         if (WARN_ON(count == 0))
1487                 return;
1488         BUG_ON(last >= (sb->s_blocksize << 3));
1489         assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1490         /* Don't bother if the block group is corrupt. */
1491         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1492                 return;
1493
1494         mb_check_buddy(e4b);
1495         mb_free_blocks_double(inode, e4b, first, count);
1496
1497         /* access memory sequentially: check left neighbour,
1498          * clear range and then check right neighbour
1499          */
1500         if (first != 0)
1501                 left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1502         block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1503         if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1504                 right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1505
1506         if (unlikely(block != -1)) {
1507                 struct ext4_sb_info *sbi = EXT4_SB(sb);
1508                 ext4_fsblk_t blocknr;
1509
1510                 /*
1511                  * Fastcommit replay can free already freed blocks which
1512                  * corrupts allocation info. Regenerate it.
1513                  */
1514                 if (sbi->s_mount_state & EXT4_FC_REPLAY) {
1515                         mb_regenerate_buddy(e4b);
1516                         goto check;
1517                 }
1518
1519                 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1520                 blocknr += EXT4_C2B(sbi, block);
1521                 ext4_grp_locked_error(sb, e4b->bd_group,
1522                                       inode ? inode->i_ino : 0, blocknr,
1523                                       "freeing already freed block (bit %u); block bitmap corrupt.",
1524                                       block);
1525                 ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
1526                                 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
1527                 return;
1528         }
1529
1530         this_cpu_inc(discard_pa_seq);
1531         e4b->bd_info->bb_free += count;
1532         if (first < e4b->bd_info->bb_first_free)
1533                 e4b->bd_info->bb_first_free = first;
1534
1535         /* let's maintain fragments counter */
1536         if (left_is_free && right_is_free)
1537                 e4b->bd_info->bb_fragments--;
1538         else if (!left_is_free && !right_is_free)
1539                 e4b->bd_info->bb_fragments++;
1540
1541         /* buddy[0] == bd_bitmap is a special case, so handle
1542          * it right away and let mb_buddy_mark_free stay free of
1543          * zero order checks.
1544          * Check if neighbours are to be coalesced,
1545          * adjust bitmap bb_counters and borders appropriately.
1546          */
1547         if (first & 1) {
1548                 first += !left_is_free;
1549                 e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1550         }
1551         if (!(last & 1)) {
1552                 last -= !right_is_free;
1553                 e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1554         }
1555
1556         if (first <= last)
1557                 mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1558
1559         mb_set_largest_free_order(sb, e4b->bd_info);
1560 check:
1561         mb_check_buddy(e4b);
1562 }
1563
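/*
 * Build the free extent containing @block: the buddy bitmaps give the
 * initial free chunk in one lookup, and following free buddies are then
 * appended until @needed clusters have been collected or a used block is
 * hit.  Returns the length found (0 if @block itself is in use).
 */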
1564 static int mb_find_extent(struct ext4_buddy *e4b, int block,
1565                                 int needed, struct ext4_free_extent *ex)
1566 {
1567         int next = block;
1568         int max, order;
1569         void *buddy;
1570
1571         assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1572         BUG_ON(ex == NULL);
1573
1574         buddy = mb_find_buddy(e4b, 0, &max);
1575         BUG_ON(buddy == NULL);
1576         BUG_ON(block >= max);
1577         if (mb_test_bit(block, buddy)) {
1578                 ex->fe_len = 0;
1579                 ex->fe_start = 0;
1580                 ex->fe_group = 0;
1581                 return 0;
1582         }
1583
1584         /* find actual order */
1585         order = mb_find_order_for_block(e4b, block);
1586         block = block >> order;
1587
1588         ex->fe_len = 1 << order;
1589         ex->fe_start = block << order;
1590         ex->fe_group = e4b->bd_group;
1591
1592         /* calc difference from given start */
1593         next = next - ex->fe_start;
1594         ex->fe_len -= next;
1595         ex->fe_start += next;
1596
1597         while (needed > ex->fe_len &&
1598                mb_find_buddy(e4b, order, &max)) {
1599
1600                 if (block + 1 >= max)
1601                         break;
1602
1603                 next = (block + 1) * (1 << order);
1604                 if (mb_test_bit(next, e4b->bd_bitmap))
1605                         break;
1606
1607                 order = mb_find_order_for_block(e4b, next);
1608
1609                 block = next >> order;
1610                 ex->fe_len += 1 << order;
1611         }
1612
1613         if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
1614                 /* Should never happen! (but apparently sometimes does?!?) */
1615                 WARN_ON(1);
1616                 ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
1617                         "corruption or bug in mb_find_extent "
1618                         "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
1619                         block, order, needed, ex->fe_group, ex->fe_start,
1620                         ex->fe_len, ex->fe_logical);
1621                 ex->fe_len = 0;
1622                 ex->fe_start = 0;
1623                 ex->fe_group = 0;
1624         }
1625         return ex->fe_len;
1626 }
1627
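/*
 * Mark extent @ex in use: update the free and fragment counters, then walk
 * the extent marking whole buddies used where alignment and length allow,
 * splitting larger buddies (set the parent bit, clear the two halves)
 * where they do not, and finally set the bits in the block bitmap.  The
 * return value packs the remaining length and the order of the first
 * split, which the caller records as ac_tail/ac_buddy.
 */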
1628 static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1629 {
1630         int ord;
1631         int mlen = 0;
1632         int max = 0;
1633         int cur;
1634         int start = ex->fe_start;
1635         int len = ex->fe_len;
1636         unsigned ret = 0;
1637         int len0 = len;
1638         void *buddy;
1639
1640         BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1641         BUG_ON(e4b->bd_group != ex->fe_group);
1642         assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1643         mb_check_buddy(e4b);
1644         mb_mark_used_double(e4b, start, len);
1645
1646         this_cpu_inc(discard_pa_seq);
1647         e4b->bd_info->bb_free -= len;
1648         if (e4b->bd_info->bb_first_free == start)
1649                 e4b->bd_info->bb_first_free += len;
1650
1651         /* let's maintain fragments counter */
1652         if (start != 0)
1653                 mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1654         if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1655                 max = !mb_test_bit(start + len, e4b->bd_bitmap);
1656         if (mlen && max)
1657                 e4b->bd_info->bb_fragments++;
1658         else if (!mlen && !max)
1659                 e4b->bd_info->bb_fragments--;
1660
1661         /* let's maintain buddy itself */
1662         while (len) {
1663                 ord = mb_find_order_for_block(e4b, start);
1664
1665                 if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1666                         /* the whole chunk may be allocated at once! */
1667                         mlen = 1 << ord;
1668                         buddy = mb_find_buddy(e4b, ord, &max);
1669                         BUG_ON((start >> ord) >= max);
1670                         mb_set_bit(start >> ord, buddy);
1671                         e4b->bd_info->bb_counters[ord]--;
1672                         start += mlen;
1673                         len -= mlen;
1674                         BUG_ON(len < 0);
1675                         continue;
1676                 }
1677
1678                 /* store for history */
1679                 if (ret == 0)
1680                         ret = len | (ord << 16);
1681
1682                 /* we have to split large buddy */
1683                 BUG_ON(ord <= 0);
1684                 buddy = mb_find_buddy(e4b, ord, &max);
1685                 mb_set_bit(start >> ord, buddy);
1686                 e4b->bd_info->bb_counters[ord]--;
1687
1688                 ord--;
1689                 cur = (start >> ord) & ~1U;
1690                 buddy = mb_find_buddy(e4b, ord, &max);
1691                 mb_clear_bit(cur, buddy);
1692                 mb_clear_bit(cur + 1, buddy);
1693                 e4b->bd_info->bb_counters[ord]++;
1694                 e4b->bd_info->bb_counters[ord]++;
1695         }
1696         mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1697
1698         ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
1699         mb_check_buddy(e4b);
1700
1701         return ret;
1702 }
1703
1704 /*
1705  * Must be called under group lock!
1706  */
1707 static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1708                                         struct ext4_buddy *e4b)
1709 {
1710         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1711         int ret;
1712
1713         BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1714         BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1715
1716         ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
1717         ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
1718         ret = mb_mark_used(e4b, &ac->ac_b_ex);
1719
1720         /* preallocation can change ac_b_ex, thus we store actually
1721          * allocated blocks for history */
1722         ac->ac_f_ex = ac->ac_b_ex;
1723
1724         ac->ac_status = AC_STATUS_FOUND;
1725         ac->ac_tail = ret & 0xffff;
1726         ac->ac_buddy = ret >> 16;
1727
1728         /*
1729          * take the page reference. We want the page to be pinned
1730                  * so that we don't get an ext4_mb_init_cache() call for this
1731          * group until we update the bitmap. That would mean we
1732          * double allocate blocks. The reference is dropped
1733          * in ext4_mb_release_context
1734          */
1735         ac->ac_bitmap_page = e4b->bd_bitmap_page;
1736         get_page(ac->ac_bitmap_page);
1737         ac->ac_buddy_page = e4b->bd_buddy_page;
1738         get_page(ac->ac_buddy_page);
1739         /* store last allocated for subsequent stream allocation */
1740         if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1741                 spin_lock(&sbi->s_md_lock);
1742                 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1743                 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
1744                 spin_unlock(&sbi->s_md_lock);
1745         }
1746         /*
1747          * As we've just preallocated more space than
1748          * user requested originally, we store allocated
1749          * space in a special descriptor.
1750          */
1751         if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
1752                 ext4_mb_new_preallocation(ac);
1753
1754 }
1755
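/*
 * Decide whether scanning should stop: break out once s_mb_max_to_scan
 * extents have been examined (unless EXT4_MB_HINT_FIRST is set), and once
 * the best found extent reaches the goal length, re-verify it and use it
 * if the group scan is finishing or s_mb_min_to_scan has been reached.
 */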
1756 static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1757                                         struct ext4_buddy *e4b,
1758                                         int finish_group)
1759 {
1760         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1761         struct ext4_free_extent *bex = &ac->ac_b_ex;
1762         struct ext4_free_extent *gex = &ac->ac_g_ex;
1763         struct ext4_free_extent ex;
1764         int max;
1765
1766         if (ac->ac_status == AC_STATUS_FOUND)
1767                 return;
1768         /*
1769          * We don't want to scan for a whole year
1770          */
1771         if (ac->ac_found > sbi->s_mb_max_to_scan &&
1772                         !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1773                 ac->ac_status = AC_STATUS_BREAK;
1774                 return;
1775         }
1776
1777         /*
1778          * Haven't found good chunk so far, let's continue
1779          */
1780         if (bex->fe_len < gex->fe_len)
1781                 return;
1782
1783         if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
1784                         && bex->fe_group == e4b->bd_group) {
1785                 /* recheck chunk's availability - we don't know
1786                  * when it was found (within this lock-unlock
1787                  * period or not) */
1788                 max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
1789                 if (max >= gex->fe_len) {
1790                         ext4_mb_use_best_found(ac, e4b);
1791                         return;
1792                 }
1793         }
1794 }
1795
1796 /*
1797  * The routine checks whether the found extent is good enough. If it is,
1798  * the extent gets marked used and a flag is set in the context to stop
1799  * scanning. Otherwise, the extent is compared with the previously found
1800  * extent and, if the new one is better, it is stored in the context.
1801  * Later, the best found extent will be used if mballoc can't find a
1802  * good enough extent.
1803  *
1804  * FIXME: real allocation policy is to be designed yet!
1805  */
1806 static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1807                                         struct ext4_free_extent *ex,
1808                                         struct ext4_buddy *e4b)
1809 {
1810         struct ext4_free_extent *bex = &ac->ac_b_ex;
1811         struct ext4_free_extent *gex = &ac->ac_g_ex;
1812
1813         BUG_ON(ex->fe_len <= 0);
1814         BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1815         BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1816         BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1817
1818         ac->ac_found++;
1819
1820         /*
1821          * The special case - take what you catch first
1822          */
1823         if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1824                 *bex = *ex;
1825                 ext4_mb_use_best_found(ac, e4b);
1826                 return;
1827         }
1828
1829         /*
1830          * Let's check whether the chunk is good enough
1831          */
1832         if (ex->fe_len == gex->fe_len) {
1833                 *bex = *ex;
1834                 ext4_mb_use_best_found(ac, e4b);
1835                 return;
1836         }
1837
1838         /*
1839          * If this is first found extent, just store it in the context
1840          */
1841         if (bex->fe_len == 0) {
1842                 *bex = *ex;
1843                 return;
1844         }
1845
1846         /*
1847          * If new found extent is better, store it in the context
1848          */
1849         if (bex->fe_len < gex->fe_len) {
1850                 /* if the request isn't satisfied, any found extent
1851                  * larger than previous best one is better */
1852                 if (ex->fe_len > bex->fe_len)
1853                         *bex = *ex;
1854         } else if (ex->fe_len > gex->fe_len) {
1855                 /* if the request is satisfied, then we try to find
1856                  * an extent that still satisfies the request, but is
1857                  * smaller than the previous one */
1858                 if (ex->fe_len < bex->fe_len)
1859                         *bex = *ex;
1860         }
1861
1862         ext4_mb_check_limits(ac, e4b, 0);
1863 }
1864
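/*
 * Reload the group holding the best extent seen so far and, if (part of)
 * that extent is still free, take it.  Used when the search has gone on
 * for too long and we settle for the best candidate found.
 */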
1865 static noinline_for_stack
1866 int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1867                                         struct ext4_buddy *e4b)
1868 {
1869         struct ext4_free_extent ex = ac->ac_b_ex;
1870         ext4_group_t group = ex.fe_group;
1871         int max;
1872         int err;
1873
1874         BUG_ON(ex.fe_len <= 0);
1875         err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1876         if (err)
1877                 return err;
1878
1879         ext4_lock_group(ac->ac_sb, group);
1880         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1881                 goto out;
1882
1883         max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
1884
1885         if (max > 0) {
1886                 ac->ac_b_ex = ex;
1887                 ext4_mb_use_best_found(ac, e4b);
1888         }
1889
1890 out:
1891         ext4_unlock_group(ac->ac_sb, group);
1892         ext4_mb_unload_buddy(e4b);
1893
1894         return 0;
1895 }
1896
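/*
 * Try to allocate at the goal block first.  A free extent covering the
 * goal is taken immediately (for striped filesystems only if it is also
 * stripe aligned); with EXT4_MB_HINT_MERGE even a shorter free extent at
 * the goal is accepted so it can be merged with an existing extent.
 */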
1897 static noinline_for_stack
1898 int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1899                                 struct ext4_buddy *e4b)
1900 {
1901         ext4_group_t group = ac->ac_g_ex.fe_group;
1902         int max;
1903         int err;
1904         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1905         struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1906         struct ext4_free_extent ex;
1907
1908         if (!grp)
1909                 return -EFSCORRUPTED;
1910         if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
1911                 return 0;
1912         if (grp->bb_free == 0)
1913                 return 0;
1914
1915         err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1916         if (err)
1917                 return err;
1918
1919         ext4_lock_group(ac->ac_sb, group);
1920         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1921                 goto out;
1922
1923         max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1924                              ac->ac_g_ex.fe_len, &ex);
1925         ex.fe_logical = 0xDEADFA11; /* debug value */
1926
1927         if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1928                 ext4_fsblk_t start;
1929
1930                 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1931                         ex.fe_start;
1932                 /* use do_div to get remainder (would be 64-bit modulo) */
1933                 if (do_div(start, sbi->s_stripe) == 0) {
1934                         ac->ac_found++;
1935                         ac->ac_b_ex = ex;
1936                         ext4_mb_use_best_found(ac, e4b);
1937                 }
1938         } else if (max >= ac->ac_g_ex.fe_len) {
1939                 BUG_ON(ex.fe_len <= 0);
1940                 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1941                 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1942                 ac->ac_found++;
1943                 ac->ac_b_ex = ex;
1944                 ext4_mb_use_best_found(ac, e4b);
1945         } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
1946                 /* Sometimes, the caller may want to merge even a small
1947                  * number of blocks into an existing extent */
1948                 BUG_ON(ex.fe_len <= 0);
1949                 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1950                 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1951                 ac->ac_found++;
1952                 ac->ac_b_ex = ex;
1953                 ext4_mb_use_best_found(ac, e4b);
1954         }
1955 out:
1956         ext4_unlock_group(ac->ac_sb, group);
1957         ext4_mb_unload_buddy(e4b);
1958
1959         return 0;
1960 }
1961
1962 /*
1963  * The routine scans buddy structures (not bitmap!) from given order
1964  * to max order and tries to find big enough chunk to satisfy the req
1965  */
1966 static noinline_for_stack
1967 void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1968                                         struct ext4_buddy *e4b)
1969 {
1970         struct super_block *sb = ac->ac_sb;
1971         struct ext4_group_info *grp = e4b->bd_info;
1972         void *buddy;
1973         int i;
1974         int k;
1975         int max;
1976
1977         BUG_ON(ac->ac_2order <= 0);
1978         for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1979                 if (grp->bb_counters[i] == 0)
1980                         continue;
1981
1982                 buddy = mb_find_buddy(e4b, i, &max);
1983                 BUG_ON(buddy == NULL);
1984
1985                 k = mb_find_next_zero_bit(buddy, max, 0);
1986                 if (k >= max) {
1987                         ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
1988                                 "%d free clusters of order %d. But found 0",
1989                                 grp->bb_counters[i], i);
1990                         ext4_mark_group_bitmap_corrupted(ac->ac_sb,
1991                                          e4b->bd_group,
1992                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
1993                         break;
1994                 }
1995                 ac->ac_found++;
1996
1997                 ac->ac_b_ex.fe_len = 1 << i;
1998                 ac->ac_b_ex.fe_start = k << i;
1999                 ac->ac_b_ex.fe_group = e4b->bd_group;
2000
2001                 ext4_mb_use_best_found(ac, e4b);
2002
2003                 BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
2004
2005                 if (EXT4_SB(sb)->s_mb_stats)
2006                         atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
2007
2008                 break;
2009         }
2010 }
2011
2012 /*
2013  * The routine scans the group and measures all found extents.
2014  * Scanning is bounded by the group's free-cluster count (bb_free),
2015  * so the routine knows the upper limit of what it can find.
2016  */
2017 static noinline_for_stack
2018 void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
2019                                         struct ext4_buddy *e4b)
2020 {
2021         struct super_block *sb = ac->ac_sb;
2022         void *bitmap = e4b->bd_bitmap;
2023         struct ext4_free_extent ex;
2024         int i;
2025         int free;
2026
2027         free = e4b->bd_info->bb_free;
2028         if (WARN_ON(free <= 0))
2029                 return;
2030
2031         i = e4b->bd_info->bb_first_free;
2032
2033         while (free && ac->ac_status == AC_STATUS_CONTINUE) {
2034                 i = mb_find_next_zero_bit(bitmap,
2035                                                 EXT4_CLUSTERS_PER_GROUP(sb), i);
2036                 if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
2037                         /*
2038                          * If we have a corrupt bitmap, we won't find any
2039                          * free blocks even though the group info says we
2040                          * have free blocks
2041                          */
2042                         ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
2043                                         "%d free clusters as per "
2044                                         "group info. But bitmap says 0",
2045                                         free);
2046                         ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
2047                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2048                         break;
2049                 }
2050
2051                 mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
2052                 if (WARN_ON(ex.fe_len <= 0))
2053                         break;
2054                 if (free < ex.fe_len) {
2055                         ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
2056                                         "%d free clusters as per "
2057                                         "group info. But got %d blocks",
2058                                         free, ex.fe_len);
2059                         ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
2060                                         EXT4_GROUP_INFO_BBITMAP_CORRUPT);
2061                         /*
2062                          * The number of free blocks differs. This mostly
2063                          * indicates that the bitmap is corrupt. So exit
2064                          * without claiming the space.
2065                          */
2066                         break;
2067                 }
2068                 ex.fe_logical = 0xDEADC0DE; /* debug value */
2069                 ext4_mb_measure_extent(ac, &ex, e4b);
2070
2071                 i += ex.fe_len;
2072                 free -= ex.fe_len;
2073         }
2074
2075         ext4_mb_check_limits(ac, e4b, 1);
2076 }
2077
2078 /*
2079  * This is a special case for storage like RAID5:
2080  * we try to find stripe-aligned chunks for stripe-size-multiple requests
2081  */
2082 static noinline_for_stack
2083 void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
2084                                  struct ext4_buddy *e4b)
2085 {
2086         struct super_block *sb = ac->ac_sb;
2087         struct ext4_sb_info *sbi = EXT4_SB(sb);
2088         void *bitmap = e4b->bd_bitmap;
2089         struct ext4_free_extent ex;
2090         ext4_fsblk_t first_group_block;
2091         ext4_fsblk_t a;
2092         ext4_grpblk_t i;
2093         int max;
2094
2095         BUG_ON(sbi->s_stripe == 0);
2096
2097         /* find first stripe-aligned block in group */
2098         first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
2099
2100         a = first_group_block + sbi->s_stripe - 1;
2101         do_div(a, sbi->s_stripe);
2102         i = (a * sbi->s_stripe) - first_group_block;
2103
2104         while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
2105                 if (!mb_test_bit(i, bitmap)) {
2106                         max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
2107                         if (max >= sbi->s_stripe) {
2108                                 ac->ac_found++;
2109                                 ex.fe_logical = 0xDEADF00D; /* debug value */
2110                                 ac->ac_b_ex = ex;
2111                                 ext4_mb_use_best_found(ac, e4b);
2112                                 break;
2113                         }
2114                 }
2115                 i += sbi->s_stripe;
2116         }
2117 }
2118
2119 /*
2120  * This is also called BEFORE we load the buddy bitmap.
2121  * Returns true if the group is suitable for the allocation,
2122  * false otherwise.
2123  */
2124 static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
2125                                 ext4_group_t group, int cr)
2126 {
2127         ext4_grpblk_t free, fragments;
2128         int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
2129         struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2130
2131         BUG_ON(cr < 0 || cr >= 4);
2132
2133         if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2134                 return false;
2135
2136         free = grp->bb_free;
2137         if (free == 0)
2138                 return false;
2139
2140         fragments = grp->bb_fragments;
2141         if (fragments == 0)
2142                 return false;
2143
2144         switch (cr) {
2145         case 0:
2146                 BUG_ON(ac->ac_2order == 0);
2147
2148                 /* Avoid using the first bg of a flexgroup for data files */
2149                 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
2150                     (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
2151                     ((group % flex_size) == 0))
2152                         return false;
2153
2154                 if (free < ac->ac_g_ex.fe_len)
2155                         return false;
2156
2157                 if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1)
2158                         return true;
2159
2160                 if (grp->bb_largest_free_order < ac->ac_2order)
2161                         return false;
2162
2163                 return true;
2164         case 1:
2165                 if ((free / fragments) >= ac->ac_g_ex.fe_len)
2166                         return true;
2167                 break;
2168         case 2:
2169                 if (free >= ac->ac_g_ex.fe_len)
2170                         return true;
2171                 break;
2172         case 3:
2173                 return true;
2174         default:
2175                 BUG();
2176         }
2177
2178         return false;
2179 }
2180
2181 /*
2182  * This could return negative error code if something goes wrong
2183  * during ext4_mb_init_group(). This should not be called with
2184  * ext4_lock_group() held.
2185  */
2186 static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
2187                                      ext4_group_t group, int cr)
2188 {
2189         struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2190         struct super_block *sb = ac->ac_sb;
2191         struct ext4_sb_info *sbi = EXT4_SB(sb);
2192         bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
2193         ext4_grpblk_t free;
2194         int ret = 0;
2195
2196         if (!grp)
2197                 return -EFSCORRUPTED;
2198         if (sbi->s_mb_stats)
2199                 atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
2200         if (should_lock)
2201                 ext4_lock_group(sb, group);
2202         free = grp->bb_free;
2203         if (free == 0)
2204                 goto out;
2205         if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2206                 goto out;
2207         if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2208                 goto out;
2209         if (should_lock)
2210                 ext4_unlock_group(sb, group);
2211
2212         /* We only do this if the grp has never been initialized */
2213         if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2214                 struct ext4_group_desc *gdp =
2215                         ext4_get_group_desc(sb, group, NULL);
2216                 int ret;
2217
2218                 /* cr=0/1 is a very optimistic search to find large
2219                  * good chunks almost for free.  If buddy data is not
2220                  * ready, then this optimization makes no sense.  But
2221                  * we never skip the first block group in a flex_bg,
2222                  * since this gets used for metadata block allocation,
2223                  * and we want to make sure we locate metadata blocks
2224                  * in the first block group in the flex_bg if possible.
2225                  */
2226                 if (cr < 2 &&
2227                     (!sbi->s_log_groups_per_flex ||
2228                      ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
2229                     !(ext4_has_group_desc_csum(sb) &&
2230                       (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
2231                         return 0;
2232                 ret = ext4_mb_init_group(sb, group, GFP_NOFS);
2233                 if (ret)
2234                         return ret;
2235         }
2236
2237         if (should_lock)
2238                 ext4_lock_group(sb, group);
2239         ret = ext4_mb_good_group(ac, group, cr);
2240 out:
2241         if (should_lock)
2242                 ext4_unlock_group(sb, group);
2243         return ret;
2244 }
2245
2246 /*
2247  * Start prefetching @nr block bitmaps starting at @group.
2248  * Return the next group which needs to be prefetched.
2249  */
2250 ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
2251                               unsigned int nr, int *cnt)
2252 {
2253         ext4_group_t ngroups = ext4_get_groups_count(sb);
2254         struct buffer_head *bh;
2255         struct blk_plug plug;
2256
2257         blk_start_plug(&plug);
2258         while (nr-- > 0) {
2259                 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
2260                                                                   NULL);
2261                 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
2262
2263                 /*
2264                  * Prefetch block groups with free blocks; but don't
2265                  * bother if it is marked uninitialized on disk, since
2266                  * it won't require I/O to read.  Also only try to
2267                  * prefetch once, so we avoid getblk() call, which can
2268                  * be expensive.
2269                  */
2270                 if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
2271                     EXT4_MB_GRP_NEED_INIT(grp) &&
2272                     ext4_free_group_clusters(sb, gdp) > 0 &&
2273                     !(ext4_has_group_desc_csum(sb) &&
2274                       (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
2275                         bh = ext4_read_block_bitmap_nowait(sb, group, true);
2276                         if (bh && !IS_ERR(bh)) {
2277                                 if (!buffer_uptodate(bh) && cnt)
2278                                         (*cnt)++;
2279                                 brelse(bh);
2280                         }
2281                 }
2282                 if (++group >= ngroups)
2283                         group = 0;
2284         }
2285         blk_finish_plug(&plug);
2286         return group;
2287 }
2288
2289 /*
2290  * Prefetching reads the block bitmap into the buffer cache; but we
2291  * need to make sure that the buddy bitmap in the page cache has been
2292  * initialized.  Note that ext4_mb_init_group() will block if the I/O
2293  * is not yet completed, or indeed if the I/O was never initiated
2294  * because ext4_mb_prefetch() did not start it.
2295  *
2296  * TODO: We should actually kick off the buddy bitmap setup in a work
2297  * queue when the buffer I/O is completed, so that we don't block
2298  * waiting for the block allocation bitmap read to finish when
2299  * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
2300  */
2301 void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
2302                            unsigned int nr)
2303 {
2304         while (nr-- > 0) {
2305                 struct ext4_group_desc *gdp;
2306                 struct ext4_group_info *grp;
2307
2308                 if (!group)
2309                         group = ext4_get_groups_count(sb);
2310                 group--;
2311                 gdp = ext4_get_group_desc(sb, group, NULL);
2312                 grp = ext4_get_group_info(sb, group);
2313
2314                 if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
2315                     ext4_free_group_clusters(sb, gdp) > 0 &&
2316                     !(ext4_has_group_desc_csum(sb) &&
2317                       (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
2318                         if (ext4_mb_init_group(sb, group, GFP_NOFS))
2319                                 break;
2320                 }
2321         }
2322 }
2323
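/*
 * Main scanning loop of the allocator: try the goal group first, then
 * sweep the groups at increasingly relaxed criteria (cr 0 = exact
 * power-of-two buddy match, cr 1 = average free extent large enough,
 * cr 2 = enough free clusters, cr 3 = take anything), prefetching block
 * bitmaps along the way, until an extent is found or every group has been
 * scanned at every criterion.
 */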
2324 static noinline_for_stack int
2325 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2326 {
2327         ext4_group_t prefetch_grp = 0, ngroups, group, i;
2328         int cr = -1;
2329         int err = 0, first_err = 0;
2330         unsigned int nr = 0, prefetch_ios = 0;
2331         struct ext4_sb_info *sbi;
2332         struct super_block *sb;
2333         struct ext4_buddy e4b;
2334         int lost;
2335
2336         sb = ac->ac_sb;
2337         sbi = EXT4_SB(sb);
2338         ngroups = ext4_get_groups_count(sb);
2339         /* non-extent files are limited to low blocks/groups */
2340         if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
2341                 ngroups = sbi->s_blockfile_groups;
2342
2343         BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2344
2345         /* first, try the goal */
2346         err = ext4_mb_find_by_goal(ac, &e4b);
2347         if (err || ac->ac_status == AC_STATUS_FOUND)
2348                 goto out;
2349
2350         if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2351                 goto out;
2352
2353         /*
2354          * ac->ac_2order is set only if the fe_len is a power of 2.
2355          * If ac->ac_2order is set, we also set the criteria to 0 so that
2356          * we try an exact allocation using the buddy data.
2357          */
2358         i = fls(ac->ac_g_ex.fe_len);
2359         ac->ac_2order = 0;
2360         /*
2361          * We search using buddy data only if the order of the request
2362          * is greater than or equal to sbi->s_mb_order2_reqs.
2363          * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req.
2364          * We also support searching for power-of-two requests only for
2365          * requests up to the maximum buddy size we have constructed.
2366          */
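        /*
         * Example: an 8-cluster request gives i = fls(8) = 4, and
         * 8 & ~(1 << 3) == 0, so ac_2order becomes 3 and the cr=0 pass
         * can look for a free order-3 buddy directly.
         */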
2367         if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
2368                 /*
2369                  * fe_len is an exact power of two iff only bit (i - 1) is set
2370                  */
2371                 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2372                         ac->ac_2order = array_index_nospec(i - 1,
2373                                                            sb->s_blocksize_bits + 2);
2374         }
2375
2376         /* if stream allocation is enabled, use global goal */
2377         if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2378                 /* TBD: may be hot point */
2379                 spin_lock(&sbi->s_md_lock);
2380                 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2381                 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2382                 spin_unlock(&sbi->s_md_lock);
2383         }
2384
2385         /* Let's just scan groups to find more or less suitable blocks */
2386         cr = ac->ac_2order ? 0 : 1;
2387         /*
2388          * cr == 0 try to get exact allocation,
2389          * cr == 3  try to get anything
2390          */
2391 repeat:
2392         for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2393                 ac->ac_criteria = cr;
2394                 /*
2395                  * searching for the right group start
2396                  * from the goal value specified
2397                  */
2398                 group = ac->ac_g_ex.fe_group;
2399                 prefetch_grp = group;
2400
2401                 for (i = 0; i < ngroups; group++, i++) {
2402                         int ret = 0;
2403                         cond_resched();
2404                         /*
2405                          * Artificially restricted ngroups for non-extent
2406                          * files makes group > ngroups possible on first loop.
2407                          */
2408                         if (group >= ngroups)
2409                                 group = 0;
2410
2411                         /*
2412                          * Batch reads of the block allocation bitmaps
2413                          * to get multiple READs in flight; limit
2414                          * prefetching at cr=0/1, otherwise mballoc can
2415                          * spend a lot of time loading imperfect groups
2416                          */
2417                         if ((prefetch_grp == group) &&
2418                             (cr > 1 ||
2419                              prefetch_ios < sbi->s_mb_prefetch_limit)) {
2420                                 unsigned int curr_ios = prefetch_ios;
2421
2422                                 nr = sbi->s_mb_prefetch;
2423                                 if (ext4_has_feature_flex_bg(sb)) {
2424                                         nr = 1 << sbi->s_log_groups_per_flex;
2425                                         nr -= group & (nr - 1);
2426                                         nr = min(nr, sbi->s_mb_prefetch);
2427                                 }
2428                                 prefetch_grp = ext4_mb_prefetch(sb, group,
2429                                                         nr, &prefetch_ios);
2430                                 if (prefetch_ios == curr_ios)
2431                                         nr = 0;
2432                         }
2433
2434                         /* This now checks without needing the buddy page */
2435                         ret = ext4_mb_good_group_nolock(ac, group, cr);
2436                         if (ret <= 0) {
2437                                 if (!first_err)
2438                                         first_err = ret;
2439                                 continue;
2440                         }
2441
2442                         err = ext4_mb_load_buddy(sb, group, &e4b);
2443                         if (err)
2444                                 goto out;
2445
2446                         ext4_lock_group(sb, group);
2447
2448                         /*
2449                          * We need to check again after locking the
2450                          * block group
2451                          */
2452                         ret = ext4_mb_good_group(ac, group, cr);
2453                         if (ret == 0) {
2454                                 ext4_unlock_group(sb, group);
2455                                 ext4_mb_unload_buddy(&e4b);
2456                                 continue;
2457                         }
2458
2459                         ac->ac_groups_scanned++;
2460                         if (cr == 0)
2461                                 ext4_mb_simple_scan_group(ac, &e4b);
2462                         else if (cr == 1 && sbi->s_stripe &&
2463                                         !(ac->ac_g_ex.fe_len % sbi->s_stripe))
2464                                 ext4_mb_scan_aligned(ac, &e4b);
2465                         else
2466                                 ext4_mb_complex_scan_group(ac, &e4b);
2467
2468                         ext4_unlock_group(sb, group);
2469                         ext4_mb_unload_buddy(&e4b);
2470
2471                         if (ac->ac_status != AC_STATUS_CONTINUE)
2472                                 break;
2473                 }
2474                 /* Processed all groups and haven't found blocks */
2475                 if (sbi->s_mb_stats && i == ngroups)
2476                         atomic64_inc(&sbi->s_bal_cX_failed[cr]);
2477         }
2478
2479         if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2480             !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2481                 /*
2482                  * We've been searching too long. Let's try to allocate
2483                  * the best chunk we've found so far
2484                  */
2485                 ext4_mb_try_best_found(ac, &e4b);
2486                 if (ac->ac_status != AC_STATUS_FOUND) {
2487                         /*
2488                          * Someone luckier has already allocated it.
2489                          * The only thing we can do is just take the first
2490                          * found block(s)
2491                          */
2492                         lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
2493                         mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
2494                                  ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
2495                                  ac->ac_b_ex.fe_len, lost);
2496
2497                         ac->ac_b_ex.fe_group = 0;
2498                         ac->ac_b_ex.fe_start = 0;
2499                         ac->ac_b_ex.fe_len = 0;
2500                         ac->ac_status = AC_STATUS_CONTINUE;
2501                         ac->ac_flags |= EXT4_MB_HINT_FIRST;
2502                         cr = 3;
2503                         goto repeat;
2504                 }
2505         }
2506
2507         if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
2508                 atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
2509 out:
2510         if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
2511                 err = first_err;
2512
2513         mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
2514                  ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
2515                  ac->ac_flags, cr, err);
2516
2517         if (nr)
2518                 ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
2519
2520         return err;
2521 }
2522
2523 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2524 {
2525         struct super_block *sb = PDE_DATA(file_inode(seq->file));
2526         ext4_group_t group;
2527
2528         if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2529                 return NULL;
2530         group = *pos + 1;
2531         return (void *) ((unsigned long) group);
2532 }
2533
2534 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2535 {
2536         struct super_block *sb = PDE_DATA(file_inode(seq->file));
2537         ext4_group_t group;
2538
2539         ++*pos;
2540         if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2541                 return NULL;
2542         group = *pos + 1;
2543         return (void *) ((unsigned long) group);
2544 }
2545
2546 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2547 {
2548         struct super_block *sb = PDE_DATA(file_inode(seq->file));
2549         ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2550         int i;
2551         int err, buddy_loaded = 0;
2552         struct ext4_buddy e4b;
2553         struct ext4_group_info *grinfo;
2554         unsigned char blocksize_bits = min_t(unsigned char,
2555                                              sb->s_blocksize_bits,
2556                                              EXT4_MAX_BLOCK_LOG_SIZE);
2557         struct sg {
2558                 struct ext4_group_info info;
2559                 ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
2560         } sg;
2561
2562         group--;
2563         if (group == 0)
2564                 seq_puts(seq, "#group: free  frags first ["
2565                               " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
2566                               " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");
2567
2568         i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2569                 sizeof(struct ext4_group_info);
2570
2571         grinfo = ext4_get_group_info(sb, group);
2572         if (!grinfo)
2573                 return 0;
2574         /* Load the group info in memory only if not already loaded. */
2575         if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
2576                 err = ext4_mb_load_buddy(sb, group, &e4b);
2577                 if (err) {
2578                         seq_printf(seq, "#%-5u: I/O error\n", group);
2579                         return 0;
2580                 }
2581                 buddy_loaded = 1;
2582         }
2583
2584         memcpy(&sg, grinfo, i);
2585
2586         if (buddy_loaded)
2587                 ext4_mb_unload_buddy(&e4b);
2588
2589         seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2590                         sg.info.bb_fragments, sg.info.bb_first_free);
2591         for (i = 0; i <= 13; i++)
2592                 seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
2593                                 sg.info.bb_counters[i] : 0);
2594         seq_puts(seq, " ]");
2595         if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info))
2596                 seq_puts(seq, " Block bitmap corrupted!");
2597         seq_puts(seq, "\n");
2598
2599         return 0;
2600 }
2601
2602 static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2603 {
2604 }
2605
2606 const struct seq_operations ext4_mb_seq_groups_ops = {
2607         .start  = ext4_mb_seq_groups_start,
2608         .next   = ext4_mb_seq_groups_next,
2609         .stop   = ext4_mb_seq_groups_stop,
2610         .show   = ext4_mb_seq_groups_show,
2611 };
2612
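/*
 * seq_file handler for the mballoc statistics file; the per-criterion
 * counters below are only collected while the mb_stats tunable is
 * enabled.
 */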
2613 int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
2614 {
2615         struct super_block *sb = (struct super_block *)seq->private;
2616         struct ext4_sb_info *sbi = EXT4_SB(sb);
2617
2618         seq_puts(seq, "mballoc:\n");
2619         if (!sbi->s_mb_stats) {
2620                 seq_puts(seq, "\tmb stats collection turned off.\n");
2621                 seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
2622                 return 0;
2623         }
2624         seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
2625         seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
2626
2627         seq_printf(seq, "\tgroups_scanned: %u\n",  atomic_read(&sbi->s_bal_groups_scanned));
2628
2629         seq_puts(seq, "\tcr0_stats:\n");
2630         seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
2631         seq_printf(seq, "\t\tgroups_considered: %llu\n",
2632                    atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
2633         seq_printf(seq, "\t\tuseless_loops: %llu\n",
2634                    atomic64_read(&sbi->s_bal_cX_failed[0]));
2635
2636         seq_puts(seq, "\tcr1_stats:\n");
2637         seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
2638         seq_printf(seq, "\t\tgroups_considered: %llu\n",
2639                    atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
2640         seq_printf(seq, "\t\tuseless_loops: %llu\n",
2641                    atomic64_read(&sbi->s_bal_cX_failed[1]));
2642
2643         seq_puts(seq, "\tcr2_stats:\n");
2644         seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
2645         seq_printf(seq, "\t\tgroups_considered: %llu\n",
2646                    atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
2647         seq_printf(seq, "\t\tuseless_loops: %llu\n",
2648                    atomic64_read(&sbi->s_bal_cX_failed[2]));
2649
2650         seq_puts(seq, "\tcr3_stats:\n");
2651         seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
2652         seq_printf(seq, "\t\tgroups_considered: %llu\n",
2653                    atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
2654         seq_printf(seq, "\t\tuseless_loops: %llu\n",
2655                    atomic64_read(&sbi->s_bal_cX_failed[3]));
2656         seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
2657         seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
2658         seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
2659         seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
2660         seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
2661
2662         seq_printf(seq, "\tbuddies_generated: %u/%u\n",
2663                    atomic_read(&sbi->s_mb_buddies_generated),
2664                    ext4_get_groups_count(sb));
2665         seq_printf(seq, "\tbuddies_time_used: %llu\n",
2666                    atomic64_read(&sbi->s_mb_generation_time));
2667         seq_printf(seq, "\tpreallocated: %u\n",
2668                    atomic_read(&sbi->s_mb_preallocated));
2669         seq_printf(seq, "\tdiscarded: %u\n",
2670                    atomic_read(&sbi->s_mb_discarded));
2671         return 0;
2672 }
2673
2674 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2675 {
2676         int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2677         struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2678
2679         BUG_ON(!cachep);
2680         return cachep;
2681 }
2682
2683 /*
2684  * Allocate the top-level s_group_info array for the specified number
2685  * of groups
2686  */
2687 int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
2688 {
2689         struct ext4_sb_info *sbi = EXT4_SB(sb);
2690         unsigned size;
2691         struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
2692
2693         size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
2694                 EXT4_DESC_PER_BLOCK_BITS(sb);
2695         if (size <= sbi->s_group_info_size)
2696                 return 0;
2697
2698         size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
2699         new_groupinfo = kvzalloc(size, GFP_KERNEL);
2700         if (!new_groupinfo) {
2701                 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2702                 return -ENOMEM;
2703         }
2704         rcu_read_lock();
2705         old_groupinfo = rcu_dereference(sbi->s_group_info);
2706         if (old_groupinfo)
2707                 memcpy(new_groupinfo, old_groupinfo,
2708                        sbi->s_group_info_size * sizeof(*sbi->s_group_info));
2709         rcu_read_unlock();
2710         rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
2711         sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
2712         if (old_groupinfo)
2713                 ext4_kvfree_array_rcu(old_groupinfo);
2714         ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 
2715                    sbi->s_group_info_size);
2716         return 0;
2717 }
2718
2719 /* Create and initialize ext4_group_info data for the given group. */
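/*
 * s_group_info is a two-level array: the top level has one slot per block
 * of group descriptors, and each slot points to a lazily allocated table
 * of EXT4_DESC_PER_BLOCK ext4_group_info pointers; this function fills in
 * the entry for @group, allocating the table when @group is the first in
 * its descriptor block.
 */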
2720 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2721                           struct ext4_group_desc *desc)
2722 {
2723         int i;
2724         int metalen = 0;
2725         int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
2726         struct ext4_sb_info *sbi = EXT4_SB(sb);
2727         struct ext4_group_info **meta_group_info;
2728         struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2729
2730         /*
2731          * First check if this group is the first in a block of group
2732          * descriptors.  If it is, we have to allocate a new table of
2733          * pointers to ext4_group_info structures
2734          */
2735         if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2736                 metalen = sizeof(*meta_group_info) <<
2737                         EXT4_DESC_PER_BLOCK_BITS(sb);
2738                 meta_group_info = kmalloc(metalen, GFP_NOFS);
2739                 if (meta_group_info == NULL) {
2740                         ext4_msg(sb, KERN_ERR, "can't allocate mem "
2741                                  "for a buddy group");
2742                         goto exit_meta_group_info;
2743                 }
2744                 rcu_read_lock();
2745                 rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
2746                 rcu_read_unlock();
2747         }
2748
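        /*
         * idx selects the second-level table (one per descriptor block worth
         * of groups); i is this group's slot within that table.
         */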
2749         meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
2750         i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2751
2752         meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
2753         if (meta_group_info[i] == NULL) {
2754                 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2755                 goto exit_group_info;
2756         }
2757         set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2758                 &(meta_group_info[i]->bb_state));
2759
2760         /*
2761          * initialize bb_free to be able to skip
2762          * empty groups without initialization
2763          */
2764         if (ext4_has_group_desc_csum(sb) &&
2765             (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
2766                 meta_group_info[i]->bb_free =
2767                         ext4_free_clusters_after_init(sb, group, desc);
2768         } else {
2769                 meta_group_info[i]->bb_free =
2770                         ext4_free_group_clusters(sb, desc);
2771         }
2772
2773         INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2774         init_rwsem(&meta_group_info[i]->alloc_sem);
2775         meta_group_info[i]->bb_free_root = RB_ROOT;
2776         meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
2777
2778         mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
2779         return 0;
2780
2781 exit_group_info:
2782         /* If a meta_group_info table has been allocated, release it now */
2783         if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2784                 struct ext4_group_info ***group_info;
2785
2786                 rcu_read_lock();
2787                 group_info = rcu_dereference(sbi->s_group_info);
2788                 kfree(group_info[idx]);
2789                 group_info[idx] = NULL;
2790                 rcu_read_unlock();
2791         }
2792 exit_meta_group_info:
2793         return -ENOMEM;
2794 } /* ext4_mb_add_groupinfo */
2795
2796 static int ext4_mb_init_backend(struct super_block *sb)
2797 {
2798         ext4_group_t ngroups = ext4_get_groups_count(sb);
2799         ext4_group_t i;
2800         struct ext4_sb_info *sbi = EXT4_SB(sb);
2801         int err;
2802         struct ext4_group_desc *desc;
2803         struct ext4_group_info ***group_info;
2804         struct kmem_cache *cachep;
2805
2806         err = ext4_mb_alloc_groupinfo(sb, ngroups);
2807         if (err)
2808                 return err;
2809
2810         sbi->s_buddy_cache = new_inode(sb);
2811         if (sbi->s_buddy_cache == NULL) {
2812                 ext4_msg(sb, KERN_ERR, "can't get new inode");
2813                 goto err_freesgi;
2814         }
2815         /* To avoid potentially colliding with a valid on-disk inode number,
2816          * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
2817          * not in the inode hash, so it should never be found by iget(), but
2818          * this will avoid confusion if it ever shows up during debugging. */
2819         sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
2820         EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2821         for (i = 0; i < ngroups; i++) {
2822                 cond_resched();
2823                 desc = ext4_get_group_desc(sb, i, NULL);
2824                 if (desc == NULL) {
2825                         ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
2826                         goto err_freebuddy;
2827                 }
2828                 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2829                         goto err_freebuddy;
2830         }
2831
2832         if (ext4_has_feature_flex_bg(sb)) {
2833                 /* a single flex group is supposed to be read by a single IO.
2834                  * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is
2835                  * an unsigned integer, so the maximum shift is 32.
2836                  */
2837                 if (sbi->s_es->s_log_groups_per_flex >= 32) {
2838                         ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
2839                         goto err_freebuddy;
2840                 }
2841                 sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
2842                         BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
2843                 sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
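                /*
                 * Worked example (assuming 4k blocks and the common 16 groups
                 * per flex group): BLK_MAX_SEGMENT_SIZE (64k) >> 3 = 8192, so
                 * s_mb_prefetch = min(16, 8192) * 8 = 128 groups.
                 */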
2844         } else {
2845                 sbi->s_mb_prefetch = 32;
2846         }
2847         if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
2848                 sbi->s_mb_prefetch = ext4_get_groups_count(sb);
2849         /* Decide how many real I/Os to prefetch within a single allocation
2850          * at cr=0. Since cr=0 is a CPU-related optimization we shouldn't try
2851          * to load too many groups; at some point we should start to use what
2852          * we've got in memory.
2853          * With an average random access time of 5ms, it'd take a second to
2854          * get 200 groups (* N with flex_bg), so let's make this limit 4.
2855          */
2856         sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
2857         if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
2858                 sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
2859
2860         return 0;
2861
2862 err_freebuddy:
2863         cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2864         while (i-- > 0) {
2865                 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
2866
2867                 if (grp)
2868                         kmem_cache_free(cachep, grp);
2869         }
2870         i = sbi->s_group_info_size;
2871         rcu_read_lock();
2872         group_info = rcu_dereference(sbi->s_group_info);
2873         while (i-- > 0)
2874                 kfree(group_info[i]);
2875         rcu_read_unlock();
2876         iput(sbi->s_buddy_cache);
2877 err_freesgi:
2878         rcu_read_lock();
2879         kvfree(rcu_dereference(sbi->s_group_info));
2880         rcu_read_unlock();
2881         return -ENOMEM;
2882 }
2883
2884 static void ext4_groupinfo_destroy_slabs(void)
2885 {
2886         int i;
2887
2888         for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2889                 kmem_cache_destroy(ext4_groupinfo_caches[i]);
2890                 ext4_groupinfo_caches[i] = NULL;
2891         }
2892 }
2893
2894 static int ext4_groupinfo_create_slab(size_t size)
2895 {
2896         static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2897         int slab_size;
2898         int blocksize_bits = order_base_2(size);
2899         int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2900         struct kmem_cache *cachep;
2901
2902         if (cache_index >= NR_GRPINFO_CACHES)
2903                 return -EINVAL;
2904
2905         if (unlikely(cache_index < 0))
2906                 cache_index = 0;
2907
2908         mutex_lock(&ext4_grpinfo_slab_create_mutex);
2909         if (ext4_groupinfo_caches[cache_index]) {
2910                 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2911                 return 0;       /* Already created */
2912         }
2913
2914         slab_size = offsetof(struct ext4_group_info,
2915                                 bb_counters[blocksize_bits + 2]);
2916
2917         cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2918                                         slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2919                                         NULL);
2920
2921         ext4_groupinfo_caches[cache_index] = cachep;
2922
2923         mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2924         if (!cachep) {
2925                 printk(KERN_EMERG
2926                        "EXT4-fs: no memory for groupinfo slab cache\n");
2927                 return -ENOMEM;
2928         }
2929
2930         return 0;
2931 }
2932
2933 int ext4_mb_init(struct super_block *sb)
2934 {
2935         struct ext4_sb_info *sbi = EXT4_SB(sb);
2936         unsigned i, j;
2937         unsigned offset, offset_incr;
2938         unsigned max;
2939         int ret;
2940
2941         i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2942
2943         sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2944         if (sbi->s_mb_offsets == NULL) {
2945                 ret = -ENOMEM;
2946                 goto out;
2947         }
2948
2949         i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2950         sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2951         if (sbi->s_mb_maxs == NULL) {
2952                 ret = -ENOMEM;
2953                 goto out;
2954         }
2955
2956         ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2957         if (ret < 0)
2958                 goto out;
2959
2960         /* order 0 is regular bitmap */
2961         sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2962         sbi->s_mb_offsets[0] = 0;
2963
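        /*
         * For each buddy order k >= 1, s_mb_offsets[k] is the byte offset of
         * that order's bitmap within the group's buddy block, and
         * s_mb_maxs[k] is the number of bits it holds ((blocksize * 8) >> k).
         * For example, with 4k blocks: offsets = 0, 2048, 3072, 3584, ... and
         * maxs = 16384, 8192, 4096, ... for orders 1 through
         * s_blocksize_bits + 1.
         */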
2964         i = 1;
2965         offset = 0;
2966         offset_incr = 1 << (sb->s_blocksize_bits - 1);
2967         max = sb->s_blocksize << 2;
2968         do {
2969                 sbi->s_mb_offsets[i] = offset;
2970                 sbi->s_mb_maxs[i] = max;
2971                 offset += offset_incr;
2972                 offset_incr = offset_incr >> 1;
2973                 max = max >> 1;
2974                 i++;
2975         } while (i <= sb->s_blocksize_bits + 1);
2976
2977         spin_lock_init(&sbi->s_md_lock);
2978         sbi->s_mb_free_pending = 0;
2979         INIT_LIST_HEAD(&sbi->s_freed_data_list);
2980
2981         sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2982         sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2983         sbi->s_mb_stats = MB_DEFAULT_STATS;
2984         sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2985         sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2986         sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
2987         /*
2988          * The default group preallocation is 512, which for 4k block
2989          * sizes translates to 2 megabytes.  However for bigalloc file
2990          * systems, this is probably too big (i.e, if the cluster size
2991          * is 1 megabyte, then group preallocation size becomes half a
2992          * gigabyte!).  As a default, we will keep a two megabyte
2993          * group pralloc size for cluster sizes up to 64k, and after
2994          * group prealloc size for cluster sizes up to 64k, and after
2995          * 32 clusters.  This translates to 8 megs when the cluster
2996          * size is 256k, and 32 megs when the cluster size is 1 meg,
2997          * which seems reasonable as a default.
2998          */
2999         sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
3000                                        sbi->s_cluster_bits, 32);
3001         /*
3002          * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
3003          * to the lowest multiple of s_stripe which is bigger than
3004          * the s_mb_group_prealloc as determined above. We want
3005          * the preallocation size to be an exact multiple of the
3006          * RAID stripe size so that preallocations don't fragment
3007          * the stripes.
3008          */
3009         if (sbi->s_stripe > 1) {
3010                 sbi->s_mb_group_prealloc = roundup(
3011                         sbi->s_mb_group_prealloc, sbi->s_stripe);
3012         }
3013
3014         sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
3015         if (sbi->s_locality_groups == NULL) {
3016                 ret = -ENOMEM;
3017                 goto out;
3018         }
3019         for_each_possible_cpu(i) {
3020                 struct ext4_locality_group *lg;
3021                 lg = per_cpu_ptr(sbi->s_locality_groups, i);
3022                 mutex_init(&lg->lg_mutex);
3023                 for (j = 0; j < PREALLOC_TB_SIZE; j++)
3024                         INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
3025                 spin_lock_init(&lg->lg_prealloc_lock);
3026         }
3027
3028         /* init file for buddy data */
3029         ret = ext4_mb_init_backend(sb);
3030         if (ret != 0)
3031                 goto out_free_locality_groups;
3032
3033         return 0;
3034
3035 out_free_locality_groups:
3036         free_percpu(sbi->s_locality_groups);
3037         sbi->s_locality_groups = NULL;
3038 out:
3039         kfree(sbi->s_mb_offsets);
3040         sbi->s_mb_offsets = NULL;
3041         kfree(sbi->s_mb_maxs);
3042         sbi->s_mb_maxs = NULL;
3043         return ret;
3044 }
3045
3046 /* needs to be called with the ext4 group lock held */
3047 static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
3048 {
3049         struct ext4_prealloc_space *pa;
3050         struct list_head *cur, *tmp;
3051         int count = 0;
3052
3053         list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
3054                 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3055                 list_del(&pa->pa_group_list);
3056                 count++;
3057                 kmem_cache_free(ext4_pspace_cachep, pa);
3058         }
3059         return count;
3060 }
3061
3062 int ext4_mb_release(struct super_block *sb)
3063 {
3064         ext4_group_t ngroups = ext4_get_groups_count(sb);
3065         ext4_group_t i;
3066         int num_meta_group_infos;
3067         struct ext4_group_info *grinfo, ***group_info;
3068         struct ext4_sb_info *sbi = EXT4_SB(sb);
3069         struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
3070         int count;
3071
3072         if (sbi->s_group_info) {
3073                 for (i = 0; i < ngroups; i++) {
3074                         cond_resched();
3075                         grinfo = ext4_get_group_info(sb, i);
3076                         if (!grinfo)
3077                                 continue;
3078                         mb_group_bb_bitmap_free(grinfo);
3079                         ext4_lock_group(sb, i);
3080                         count = ext4_mb_cleanup_pa(grinfo);
3081                         if (count)
3082                                 mb_debug(sb, "mballoc: %d PAs left\n",
3083                                          count);
3084                         ext4_unlock_group(sb, i);
3085                         kmem_cache_free(cachep, grinfo);
3086                 }
3087                 num_meta_group_infos = (ngroups +
3088                                 EXT4_DESC_PER_BLOCK(sb) - 1) >>
3089                         EXT4_DESC_PER_BLOCK_BITS(sb);
3090                 rcu_read_lock();
3091                 group_info = rcu_dereference(sbi->s_group_info);
3092                 for (i = 0; i < num_meta_group_infos; i++)
3093                         kfree(group_info[i]);
3094                 kvfree(group_info);
3095                 rcu_read_unlock();
3096         }
3097         kfree(sbi->s_mb_offsets);
3098         kfree(sbi->s_mb_maxs);
3099         iput(sbi->s_buddy_cache);
3100         if (sbi->s_mb_stats) {
3101                 ext4_msg(sb, KERN_INFO,
3102                        "mballoc: %u blocks %u reqs (%u success)",
3103                                 atomic_read(&sbi->s_bal_allocated),
3104                                 atomic_read(&sbi->s_bal_reqs),
3105                                 atomic_read(&sbi->s_bal_success));
3106                 ext4_msg(sb, KERN_INFO,
3107                       "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
3108                                 "%u 2^N hits, %u breaks, %u lost",
3109                                 atomic_read(&sbi->s_bal_ex_scanned),
3110                                 atomic_read(&sbi->s_bal_groups_scanned),
3111                                 atomic_read(&sbi->s_bal_goals),
3112                                 atomic_read(&sbi->s_bal_2orders),
3113                                 atomic_read(&sbi->s_bal_breaks),
3114                                 atomic_read(&sbi->s_mb_lost_chunks));
3115                 ext4_msg(sb, KERN_INFO,
3116                        "mballoc: %u generated and it took %llu",
3117                                 atomic_read(&sbi->s_mb_buddies_generated),
3118                                 atomic64_read(&sbi->s_mb_generation_time));
3119                 ext4_msg(sb, KERN_INFO,
3120                        "mballoc: %u preallocated, %u discarded",
3121                                 atomic_read(&sbi->s_mb_preallocated),
3122                                 atomic_read(&sbi->s_mb_discarded));
3123         }
3124
3125         free_percpu(sbi->s_locality_groups);
3126
3127         return 0;
3128 }
3129
3130 static inline int ext4_issue_discard(struct super_block *sb,
3131                 ext4_group_t block_group, ext4_grpblk_t cluster, int count,
3132                 struct bio **biop)
3133 {
3134         ext4_fsblk_t discard_block;
3135
3136         discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
3137                          ext4_group_first_block_no(sb, block_group));
3138         count = EXT4_C2B(EXT4_SB(sb), count);
3139         trace_ext4_discard_blocks(sb,
3140                         (unsigned long long) discard_block, count);
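        /*
         * __blkdev_issue_discard() works in 512-byte sectors, hence the
         * (s_blocksize_bits - 9) shifts below; sb_issue_discard() performs
         * the same block-to-sector conversion internally.
         */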
3141         if (biop) {
3142                 return __blkdev_issue_discard(sb->s_bdev,
3143                         (sector_t)discard_block << (sb->s_blocksize_bits - 9),
3144                         (sector_t)count << (sb->s_blocksize_bits - 9),
3145                         GFP_NOFS, 0, biop);
3146         } else
3147                 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
3148 }
3149
3150 static void ext4_free_data_in_buddy(struct super_block *sb,
3151                                     struct ext4_free_data *entry)
3152 {
3153         struct ext4_buddy e4b;
3154         struct ext4_group_info *db;
3155         int err, count = 0, count2 = 0;
3156
3157         mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
3158                  entry->efd_count, entry->efd_group, entry);
3159
3160         err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
3161         /* we expect to find the existing buddy because it's pinned */
3162         BUG_ON(err != 0);
3163
3164         spin_lock(&EXT4_SB(sb)->s_md_lock);
3165         EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
3166         spin_unlock(&EXT4_SB(sb)->s_md_lock);
3167
3168         db = e4b.bd_info;
3169         /* there are blocks to put in buddy to make them really free */
3170         count += entry->efd_count;
3171         count2++;
3172         ext4_lock_group(sb, entry->efd_group);
3173         /* Take it out of per group rb tree */
3174         rb_erase(&entry->efd_node, &(db->bb_free_root));
3175         mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
3176
3177         /*
3178          * Clear the trimmed flag for the group so that the next
3179          * ext4_trim_fs can trim it.
3180          * If the volume is mounted with -o discard, online discard
3181          * is supported and the free blocks will be trimmed online.
3182          */
3183         if (!test_opt(sb, DISCARD))
3184                 EXT4_MB_GRP_CLEAR_TRIMMED(db);
3185
3186         if (!db->bb_free_root.rb_node) {
3187                 /* No more items in the per group rb tree;
3188                  * balance refcounts from ext4_mb_free_metadata().
3189                  */
3190                 put_page(e4b.bd_buddy_page);
3191                 put_page(e4b.bd_bitmap_page);
3192         }
3193         ext4_unlock_group(sb, entry->efd_group);
3194         kmem_cache_free(ext4_free_data_cachep, entry);
3195         ext4_mb_unload_buddy(&e4b);
3196
3197         mb_debug(sb, "freed %d blocks in %d structures\n", count,
3198                  count2);
3199 }
3200
3201 /*
3202  * This function is called by the jbd2 layer once the commit has finished,
3203  * so we know we can free the blocks that were released with that commit.
3204  */
3205 void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
3206 {
3207         struct ext4_sb_info *sbi = EXT4_SB(sb);
3208         struct ext4_free_data *entry, *tmp;
3209         struct bio *discard_bio = NULL;
3210         struct list_head freed_data_list;
3211         struct list_head *cut_pos = NULL;
3212         int err;
3213
3214         INIT_LIST_HEAD(&freed_data_list);
3215
3216         spin_lock(&sbi->s_md_lock);
3217         list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
3218                 if (entry->efd_tid != commit_tid)
3219                         break;
3220                 cut_pos = &entry->efd_list;
3221         }
3222         if (cut_pos)
3223                 list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
3224                                   cut_pos);
3225         spin_unlock(&sbi->s_md_lock);
3226
3227         if (test_opt(sb, DISCARD)) {
3228                 list_for_each_entry(entry, &freed_data_list, efd_list) {
3229                         err = ext4_issue_discard(sb, entry->efd_group,
3230                                                  entry->efd_start_cluster,
3231                                                  entry->efd_count,
3232                                                  &discard_bio);
3233                         if (err && err != -EOPNOTSUPP) {
3234                                 ext4_msg(sb, KERN_WARNING, "discard request in"
3235                                          " group:%d block:%d count:%d failed"
3236                                          " with %d", entry->efd_group,
3237                                          entry->efd_start_cluster,
3238                                          entry->efd_count, err);
3239                         } else if (err == -EOPNOTSUPP)
3240                                 break;
3241                 }
3242
3243                 if (discard_bio) {
3244                         submit_bio_wait(discard_bio);
3245                         bio_put(discard_bio);
3246                 }
3247         }
3248
3249         list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
3250                 ext4_free_data_in_buddy(sb, entry);
3251 }
3252
3253 int __init ext4_init_mballoc(void)
3254 {
3255         ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
3256                                         SLAB_RECLAIM_ACCOUNT);
3257         if (ext4_pspace_cachep == NULL)
3258                 goto out;
3259
3260         ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
3261                                     SLAB_RECLAIM_ACCOUNT);
3262         if (ext4_ac_cachep == NULL)
3263                 goto out_pa_free;
3264
3265         ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
3266                                            SLAB_RECLAIM_ACCOUNT);
3267         if (ext4_free_data_cachep == NULL)
3268                 goto out_ac_free;
3269
3270         return 0;
3271
3272 out_ac_free:
3273         kmem_cache_destroy(ext4_ac_cachep);
3274 out_pa_free:
3275         kmem_cache_destroy(ext4_pspace_cachep);
3276 out:
3277         return -ENOMEM;
3278 }
3279
3280 void ext4_exit_mballoc(void)
3281 {
3282         /*
3283          * Wait for completion of call_rcu()'s on ext4_pspace_cachep
3284          * before destroying the slab cache.
3285          */
3286         rcu_barrier();
3287         kmem_cache_destroy(ext4_pspace_cachep);
3288         kmem_cache_destroy(ext4_ac_cachep);
3289         kmem_cache_destroy(ext4_free_data_cachep);
3290         ext4_groupinfo_destroy_slabs();
3291 }
3292
3293
3294 /*
3295  * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
3296  * Returns 0 on success or an error code on failure
3297  */
3298 static noinline_for_stack int
3299 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
3300                                 handle_t *handle, unsigned int reserv_clstrs)
3301 {
3302         struct buffer_head *bitmap_bh = NULL;
3303         struct ext4_group_desc *gdp;
3304         struct buffer_head *gdp_bh;
3305         struct ext4_sb_info *sbi;
3306         struct super_block *sb;
3307         ext4_fsblk_t block;
3308         int err, len;
3309
3310         BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3311         BUG_ON(ac->ac_b_ex.fe_len <= 0);
3312
3313         sb = ac->ac_sb;
3314         sbi = EXT4_SB(sb);
3315
3316         bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
3317         if (IS_ERR(bitmap_bh)) {
3318                 err = PTR_ERR(bitmap_bh);
3319                 bitmap_bh = NULL;
3320                 goto out_err;
3321         }
3322
3323         BUFFER_TRACE(bitmap_bh, "getting write access");
3324         err = ext4_journal_get_write_access(handle, bitmap_bh);
3325         if (err)
3326                 goto out_err;
3327
3328         err = -EIO;
3329         gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
3330         if (!gdp)
3331                 goto out_err;
3332
3333         ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
3334                         ext4_free_group_clusters(sb, gdp));
3335
3336         BUFFER_TRACE(gdp_bh, "get_write_access");
3337         err = ext4_journal_get_write_access(handle, gdp_bh);
3338         if (err)
3339                 goto out_err;
3340
3341         block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3342
3343         len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3344         if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
3345                 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
3346                            "fs metadata", block, block+len);
3347                 /* The file system is mounted not to panic on error, so
3348                  * fix the bitmap and return EFSCORRUPTED.
3349                  * We leak some of the blocks here.
3350                  */
3351                 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3352                 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
3353                               ac->ac_b_ex.fe_len);
3354                 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3355                 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3356                 if (!err)
3357                         err = -EFSCORRUPTED;
3358                 goto out_err;
3359         }
3360
3361         ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3362 #ifdef AGGRESSIVE_CHECK
3363         {
3364                 int i;
3365                 for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
3366                         BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
3367                                                 bitmap_bh->b_data));
3368                 }
3369         }
3370 #endif
3371         ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
3372                       ac->ac_b_ex.fe_len);
3373         if (ext4_has_group_desc_csum(sb) &&
3374             (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
3375                 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3376                 ext4_free_group_clusters_set(sb, gdp,
3377                                              ext4_free_clusters_after_init(sb,
3378                                                 ac->ac_b_ex.fe_group, gdp));
3379         }
3380         len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
3381         ext4_free_group_clusters_set(sb, gdp, len);
3382         ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
3383         ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
3384
3385         ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3386         percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
3387         /*
3388          * Now reduce the dirty block count as well; it should not go negative.
3389          */
3390         if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
3391                 /* release all the reserved blocks if non delalloc */
3392                 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
3393                                    reserv_clstrs);
3394
3395         if (sbi->s_log_groups_per_flex) {
3396                 ext4_group_t flex_group = ext4_flex_group(sbi,
3397                                                           ac->ac_b_ex.fe_group);
3398                 atomic64_sub(ac->ac_b_ex.fe_len,
3399                              &sbi_array_rcu_deref(sbi, s_flex_groups,
3400                                                   flex_group)->free_clusters);
3401         }
3402
3403         err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3404         if (err)
3405                 goto out_err;
3406         err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
3407
3408 out_err:
3409         brelse(bitmap_bh);
3410         return err;
3411 }
3412
3413 /*
3414  * Idempotent helper for Ext4 fast commit replay path to set the state of
3415  * blocks in bitmaps and update counters.
3416  */
3417 void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
3418                         int len, int state)
3419 {
3420         struct buffer_head *bitmap_bh = NULL;
3421         struct ext4_group_desc *gdp;
3422         struct buffer_head *gdp_bh;
3423         struct ext4_sb_info *sbi = EXT4_SB(sb);
3424         ext4_group_t group;
3425         ext4_grpblk_t blkoff;
3426         int i, err;
3427         int already;
3428         unsigned int clen, clen_changed, thisgrp_len;
3429
3430         while (len > 0) {
3431                 ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
3432
3433                 /*
3434                  * Check to see if we are freeing blocks across a group
3435                  * boundary.
3436                  * In case of flex_bg, it can happen that (block, len)
3437                  * spans more than one group. In that case we need to
3438                  * get the corresponding group metadata to work with,
3439                  * which the surrounding while loop does one group at a time.
3440                  */
3441                 thisgrp_len = min_t(unsigned int, (unsigned int)len,
3442                         EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
3443                 clen = EXT4_NUM_B2C(sbi, thisgrp_len);
3444
3445                 bitmap_bh = ext4_read_block_bitmap(sb, group);
3446                 if (IS_ERR(bitmap_bh)) {
3447                         err = PTR_ERR(bitmap_bh);
3448                         bitmap_bh = NULL;
3449                         break;
3450                 }
3451
3452                 err = -EIO;
3453                 gdp = ext4_get_group_desc(sb, group, &gdp_bh);
3454                 if (!gdp)
3455                         break;
3456
3457                 ext4_lock_group(sb, group);
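                /*
                 * Count clusters that are already in the requested state so
                 * that the free-cluster accounting below only changes by the
                 * delta (clen_changed); this keeps the replay idempotent.
                 */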
3458                 already = 0;
3459                 for (i = 0; i < clen; i++)
3460                         if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
3461                                          !state)
3462                                 already++;
3463
3464                 clen_changed = clen - already;
3465                 if (state)
3466                         ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
3467                 else
3468                         mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
3469                 if (ext4_has_group_desc_csum(sb) &&
3470                     (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
3471                         gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3472                         ext4_free_group_clusters_set(sb, gdp,
3473                              ext4_free_clusters_after_init(sb, group, gdp));
3474                 }
3475                 if (state)
3476                         clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
3477                 else
3478                         clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
3479
3480                 ext4_free_group_clusters_set(sb, gdp, clen);
3481                 ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
3482                 ext4_group_desc_csum_set(sb, group, gdp);
3483
3484                 ext4_unlock_group(sb, group);
3485
3486                 if (sbi->s_log_groups_per_flex) {
3487                         ext4_group_t flex_group = ext4_flex_group(sbi, group);
3488                         struct flex_groups *fg = sbi_array_rcu_deref(sbi,
3489                                                    s_flex_groups, flex_group);
3490
3491                         if (state)
3492                                 atomic64_sub(clen_changed, &fg->free_clusters);
3493                         else
3494                                 atomic64_add(clen_changed, &fg->free_clusters);
3495
3496                 }
3497
3498                 err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
3499                 if (err)
3500                         break;
3501                 sync_dirty_buffer(bitmap_bh);
3502                 err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
3503                 sync_dirty_buffer(gdp_bh);
3504                 if (err)
3505                         break;
3506
3507                 block += thisgrp_len;
3508                 len -= thisgrp_len;
3509                 brelse(bitmap_bh);
3510                 BUG_ON(len < 0);
3511         }
3512
3513         if (err)
3514                 brelse(bitmap_bh);
3515 }
3516
3517 /*
3518  * Here we normalize the request for a locality group.
3519  * Group requests are normalized to s_mb_group_prealloc, which is rounded
3520  * up to a multiple of s_stripe when a stripe size is set via mount option.
3521  * s_mb_group_prealloc can be configured via
3522  * /sys/fs/ext4/<partition>/mb_group_prealloc
3523  *
3524  * XXX: should we try to preallocate more than the group has now?
3525  */
3526 static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3527 {
3528         struct super_block *sb = ac->ac_sb;
3529         struct ext4_locality_group *lg = ac->ac_lg;
3530
3531         BUG_ON(lg == NULL);
3532         ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3533         mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
3534 }
3535
3536 /*
3537  * Normalization means making request better in terms of
3538  * size and alignment
3539  */
3540 static noinline_for_stack void
3541 ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3542                                 struct ext4_allocation_request *ar)
3543 {
3544         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3545         struct ext4_super_block *es = sbi->s_es;
3546         int bsbits, max;
3547         loff_t size, start_off, end;
3548         loff_t orig_size __maybe_unused;
3549         ext4_lblk_t start;
3550         struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3551         struct ext4_prealloc_space *pa;
3552
3553         /* only normalize data requests; metadata requests
3554            do not need preallocation */
3555         if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3556                 return;
3557
3558         /* sometimes the caller may want exact blocks */
3559         if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3560                 return;
3561
3562         /* caller may indicate that preallocation isn't
3563          * required (it's a tail, for example) */
3564         if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
3565                 return;
3566
3567         if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
3568                 ext4_mb_normalize_group_request(ac);
3569                 return ;
3570         }
3571
3572         bsbits = ac->ac_sb->s_blocksize_bits;
3573
3574         /* first, determine the actual file size
3575          * assuming the current request is allocated */
3576         size = extent_logical_end(sbi, &ac->ac_o_ex);
3577         size = size << bsbits;
3578         if (size < i_size_read(ac->ac_inode))
3579                 size = i_size_read(ac->ac_inode);
3580         orig_size = size;
3581
3582         /* max size of free chunks */
3583         max = 2 << bsbits;
3584
3585 #define NRL_CHECK_SIZE(req, size, max, chunk_size)      \
3586                 (req <= (size) || max <= (chunk_size))
3587
3588         /* first, try to predict filesize */
3589         /* XXX: should this table be tunable? */
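        /*
         * The chain below rounds the predicted size up to the next bucket
         * (16k, 32k, ..., 1MB). Beyond 1MB the logical start is instead
         * aligned to a 2MB/4MB/8MB boundary so that large files end up with
         * large, aligned preallocations; past that, the original request's
         * start and length are used as is.
         */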
3590         start_off = 0;
3591         if (size <= 16 * 1024) {
3592                 size = 16 * 1024;
3593         } else if (size <= 32 * 1024) {
3594                 size = 32 * 1024;
3595         } else if (size <= 64 * 1024) {
3596                 size = 64 * 1024;
3597         } else if (size <= 128 * 1024) {
3598                 size = 128 * 1024;
3599         } else if (size <= 256 * 1024) {
3600                 size = 256 * 1024;
3601         } else if (size <= 512 * 1024) {
3602                 size = 512 * 1024;
3603         } else if (size <= 1024 * 1024) {
3604                 size = 1024 * 1024;
3605         } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
3606                 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3607                                                 (21 - bsbits)) << 21;
3608                 size = 2 * 1024 * 1024;
3609         } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
3610                 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3611                                                         (22 - bsbits)) << 22;
3612                 size = 4 * 1024 * 1024;
3613         } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
3614                                         (8<<20)>>bsbits, max, 8 * 1024)) {
3615                 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3616                                                         (23 - bsbits)) << 23;
3617                 size = 8 * 1024 * 1024;
3618         } else {
3619                 start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
3620                 size      = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
3621                                               ac->ac_o_ex.fe_len) << bsbits;
3622         }
3623         size = size >> bsbits;
3624         start = start_off >> bsbits;
3625
3626         /*
3627          * For tiny groups (smaller than 8MB) the chosen allocation
3628          * alignment may be larger than group size. Make sure the
3629          * alignment does not move allocation to a different group which
3630          * makes mballoc fail assertions later.
3631          */
3632         start = max(start, rounddown(ac->ac_o_ex.fe_logical,
3633                         (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
3634
3635         /* avoid unnecessary preallocation that may trigger assertions */
3636         if (start + size > EXT_MAX_BLOCKS)
3637                 size = EXT_MAX_BLOCKS - start;
3638
3639         /* don't cover already allocated blocks in selected range */
3640         if (ar->pleft && start <= ar->lleft) {
3641                 size -= ar->lleft + 1 - start;
3642                 start = ar->lleft + 1;
3643         }
3644         if (ar->pright && start + size - 1 >= ar->lright)
3645                 size -= start + size - ar->lright;
3646
3647         /*
3648          * Trim allocation request for filesystems with artificially small
3649          * groups.
3650          */
3651         if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
3652                 size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
3653
3654         end = start + size;
3655
3656         /* check we don't cross already preallocated blocks */
3657         rcu_read_lock();
3658         list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3659                 loff_t pa_end;
3660
3661                 if (pa->pa_deleted)
3662                         continue;
3663                 spin_lock(&pa->pa_lock);
3664                 if (pa->pa_deleted) {
3665                         spin_unlock(&pa->pa_lock);
3666                         continue;
3667                 }
3668
3669                 pa_end = pa_logical_end(EXT4_SB(ac->ac_sb), pa);
3670
3671                 /* PA must not overlap original request */
3672                 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3673                         ac->ac_o_ex.fe_logical < pa->pa_lstart));
3674
3675                 /* skip PAs this normalized request doesn't overlap with */
3676                 if (pa->pa_lstart >= end || pa_end <= start) {
3677                         spin_unlock(&pa->pa_lock);
3678                         continue;
3679                 }
3680                 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3681
3682                 /* adjust start or end to be adjacent to this pa */
3683                 if (pa_end <= ac->ac_o_ex.fe_logical) {
3684                         BUG_ON(pa_end < start);
3685                         start = pa_end;
3686                 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3687                         BUG_ON(pa->pa_lstart > end);
3688                         end = pa->pa_lstart;
3689                 }
3690                 spin_unlock(&pa->pa_lock);
3691         }
3692         rcu_read_unlock();
3693         size = end - start;
3694
3695         /* XXX: extra loop to check we really don't overlap preallocations */
3696         rcu_read_lock();
3697         list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3698                 loff_t pa_end;
3699
3700                 spin_lock(&pa->pa_lock);
3701                 if (pa->pa_deleted == 0) {
3702                         pa_end = pa_logical_end(EXT4_SB(ac->ac_sb), pa);
3703                         BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3704                 }
3705                 spin_unlock(&pa->pa_lock);
3706         }
3707         rcu_read_unlock();
3708
3709         if (start + size <= ac->ac_o_ex.fe_logical &&
3710                         start > ac->ac_o_ex.fe_logical) {
3711                 ext4_msg(ac->ac_sb, KERN_ERR,
3712                          "start %lu, size %lu, fe_logical %lu",
3713                          (unsigned long) start, (unsigned long) size,
3714                          (unsigned long) ac->ac_o_ex.fe_logical);
3715                 BUG();
3716         }
3717         BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3718
3719         /* now prepare goal request */
3720
3721         /* XXX: is it better to align blocks with respect to logical
3722          * placement or to satisfy a big request as is */
3723         ac->ac_g_ex.fe_logical = start;
3724         ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
3725
3726         /* define goal start in order to merge */
3727         if (ar->pright && (ar->lright == (start + size)) &&
3728             ar->pright >= size &&
3729             ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
3730                 /* merge to the right */
3731                 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3732                                                 &ac->ac_g_ex.fe_group,
3733                                                 &ac->ac_g_ex.fe_start);
3734                 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3735         }
3736         if (ar->pleft && (ar->lleft + 1 == start) &&
3737             ar->pleft + 1 < ext4_blocks_count(es)) {
3738                 /* merge to the left */
3739                 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3740                                                 &ac->ac_g_ex.fe_group,
3741                                                 &ac->ac_g_ex.fe_start);
3742                 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3743         }
3744
3745         mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
3746                  orig_size, start);
3747 }
3748
3749 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3750 {
3751         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3752
3753         if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
3754                 atomic_inc(&sbi->s_bal_reqs);
3755                 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
3756                 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
3757                         atomic_inc(&sbi->s_bal_success);
3758                 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3759                 atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
3760                 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3761                                 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3762                         atomic_inc(&sbi->s_bal_goals);
3763                 if (ac->ac_found > sbi->s_mb_max_to_scan)
3764                         atomic_inc(&sbi->s_bal_breaks);
3765         }
3766
3767         if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3768                 trace_ext4_mballoc_alloc(ac);
3769         else
3770                 trace_ext4_mballoc_prealloc(ac);
3771 }
3772
3773 /*
3774  * Called on failure; free up any blocks from the inode PA for this
3775  * context.  We don't need this for MB_GROUP_PA because we only change
3776  * pa_free in ext4_mb_release_context(), but on failure, we've already
3777  * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3778  */
3779 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3780 {
3781         struct ext4_prealloc_space *pa = ac->ac_pa;
3782         struct ext4_buddy e4b;
3783         int err;
3784
3785         if (pa == NULL) {
3786                 if (ac->ac_f_ex.fe_len == 0)
3787                         return;
3788                 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
3789                 if (err) {
3790                         /*
3791                          * This should never happen since we pin the
3792                          * pages in the ext4_allocation_context so
3793                          * ext4_mb_load_buddy() should never fail.
3794                          */
3795                         WARN(1, "mb_load_buddy failed (%d)", err);
3796                         return;
3797                 }
3798                 ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3799                 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
3800                                ac->ac_f_ex.fe_len);
3801                 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3802                 ext4_mb_unload_buddy(&e4b);
3803                 return;
3804         }
3805         if (pa->pa_type == MB_INODE_PA)
3806                 pa->pa_free += ac->ac_b_ex.fe_len;
3807 }
3808
3809 /*
3810  * use blocks preallocated to inode
3811  */
3812 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3813                                 struct ext4_prealloc_space *pa)
3814 {
3815         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3816         ext4_fsblk_t start;
3817         ext4_fsblk_t end;
3818         int len;
3819
3820         /* found preallocated blocks, use them */
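        /*
         * Map the requested logical offset onto the PA's physical range,
         * then take at most the requested length, clamped to what remains
         * of the PA from that point on.
         */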
3821         start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3822         end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
3823                   start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
3824         len = EXT4_NUM_B2C(sbi, end - start);
3825         ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3826                                         &ac->ac_b_ex.fe_start);
3827         ac->ac_b_ex.fe_len = len;
3828         ac->ac_status = AC_STATUS_FOUND;
3829         ac->ac_pa = pa;
3830
3831         BUG_ON(start < pa->pa_pstart);
3832         BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
3833         BUG_ON(pa->pa_free < len);
3834         BUG_ON(ac->ac_b_ex.fe_len <= 0);
3835         pa->pa_free -= len;
3836
3837         mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
3838 }
3839
3840 /*
3841  * use blocks preallocated to locality group
3842  */
3843 static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3844                                 struct ext4_prealloc_space *pa)
3845 {
3846         unsigned int len = ac->ac_o_ex.fe_len;
3847
3848         ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3849                                         &ac->ac_b_ex.fe_group,
3850                                         &ac->ac_b_ex.fe_start);
3851         ac->ac_b_ex.fe_len = len;
3852         ac->ac_status = AC_STATUS_FOUND;
3853         ac->ac_pa = pa;
3854
3855         /* we don't correct pa_pstart or pa_len here to avoid a
3856          * possible race when the group is being loaded concurrently;
3857          * instead we correct the pa later, after blocks are marked
3858          * in the on-disk bitmap -- see ext4_mb_release_context().
3859          * Other CPUs are prevented from allocating from this pa by lg_mutex.
3860          */
3861         mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
3862                  pa->pa_lstart-len, len, pa);
3863 }
3864
3865 /*
3866  * Return the prealloc space that has the minimal distance
3867  * from the goal block. @cpa is the prealloc
3868  * space with the currently known minimal distance
3869  * from the goal block.
3870  */
3871 static struct ext4_prealloc_space *
3872 ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3873                         struct ext4_prealloc_space *pa,
3874                         struct ext4_prealloc_space *cpa)
3875 {
3876         ext4_fsblk_t cur_distance, new_distance;
3877
3878         if (cpa == NULL) {
3879                 atomic_inc(&pa->pa_count);
3880                 return pa;
3881         }
3882         cur_distance = abs(goal_block - cpa->pa_pstart);
3883         new_distance = abs(goal_block - pa->pa_pstart);
3884
3885         if (cur_distance <= new_distance)
3886                 return cpa;
3887
3888         /* drop the previous reference */
3889         atomic_dec(&cpa->pa_count);
3890         atomic_inc(&pa->pa_count);
3891         return pa;
3892 }
3893
3894 /*
3895  * search goal blocks in preallocated space
3896  */
3897 static noinline_for_stack bool
3898 ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3899 {
3900         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3901         int order, i;
3902         struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3903         struct ext4_locality_group *lg;
3904         struct ext4_prealloc_space *pa, *cpa = NULL;
3905         ext4_fsblk_t goal_block;
3906
3907         /* only data can be preallocated */
3908         if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3909                 return false;
3910
3911         /* first, try per-file preallocation */
3912         rcu_read_lock();
3913         list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3914
3915                 /* none of the fields in this condition change,
3916                  * so we can skip locking for them */
3917                 if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3918                     ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, pa))
3919                         continue;
3920
3921                 /* non-extent files can't have physical blocks past 2^32 */
3922                 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3923                     (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
3924                      EXT4_MAX_BLOCK_FILE_PHYS))
3925                         continue;
3926
3927                 /* found preallocated blocks, use them */
3928                 spin_lock(&pa->pa_lock);
3929                 if (pa->pa_deleted == 0 && pa->pa_free) {
3930                         atomic_inc(&pa->pa_count);
3931                         ext4_mb_use_inode_pa(ac, pa);
3932                         spin_unlock(&pa->pa_lock);
3933                         ac->ac_criteria = 10;
3934                         rcu_read_unlock();
3935                         return true;
3936                 }
3937                 spin_unlock(&pa->pa_lock);
3938         }
3939         rcu_read_unlock();
3940
3941         /* can we use group allocation? */
3942         if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3943                 return false;
3944
3945         /* inode may have no locality group for some reason */
3946         lg = ac->ac_lg;
3947         if (lg == NULL)
3948                 return false;
3949         order  = fls(ac->ac_o_ex.fe_len) - 1;
3950         if (order > PREALLOC_TB_SIZE - 1)
3951                 /* The max size of hash table is PREALLOC_TB_SIZE */
3952                 order = PREALLOC_TB_SIZE - 1;
3953
3954         goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3955         /*
3956          * search for the prealloc space that has the
3957          * minimal distance from the goal block.
3958          */
3959         for (i = order; i < PREALLOC_TB_SIZE; i++) {
3960                 rcu_read_lock();
3961                 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
3962                                         pa_inode_list) {
3963                         spin_lock(&pa->pa_lock);
3964                         if (pa->pa_deleted == 0 &&
3965                                         pa->pa_free >= ac->ac_o_ex.fe_len) {
3966
3967                                 cpa = ext4_mb_check_group_pa(goal_block,
3968                                                                 pa, cpa);
3969                         }
3970                         spin_unlock(&pa->pa_lock);
3971                 }
3972                 rcu_read_unlock();
3973         }
3974         if (cpa) {
3975                 ext4_mb_use_group_pa(ac, cpa);
3976                 ac->ac_criteria = 20;
3977                 return true;
3978         }
3979         return false;
3980 }
3981
3982 /*
3983  * The function goes through all blocks freed in the group
3984  * but not yet committed and marks them used in the in-core bitmap.
3985  * The buddy must be generated from this bitmap.
3986  * Needs to be called with the ext4 group lock held.
3987  */
3988 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3989                                                 ext4_group_t group)
3990 {
3991         struct rb_node *n;
3992         struct ext4_group_info *grp;
3993         struct ext4_free_data *entry;
3994
3995         grp = ext4_get_group_info(sb, group);
3996         if (!grp)
3997                 return;
3998         n = rb_first(&(grp->bb_free_root));
3999
4000         while (n) {
4001                 entry = rb_entry(n, struct ext4_free_data, efd_node);
4002                 ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
4003                 n = rb_next(n);
4004         }
4005         return;
4006 }
4007
4008 /*
4009  * The function goes through all preallocations in this group and marks them
4010  * used in the in-core bitmap. The buddy must be generated from this bitmap.
4011  * Needs to be called with the ext4 group lock held.
4012  */
4013 static noinline_for_stack
4014 void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
4015                                         ext4_group_t group)
4016 {
4017         struct ext4_group_info *grp = ext4_get_group_info(sb, group);
4018         struct ext4_prealloc_space *pa;
4019         struct list_head *cur;
4020         ext4_group_t groupnr;
4021         ext4_grpblk_t start;
4022         int preallocated = 0;
4023         int len;
4024
4025         if (!grp)
4026                 return;
4027
4028         /* All forms of preallocation discard first load the group,
4029          * so the only competing code is preallocation use.
4030          * We don't need any locking here.
4031          * Note that we do NOT ignore preallocations with pa_deleted set;
4032          * otherwise we could leave used blocks available for
4033          * allocation in the buddy when a concurrent ext4_mb_put_pa()
4034          * is dropping the preallocation.
4035          */
4036         list_for_each(cur, &grp->bb_prealloc_list) {
4037                 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
4038                 spin_lock(&pa->pa_lock);
4039                 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4040                                              &groupnr, &start);
4041                 len = pa->pa_len;
4042                 spin_unlock(&pa->pa_lock);
4043                 if (unlikely(len == 0))
4044                         continue;
4045                 BUG_ON(groupnr != group);
4046                 ext4_set_bits(bitmap, start, len);
4047                 preallocated += len;
4048         }
4049         mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
4050 }
4051
4052 static void ext4_mb_mark_pa_deleted(struct super_block *sb,
4053                                     struct ext4_prealloc_space *pa)
4054 {
4055         struct ext4_inode_info *ei;
4056
4057         if (pa->pa_deleted) {
4058                 ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
4059                              pa->pa_type, pa->pa_pstart, pa->pa_lstart,
4060                              pa->pa_len);
4061                 return;
4062         }
4063
4064         pa->pa_deleted = 1;
4065
4066         if (pa->pa_type == MB_INODE_PA) {
4067                 ei = EXT4_I(pa->pa_inode);
4068                 atomic_dec(&ei->i_prealloc_active);
4069         }
4070 }
4071
4072 static void ext4_mb_pa_callback(struct rcu_head *head)
4073 {
4074         struct ext4_prealloc_space *pa;
4075         pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
4076
4077         BUG_ON(atomic_read(&pa->pa_count));
4078         BUG_ON(pa->pa_deleted == 0);
4079         kmem_cache_free(ext4_pspace_cachep, pa);
4080 }
4081
4082 /*
4083  * drops a reference to preallocated space descriptor
4084  * if this was the last reference and the space is consumed
4085  */
4086 static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
4087                         struct super_block *sb, struct ext4_prealloc_space *pa)
4088 {
4089         ext4_group_t grp;
4090         ext4_fsblk_t grp_blk;
4091
4092         /* in this short window concurrent discard can set pa_deleted */
4093         spin_lock(&pa->pa_lock);
4094         if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
4095                 spin_unlock(&pa->pa_lock);
4096                 return;
4097         }
4098
4099         if (pa->pa_deleted == 1) {
4100                 spin_unlock(&pa->pa_lock);
4101                 return;
4102         }
4103
4104         ext4_mb_mark_pa_deleted(sb, pa);
4105         spin_unlock(&pa->pa_lock);
4106
4107         grp_blk = pa->pa_pstart;
4108         /*
4109          * If doing group-based preallocation, pa_pstart may be in the
4110          * next group when pa is used up
4111          */
4112         if (pa->pa_type == MB_GROUP_PA)
4113                 grp_blk--;
4114
4115         grp = ext4_get_group_number(sb, grp_blk);
4116
4117         /*
4118          * possible race:
4119          *
4120          *  P1 (buddy init)                     P2 (regular allocation)
4121          *                                      find block B in PA
4122          *  copy on-disk bitmap to buddy
4123          *                                      mark B in on-disk bitmap
4124          *                                      drop PA from group
4125          *  mark all PAs in buddy
4126          *
4127          * thus, P1 initializes buddy with B available. to prevent this
4128          * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
4129          * against that pair
4130          */
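             /*
              * the group lock taken here is what provides that serialization:
              * buddy init holds it across the bitmap copy and the PA marking,
              * so the list removal below cannot slip in between them.
              */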
4131         ext4_lock_group(sb, grp);
4132         list_del(&pa->pa_group_list);
4133         ext4_unlock_group(sb, grp);
4134
4135         spin_lock(pa->pa_obj_lock);
4136         list_del_rcu(&pa->pa_inode_list);
4137         spin_unlock(pa->pa_obj_lock);
4138
4139         call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4140 }
4141
4142 /*
4143  * creates new preallocated space for given inode
4144  */
4145 static noinline_for_stack void
4146 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
4147 {
4148         struct super_block *sb = ac->ac_sb;
4149         struct ext4_sb_info *sbi = EXT4_SB(sb);
4150         struct ext4_prealloc_space *pa;
4151         struct ext4_group_info *grp;
4152         struct ext4_inode_info *ei;
4153
4154         /* preallocate only when the found space is larger than requested */
4155         BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
4156         BUG_ON(ac->ac_status != AC_STATUS_FOUND);
4157         BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
4158         BUG_ON(ac->ac_pa == NULL);
4159
4160         pa = ac->ac_pa;
4161
4162         if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
4163                 struct ext4_free_extent ex = {
4164                         .fe_logical = ac->ac_g_ex.fe_logical,
4165                         .fe_len = ac->ac_g_ex.fe_len,
4166                 };
4167                 loff_t orig_goal_end = extent_logical_end(sbi, &ex);
4168                 loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex);
4169
4170                 /*
4171                  * We can't allocate as much as normalizer wants, so we try
4172                  * to get proper lstart to cover the original request, except
4173                  * when the goal doesn't cover the original request as below:
4174                  *
4175                  * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048
4176                  * best_ex:0/200(200) -> adjusted: 1848/2048(200)
4177                  */
4178                 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
4179                 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
4180
4181                 /*
4182                  * Use the below logic for adjusting best extent as it keeps
4183                  * fragmentation in check while ensuring logical range of best
4184                  * extent doesn't overflow out of goal extent:
4185                  *
4186                  * 1. Check if best ex can be kept at end of goal and still
4187                  *    cover original start
4188                  * 2. Else, check if best ex can be kept at start of goal and
4189                  *    still cover original end
4190                  * 3. Else, keep the best ex at start of original request.
4191                  */
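                     /*
                      * Tracing the example above (with one cluster per block):
                      * goal is 0..2048 and the best extent is 200 clusters, so
                      * step 1 tries fe_logical = 2048 - 200 = 1848; the original
                      * start 2045 is >= 1848, so the best extent is placed at
                      * 1848..2048 and still covers the original request.
                      */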
4192                 ex.fe_len = ac->ac_b_ex.fe_len;
4193
4194                 ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len);
4195                 if (ac->ac_o_ex.fe_logical >= ex.fe_logical)
4196                         goto adjust_bex;
4197
4198                 ex.fe_logical = ac->ac_g_ex.fe_logical;
4199                 if (o_ex_end <= extent_logical_end(sbi, &ex))
4200                         goto adjust_bex;
4201
4202                 ex.fe_logical = ac->ac_o_ex.fe_logical;
4203 adjust_bex:
4204                 ac->ac_b_ex.fe_logical = ex.fe_logical;
4205
4206                 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
4207                 BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end);
4208         }
4209
4210         /* preallocation can change ac_b_ex, thus we store the actually
4211          * allocated blocks for history */
4212         ac->ac_f_ex = ac->ac_b_ex;
4213
4214         pa->pa_lstart = ac->ac_b_ex.fe_logical;
4215         pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4216         pa->pa_len = ac->ac_b_ex.fe_len;
4217         pa->pa_free = pa->pa_len;
4218         spin_lock_init(&pa->pa_lock);
4219         INIT_LIST_HEAD(&pa->pa_inode_list);
4220         INIT_LIST_HEAD(&pa->pa_group_list);
4221         pa->pa_deleted = 0;
4222         pa->pa_type = MB_INODE_PA;
4223
4224         mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
4225                  pa->pa_len, pa->pa_lstart);
4226         trace_ext4_mb_new_inode_pa(ac, pa);
4227
4228         ext4_mb_use_inode_pa(ac, pa);
4229         atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
4230
4231         ei = EXT4_I(ac->ac_inode);
4232         grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
4233         if (!grp)
4234                 return;
4235
4236         pa->pa_obj_lock = &ei->i_prealloc_lock;
4237         pa->pa_inode = ac->ac_inode;
4238
4239         list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
4240
4241         spin_lock(pa->pa_obj_lock);
4242         list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
4243         spin_unlock(pa->pa_obj_lock);
4244         atomic_inc(&ei->i_prealloc_active);
4245 }
4246
4247 /*
4248  * creates new preallocated space for the locality group this inode belongs to
4249  */
4250 static noinline_for_stack void
4251 ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
4252 {
4253         struct super_block *sb = ac->ac_sb;
4254         struct ext4_locality_group *lg;
4255         struct ext4_prealloc_space *pa;
4256         struct ext4_group_info *grp;
4257
4258         /* preallocate only when the found space is larger than requested */
4259         BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
4260         BUG_ON(ac->ac_status != AC_STATUS_FOUND);
4261         BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
4262         BUG_ON(ac->ac_pa == NULL);
4263
4264         pa = ac->ac_pa;
4265
4266         /* preallocation can change ac_b_ex, thus we store the actually
4267          * allocated blocks for history */
4268         ac->ac_f_ex = ac->ac_b_ex;
4269
4270         pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4271         pa->pa_lstart = pa->pa_pstart;
4272         pa->pa_len = ac->ac_b_ex.fe_len;
4273         pa->pa_free = pa->pa_len;
4274         spin_lock_init(&pa->pa_lock);
4275         INIT_LIST_HEAD(&pa->pa_inode_list);
4276         INIT_LIST_HEAD(&pa->pa_group_list);
4277         pa->pa_deleted = 0;
4278         pa->pa_type = MB_GROUP_PA;
4279
4280         mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
4281                  pa->pa_len, pa->pa_lstart);
4282         trace_ext4_mb_new_group_pa(ac, pa);
4283
4284         ext4_mb_use_group_pa(ac, pa);
4285         atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
4286
4287         grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
4288         if (!grp)
4289                 return;
4290         lg = ac->ac_lg;
4291         BUG_ON(lg == NULL);
4292
4293         pa->pa_obj_lock = &lg->lg_prealloc_lock;
4294         pa->pa_inode = NULL;
4295
4296         list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
4297
4298         /*
4299          * We will later add the new pa to the right bucket
4300          * after updating the pa_free in ext4_mb_release_context
4301          */
4302 }
4303
4304 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
4305 {
4306         if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4307                 ext4_mb_new_group_pa(ac);
4308         else
4309                 ext4_mb_new_inode_pa(ac);
4310 }
4311
4312 /*
4313  * finds all unused blocks in the on-disk bitmap and frees them in the
4314  * in-core bitmap and buddy.
4315  * @pa must be unlinked from inode and group lists, so that
4316  * nobody else can find/use it.
4317  * the caller MUST hold group/inode locks.
4318  * TODO: optimize the case when there are no in-core structures yet
4319  */
4320 static noinline_for_stack int
4321 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
4322                         struct ext4_prealloc_space *pa)
4323 {
4324         struct super_block *sb = e4b->bd_sb;
4325         struct ext4_sb_info *sbi = EXT4_SB(sb);
4326         unsigned int end;
4327         unsigned int next;
4328         ext4_group_t group;
4329         ext4_grpblk_t bit;
4330         unsigned long long grp_blk_start;
4331         int free = 0;
4332
4333         BUG_ON(pa->pa_deleted == 0);
4334         ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
4335         grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
4336         BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
4337         end = bit + pa->pa_len;
4338
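             /*
              * scan the pa's range in the on-disk bitmap: each run of zero
              * bits is space the pa reserved but never allocated on disk,
              * so release it back to the in-core bitmap and buddy.
              */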
4339         while (bit < end) {
4340                 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
4341                 if (bit >= end)
4342                         break;
4343                 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
4344                 mb_debug(sb, "free preallocated %u/%u in group %u\n",
4345                          (unsigned) ext4_group_first_block_no(sb, group) + bit,
4346                          (unsigned) next - bit, (unsigned) group);
4347                 free += next - bit;
4348
4349                 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
4350                 trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
4351                                                     EXT4_C2B(sbi, bit)),
4352                                                next - bit);
4353                 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
4354                 bit = next + 1;
4355         }
4356         if (free != pa->pa_free) {
4357                 ext4_msg(e4b->bd_sb, KERN_CRIT,
4358                          "pa %p: logic %lu, phys. %lu, len %d",
4359                          pa, (unsigned long) pa->pa_lstart,
4360                          (unsigned long) pa->pa_pstart,
4361                          pa->pa_len);
4362                 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
4363                                         free, pa->pa_free);
4364                 /*
4365                  * pa is already deleted so we use the value obtained
4366                  * from the bitmap and continue.
4367                  */
4368         }
4369         atomic_add(free, &sbi->s_mb_discarded);
4370
4371         return 0;
4372 }
4373
4374 static noinline_for_stack int
4375 ext4_mb_release_group_pa(struct ext4_buddy *e4b,
4376                                 struct ext4_prealloc_space *pa)
4377 {
4378         struct super_block *sb = e4b->bd_sb;
4379         ext4_group_t group;
4380         ext4_grpblk_t bit;
4381
4382         trace_ext4_mb_release_group_pa(sb, pa);
4383         BUG_ON(pa->pa_deleted == 0);
4384         ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
4385         if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
4386                 ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
4387                              e4b->bd_group, group, pa->pa_pstart);
4388                 return 0;
4389         }
4390         mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
4391         atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
4392         trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
4393
4394         return 0;
4395 }
4396
4397 /*
4398  * releases all preallocations in the given group
4399  *
4400  * first, we need to decide discard policy:
4401  * - when do we discard
4402  *   1) ENOSPC
4403  * - how many do we discard
4404  *   1) how many requested
4405  */
4406 static noinline_for_stack int
4407 ext4_mb_discard_group_preallocations(struct super_block *sb,
4408                                      ext4_group_t group, int *busy)
4409 {
4410         struct ext4_group_info *grp = ext4_get_group_info(sb, group);
4411         struct buffer_head *bitmap_bh = NULL;
4412         struct ext4_prealloc_space *pa, *tmp;
4413         struct list_head list;
4414         struct ext4_buddy e4b;
4415         int err;
4416         int free = 0;
4417
4418         if (!grp)
4419                 return 0;
4420         mb_debug(sb, "discard preallocation for group %u\n", group);
4421         if (list_empty(&grp->bb_prealloc_list))
4422                 goto out_dbg;
4423
4424         bitmap_bh = ext4_read_block_bitmap(sb, group);
4425         if (IS_ERR(bitmap_bh)) {
4426                 err = PTR_ERR(bitmap_bh);
4427                 ext4_error_err(sb, -err,
4428                                "Error %d reading block bitmap for %u",
4429                                err, group);
4430                 goto out_dbg;
4431         }
4432
4433         err = ext4_mb_load_buddy(sb, group, &e4b);
4434         if (err) {
4435                 ext4_warning(sb, "Error %d loading buddy information for %u",
4436                              err, group);
4437                 put_bh(bitmap_bh);
4438                 goto out_dbg;
4439         }
4440
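             /*
              * two passes under the group lock: first move every unused,
              * not-yet-deleted pa from bb_prealloc_list onto a private list,
              * then walk that list and release each pa to the bitmap/buddy.
              */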
4441         INIT_LIST_HEAD(&list);
4442         ext4_lock_group(sb, group);
4443         list_for_each_entry_safe(pa, tmp,
4444                                 &grp->bb_prealloc_list, pa_group_list) {
4445                 spin_lock(&pa->pa_lock);
4446                 if (atomic_read(&pa->pa_count)) {
4447                         spin_unlock(&pa->pa_lock);
4448                         *busy = 1;
4449                         continue;
4450                 }
4451                 if (pa->pa_deleted) {
4452                         spin_unlock(&pa->pa_lock);
4453                         continue;
4454                 }
4455
4456                 /* seems this one can be freed ... */
4457                 ext4_mb_mark_pa_deleted(sb, pa);
4458
4459                 if (!free)
4460                         this_cpu_inc(discard_pa_seq);
4461
4462                 /* we can trust pa_free ... */
4463                 free += pa->pa_free;
4464
4465                 spin_unlock(&pa->pa_lock);
4466
4467                 list_del(&pa->pa_group_list);
4468                 list_add(&pa->u.pa_tmp_list, &list);
4469         }
4470
4471         /* now free all selected PAs */
4472         list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
4473
4474                 /* remove from object (inode or locality group) */
4475                 spin_lock(pa->pa_obj_lock);
4476                 list_del_rcu(&pa->pa_inode_list);
4477                 spin_unlock(pa->pa_obj_lock);
4478
4479                 if (pa->pa_type == MB_GROUP_PA)
4480                         ext4_mb_release_group_pa(&e4b, pa);
4481                 else
4482                         ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
4483
4484                 list_del(&pa->u.pa_tmp_list);
4485                 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4486         }
4487
4488         ext4_unlock_group(sb, group);
4489         ext4_mb_unload_buddy(&e4b);
4490         put_bh(bitmap_bh);
4491 out_dbg:
4492         mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
4493                  free, group, grp->bb_free);
4494         return free;
4495 }
4496
4497 /*
4498  * releases all unused preallocated blocks for the given inode
4499  *
4500  * It's important to discard preallocations under i_data_sem.
4501  * We don't want another block to be served from the prealloc
4502  * space while we are discarding the inode prealloc space.
4503  *
4504  * FIXME!! Make sure it is valid at all the call sites
4505  */
4506 void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
4507 {
4508         struct ext4_inode_info *ei = EXT4_I(inode);
4509         struct super_block *sb = inode->i_sb;
4510         struct buffer_head *bitmap_bh = NULL;
4511         struct ext4_prealloc_space *pa, *tmp;
4512         ext4_group_t group = 0;
4513         struct list_head list;
4514         struct ext4_buddy e4b;
4515         int err;
4516
4517         if (!S_ISREG(inode->i_mode)) {
4518                 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
4519                 return;
4520         }
4521
4522         if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
4523                 return;
4524
4525         mb_debug(sb, "discard preallocation for inode %lu\n",
4526                  inode->i_ino);
4527         trace_ext4_discard_preallocations(inode,
4528                         atomic_read(&ei->i_prealloc_active), needed);
4529
4530         INIT_LIST_HEAD(&list);
4531
4532         if (needed == 0)
4533                 needed = UINT_MAX;
4534
4535 repeat:
4536         /* first, collect all pa's in the inode */
4537         spin_lock(&ei->i_prealloc_lock);
4538         while (!list_empty(&ei->i_prealloc_list) && needed) {
4539                 pa = list_entry(ei->i_prealloc_list.prev,
4540                                 struct ext4_prealloc_space, pa_inode_list);
4541                 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
4542                 spin_lock(&pa->pa_lock);
4543                 if (atomic_read(&pa->pa_count)) {
4544                         /* this shouldn't happen often - nobody should
4545                          * use preallocation while we're discarding it */
4546                         spin_unlock(&pa->pa_lock);
4547                         spin_unlock(&ei->i_prealloc_lock);
4548                         ext4_msg(sb, KERN_ERR,
4549                                  "uh-oh! used pa while discarding");
4550                         WARN_ON(1);
4551                         schedule_timeout_uninterruptible(HZ);
4552                         goto repeat;
4553
4554                 }
4555                 if (pa->pa_deleted == 0) {
4556                         ext4_mb_mark_pa_deleted(sb, pa);
4557                         spin_unlock(&pa->pa_lock);
4558                         list_del_rcu(&pa->pa_inode_list);
4559                         list_add(&pa->u.pa_tmp_list, &list);
4560                         needed--;
4561                         continue;
4562                 }
4563
4564                 /* someone is deleting pa right now */
4565                 spin_unlock(&pa->pa_lock);
4566                 spin_unlock(&ei->i_prealloc_lock);
4567
4568                 /* we have to wait here because pa_deleted
4569                  * doesn't mean the pa is already unlinked from
4570                  * the list. since we might be called from
4571                  * ->clear_inode(), the inode will get freed
4572                  * and a concurrent thread which is unlinking the
4573                  * pa from the inode's list may access already
4574                  * freed memory, which is very bad */
4575
4576                 /* XXX: if this happens too often, we can
4577                  * add a flag to force wait only in case
4578                  * of ->clear_inode(), but not in case of
4579                  * regular truncate */
4580                 schedule_timeout_uninterruptible(HZ);
4581                 goto repeat;
4582         }
4583         spin_unlock(&ei->i_prealloc_lock);
4584
4585         list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
4586                 BUG_ON(pa->pa_type != MB_INODE_PA);
4587                 group = ext4_get_group_number(sb, pa->pa_pstart);
4588
4589                 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
4590                                              GFP_NOFS|__GFP_NOFAIL);
4591                 if (err) {
4592                         ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
4593                                        err, group);
4594                         continue;
4595                 }
4596
4597                 bitmap_bh = ext4_read_block_bitmap(sb, group);
4598                 if (IS_ERR(bitmap_bh)) {
4599                         err = PTR_ERR(bitmap_bh);
4600                         ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
4601                                        err, group);
4602                         ext4_mb_unload_buddy(&e4b);
4603                         continue;
4604                 }
4605
4606                 ext4_lock_group(sb, group);
4607                 list_del(&pa->pa_group_list);
4608                 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
4609                 ext4_unlock_group(sb, group);
4610
4611                 ext4_mb_unload_buddy(&e4b);
4612                 put_bh(bitmap_bh);
4613
4614                 list_del(&pa->u.pa_tmp_list);
4615                 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4616         }
4617 }
4618
4619 static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
4620 {
4621         struct ext4_prealloc_space *pa;
4622
4623         BUG_ON(ext4_pspace_cachep == NULL);
4624         pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
4625         if (!pa)
4626                 return -ENOMEM;
4627         atomic_set(&pa->pa_count, 1);
4628         ac->ac_pa = pa;
4629         return 0;
4630 }
4631
4632 static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
4633 {
4634         struct ext4_prealloc_space *pa = ac->ac_pa;
4635
4636         BUG_ON(!pa);
4637         ac->ac_pa = NULL;
4638         WARN_ON(!atomic_dec_and_test(&pa->pa_count));
4639         kmem_cache_free(ext4_pspace_cachep, pa);
4640 }
4641
4642 #ifdef CONFIG_EXT4_DEBUG
4643 static inline void ext4_mb_show_pa(struct super_block *sb)
4644 {
4645         ext4_group_t i, ngroups;
4646
4647         if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
4648                 return;
4649
4650         ngroups = ext4_get_groups_count(sb);
4651         mb_debug(sb, "groups: ");
4652         for (i = 0; i < ngroups; i++) {
4653                 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4654                 struct ext4_prealloc_space *pa;
4655                 ext4_grpblk_t start;
4656                 struct list_head *cur;
4657
4658                 if (!grp)
4659                         continue;
4660                 ext4_lock_group(sb, i);
4661                 list_for_each(cur, &grp->bb_prealloc_list) {
4662                         pa = list_entry(cur, struct ext4_prealloc_space,
4663                                         pa_group_list);
4664                         spin_lock(&pa->pa_lock);
4665                         ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4666                                                      NULL, &start);
4667                         spin_unlock(&pa->pa_lock);
4668                         mb_debug(sb, "PA:%u:%d:%d\n", i, start,
4669                                  pa->pa_len);
4670                 }
4671                 ext4_unlock_group(sb, i);
4672                 mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
4673                          grp->bb_fragments);
4674         }
4675 }
4676
4677 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4678 {
4679         struct super_block *sb = ac->ac_sb;
4680
4681         if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
4682                 return;
4683
4684         mb_debug(sb, "Can't allocate:"
4685                         " Allocation context details:");
4686         mb_debug(sb, "status %u flags 0x%x",
4687                         ac->ac_status, ac->ac_flags);
4688         mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
4689                         "goal %lu/%lu/%lu@%lu, "
4690                         "best %lu/%lu/%lu@%lu cr %d",
4691                         (unsigned long)ac->ac_o_ex.fe_group,
4692                         (unsigned long)ac->ac_o_ex.fe_start,
4693                         (unsigned long)ac->ac_o_ex.fe_len,
4694                         (unsigned long)ac->ac_o_ex.fe_logical,
4695                         (unsigned long)ac->ac_g_ex.fe_group,
4696                         (unsigned long)ac->ac_g_ex.fe_start,
4697                         (unsigned long)ac->ac_g_ex.fe_len,
4698                         (unsigned long)ac->ac_g_ex.fe_logical,
4699                         (unsigned long)ac->ac_b_ex.fe_group,
4700                         (unsigned long)ac->ac_b_ex.fe_start,
4701                         (unsigned long)ac->ac_b_ex.fe_len,
4702                         (unsigned long)ac->ac_b_ex.fe_logical,
4703                         (int)ac->ac_criteria);
4704         mb_debug(sb, "%u found", ac->ac_found);
4705         ext4_mb_show_pa(sb);
4706 }
4707 #else
4708 static inline void ext4_mb_show_pa(struct super_block *sb)
4709 {
4710         return;
4711 }
4712 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4713 {
4714         ext4_mb_show_pa(ac->ac_sb);
4715         return;
4716 }
4717 #endif
4718
4719 /*
4720  * We use locality group preallocation for small files. The size of the
4721  * file is determined by the current size or the resulting size after
4722  * allocation, whichever is larger.
4723  *
4724  * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4725  */
4726 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4727 {
4728         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4729         int bsbits = ac->ac_sb->s_blocksize_bits;
4730         loff_t size, isize;
4731
4732         if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4733                 return;
4734
4735         if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4736                 return;
4737
4738         size = extent_logical_end(sbi, &ac->ac_o_ex);
4739         isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4740                 >> bsbits;
4741
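             /*
              * size is the logical end of this request and isize the current
              * file size rounded up, both counted in blocks; the larger of
              * the two drives the stream-vs-group decision below.
              */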
4742         if ((size == isize) && !ext4_fs_is_busy(sbi) &&
4743             !inode_is_open_for_write(ac->ac_inode)) {
4744                 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4745                 return;
4746         }
4747
4748         if (sbi->s_mb_group_prealloc <= 0) {
4749                 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4750                 return;
4751         }
4752
4753         /* don't use group allocation for large files */
4754         size = max(size, isize);
4755         if (size > sbi->s_mb_stream_request) {
4756                 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4757                 return;
4758         }
4759
4760         BUG_ON(ac->ac_lg != NULL);
4761         /*
4762          * locality group prealloc space is per cpu. The reason for having
4763          * a per-cpu locality group is to reduce the contention between block
4764          * requests from multiple CPUs.
4765          */
4766         ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
4767
4768         /* we're going to use group allocation */
4769         ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
4770
4771         /* serialize all allocations in the group */
4772         mutex_lock(&ac->ac_lg->lg_mutex);
4773 }
4774
4775 static noinline_for_stack int
4776 ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4777                                 struct ext4_allocation_request *ar)
4778 {
4779         struct super_block *sb = ar->inode->i_sb;
4780         struct ext4_sb_info *sbi = EXT4_SB(sb);
4781         struct ext4_super_block *es = sbi->s_es;
4782         ext4_group_t group;
4783         unsigned int len;
4784         ext4_fsblk_t goal;
4785         ext4_grpblk_t block;
4786
4787         /* we can't allocate > group size */
4788         len = ar->len;
4789
4790         /* just a dirty hack to filter too big requests  */
4791         if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
4792                 len = EXT4_CLUSTERS_PER_GROUP(sb);
4793
4794         /* start searching from the goal */
4795         goal = ar->goal;
4796         if (goal < le32_to_cpu(es->s_first_data_block) ||
4797                         goal >= ext4_blocks_count(es))
4798                 goal = le32_to_cpu(es->s_first_data_block);
4799         ext4_get_group_no_and_offset(sb, goal, &group, &block);
4800
4801         /* set up allocation goals */
4802         ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
4803         ac->ac_status = AC_STATUS_CONTINUE;
4804         ac->ac_sb = sb;
4805         ac->ac_inode = ar->inode;
4806         ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
4807         ac->ac_o_ex.fe_group = group;
4808         ac->ac_o_ex.fe_start = block;
4809         ac->ac_o_ex.fe_len = len;
4810         ac->ac_g_ex = ac->ac_o_ex;
4811         ac->ac_flags = ar->flags;
4812
4813         /* we have to define the context: we'll work with a file or a
4814          * locality group. this is a policy decision, actually */
4815         ext4_mb_group_or_file(ac);
4816
4817         mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
4818                         "left: %u/%u, right %u/%u to %swritable\n",
4819                         (unsigned) ar->len, (unsigned) ar->logical,
4820                         (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4821                         (unsigned) ar->lleft, (unsigned) ar->pleft,
4822                         (unsigned) ar->lright, (unsigned) ar->pright,
4823                         inode_is_open_for_write(ar->inode) ? "" : "non-");
4824         return 0;
4825
4826 }
4827
4828 static noinline_for_stack void
4829 ext4_mb_discard_lg_preallocations(struct super_block *sb,
4830                                         struct ext4_locality_group *lg,
4831                                         int order, int total_entries)
4832 {
4833         ext4_group_t group = 0;
4834         struct ext4_buddy e4b;
4835         struct list_head discard_list;
4836         struct ext4_prealloc_space *pa, *tmp;
4837
4838         mb_debug(sb, "discard locality group preallocation\n");
4839
4840         INIT_LIST_HEAD(&discard_list);
4841
4842         spin_lock(&lg->lg_prealloc_lock);
4843         list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
4844                                 pa_inode_list,
4845                                 lockdep_is_held(&lg->lg_prealloc_lock)) {
4846                 spin_lock(&pa->pa_lock);
4847                 if (atomic_read(&pa->pa_count)) {
4848                         /*
4849                          * This is the pa that we just used
4850                          * for block allocation. So don't
4851                          * free it.
4852                          */
4853                         spin_unlock(&pa->pa_lock);
4854                         continue;
4855                 }
4856                 if (pa->pa_deleted) {
4857                         spin_unlock(&pa->pa_lock);
4858                         continue;
4859                 }
4860                 /* only lg prealloc space */
4861                 BUG_ON(pa->pa_type != MB_GROUP_PA);
4862
4863                 /* seems this one can be freed ... */
4864                 ext4_mb_mark_pa_deleted(sb, pa);
4865                 spin_unlock(&pa->pa_lock);
4866
4867                 list_del_rcu(&pa->pa_inode_list);
4868                 list_add(&pa->u.pa_tmp_list, &discard_list);
4869
4870                 total_entries--;
4871                 if (total_entries <= 5) {
4872                         /*
4873                          * we want to keep only 5 entries,
4874                          * allowing it to grow to 8. This
4875                          * makes sure we don't call discard
4876                          * again too soon for this list.
4877                          */
4878                         break;
4879                 }
4880         }
4881         spin_unlock(&lg->lg_prealloc_lock);
4882
4883         list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
4884                 int err;
4885
4886                 group = ext4_get_group_number(sb, pa->pa_pstart);
4887                 err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
4888                                              GFP_NOFS|__GFP_NOFAIL);
4889                 if (err) {
4890                         ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
4891                                        err, group);
4892                         continue;
4893                 }
4894                 ext4_lock_group(sb, group);
4895                 list_del(&pa->pa_group_list);
4896                 ext4_mb_release_group_pa(&e4b, pa);
4897                 ext4_unlock_group(sb, group);
4898
4899                 ext4_mb_unload_buddy(&e4b);
4900                 list_del(&pa->u.pa_tmp_list);
4901                 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4902         }
4903 }
4904
4905 /*
4906  * We have incremented pa_count. So it cannot be freed at this
4907  * point. Also we hold lg_mutex. So no parallel allocation is
4908  * possible from this lg. That means pa_free cannot be updated.
4909  *
4910  * A parallel ext4_mb_discard_group_preallocations is possible,
4911  * which can cause the lg_prealloc_list to be updated.
4912  */
4913
4914 static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4915 {
4916         int order, added = 0, lg_prealloc_count = 1;
4917         struct super_block *sb = ac->ac_sb;
4918         struct ext4_locality_group *lg = ac->ac_lg;
4919         struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
4920
4921         order = fls(pa->pa_free) - 1;
4922         if (order > PREALLOC_TB_SIZE - 1)
4923                 /* The max size of hash table is PREALLOC_TB_SIZE */
4924                 order = PREALLOC_TB_SIZE - 1;
4925         /* Add the prealloc space to lg */
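             /*
              * each order bucket is kept sorted by pa_free in ascending
              * order: insert the new pa just before the first entry that
              * has more free clusters, counting the entries as we go.
              */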
4926         spin_lock(&lg->lg_prealloc_lock);
4927         list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
4928                                 pa_inode_list,
4929                                 lockdep_is_held(&lg->lg_prealloc_lock)) {
4930                 spin_lock(&tmp_pa->pa_lock);
4931                 if (tmp_pa->pa_deleted) {
4932                         spin_unlock(&tmp_pa->pa_lock);
4933                         continue;
4934                 }
4935                 if (!added && pa->pa_free < tmp_pa->pa_free) {
4936                         /* Add to the tail of the previous entry */
4937                         list_add_tail_rcu(&pa->pa_inode_list,
4938                                                 &tmp_pa->pa_inode_list);
4939                         added = 1;
4940                         /*
4941                          * we want to count the total
4942                          * number of entries in the list
4943                          */
4944                 }
4945                 spin_unlock(&tmp_pa->pa_lock);
4946                 lg_prealloc_count++;
4947         }
4948         if (!added)
4949                 list_add_tail_rcu(&pa->pa_inode_list,
4950                                         &lg->lg_prealloc_list[order]);
4951         spin_unlock(&lg->lg_prealloc_lock);
4952
4953         /* Now trim the list to be not more than 8 elements */
4954         if (lg_prealloc_count > 8)
4955                 ext4_mb_discard_lg_preallocations(sb, lg,
4956                                                   order, lg_prealloc_count);
4960 }
4961
4962 /*
4963  * if per-inode prealloc list is too long, trim some PA
4964  */
4965 static void ext4_mb_trim_inode_pa(struct inode *inode)
4966 {
4967         struct ext4_inode_info *ei = EXT4_I(inode);
4968         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4969         int count, delta;
4970
4971         count = atomic_read(&ei->i_prealloc_active);
4972         delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
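             /*
              * allow the list to overshoot s_mb_max_inode_prealloc by about
              * a quarter before trimming it back down to the limit.
              */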
4973         if (count > sbi->s_mb_max_inode_prealloc + delta) {
4974                 count -= sbi->s_mb_max_inode_prealloc;
4975                 ext4_discard_preallocations(inode, count);
4976         }
4977 }
4978
4979 /*
4980  * release all the resources we used in the allocation
4981  */
4982 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4983 {
4984         struct inode *inode = ac->ac_inode;
4985         struct ext4_inode_info *ei = EXT4_I(inode);
4986         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4987         struct ext4_prealloc_space *pa = ac->ac_pa;
4988         if (pa) {
4989                 if (pa->pa_type == MB_GROUP_PA) {
4990                         /* see comment in ext4_mb_use_group_pa() */
4991                         spin_lock(&pa->pa_lock);
4992                         pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4993                         pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4994                         pa->pa_free -= ac->ac_b_ex.fe_len;
4995                         pa->pa_len -= ac->ac_b_ex.fe_len;
4996                         spin_unlock(&pa->pa_lock);
4997
4998                         /*
4999                          * We want to add the pa to the right bucket.
5000                          * Remove it from the list and, while adding it
5001                          * back, make sure the list to which we are adding
5002                          * doesn't grow too big.
5003                          */
5004                         if (likely(pa->pa_free)) {
5005                                 spin_lock(pa->pa_obj_lock);
5006                                 list_del_rcu(&pa->pa_inode_list);
5007                                 spin_unlock(pa->pa_obj_lock);
5008                                 ext4_mb_add_n_trim(ac);
5009                         }
5010                 }
5011
5012                 if (pa->pa_type == MB_INODE_PA) {
5013                         /*
5014                          * treat the per-inode prealloc list as an LRU list, then try
5015                          * to trim the least recently used PA.
5016                          */
5017                         spin_lock(pa->pa_obj_lock);
5018                         list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
5019                         spin_unlock(pa->pa_obj_lock);
5020                 }
5021
5022                 ext4_mb_put_pa(ac, ac->ac_sb, pa);
5023         }
5024         if (ac->ac_bitmap_page)
5025                 put_page(ac->ac_bitmap_page);
5026         if (ac->ac_buddy_page)
5027                 put_page(ac->ac_buddy_page);
5028         if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
5029                 mutex_unlock(&ac->ac_lg->lg_mutex);
5030         ext4_mb_collect_stats(ac);
5031         ext4_mb_trim_inode_pa(inode);
5032         return 0;
5033 }
5034
5035 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
5036 {
5037         ext4_group_t i, ngroups = ext4_get_groups_count(sb);
5038         int ret;
5039         int freed = 0, busy = 0;
5040         int retry = 0;
5041
5042         trace_ext4_mb_discard_preallocations(sb, needed);
5043
5044         if (needed == 0)
5045                 needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
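             /*
              * sweep every group until enough clusters have been freed; if
              * some pa's were busy (still referenced), retry the whole sweep,
              * up to three passes in total.
              */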
5046  repeat:
5047         for (i = 0; i < ngroups && needed > 0; i++) {
5048                 ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
5049                 freed += ret;
5050                 needed -= ret;
5051                 cond_resched();
5052         }
5053
5054         if (needed > 0 && busy && ++retry < 3) {
5055                 busy = 0;
5056                 goto repeat;
5057         }
5058
5059         return freed;
5060 }
5061
5062 static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
5063                         struct ext4_allocation_context *ac, u64 *seq)
5064 {
5065         int freed;
5066         u64 seq_retry = 0;
5067         bool ret = false;
5068
5069         freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
5070         if (freed) {
5071                 ret = true;
5072                 goto out_dbg;
5073         }
5074         seq_retry = ext4_get_discard_pa_seq_sum();
5075         if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) {
5076                 ac->ac_flags |= EXT4_MB_STRICT_CHECK;
5077                 *seq = seq_retry;
5078                 ret = true;
5079         }
5080
5081 out_dbg:
5082         mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
5083         return ret;
5084 }
5085
5086 static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
5087                                 struct ext4_allocation_request *ar, int *errp);
5088
5089 /*
5090  * Main entry point into mballoc to allocate blocks
5091  * it tries to use preallocation first, then falls back
5092  * to usual allocation
5093  */
5094 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
5095                                 struct ext4_allocation_request *ar, int *errp)
5096 {
5097         struct ext4_allocation_context *ac = NULL;
5098         struct ext4_sb_info *sbi;
5099         struct super_block *sb;
5100         ext4_fsblk_t block = 0;
5101         unsigned int inquota = 0;
5102         unsigned int reserv_clstrs = 0;
5103         int retries = 0;
5104         u64 seq;
5105
5106         might_sleep();
5107         sb = ar->inode->i_sb;
5108         sbi = EXT4_SB(sb);
5109
5110         trace_ext4_request_blocks(ar);
5111         if (sbi->s_mount_state & EXT4_FC_REPLAY)
5112                 return ext4_mb_new_blocks_simple(handle, ar, errp);
5113
5114         /* Allow to use superuser reservation for quota file */
5115         if (ext4_is_quota_file(ar->inode))
5116                 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
5117
5118         if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
5119                 /* Without delayed allocation we need to verify
5120                  * there are enough free blocks to do block allocation
5121                  * and verify the allocation doesn't exceed the quota limits.
5122                  */
5123                 while (ar->len &&
5124                         ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
5125
5126                         /* let others free the space */
5127                         cond_resched();
5128                         ar->len = ar->len >> 1;
5129                 }
5130                 if (!ar->len) {
5131                         ext4_mb_show_pa(sb);
5132                         *errp = -ENOSPC;
5133                         return 0;
5134                 }
5135                 reserv_clstrs = ar->len;
5136                 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
5137                         dquot_alloc_block_nofail(ar->inode,
5138                                                  EXT4_C2B(sbi, ar->len));
5139                 } else {
5140                         while (ar->len &&
5141                                 dquot_alloc_block(ar->inode,
5142                                                   EXT4_C2B(sbi, ar->len))) {
5143
5144                                 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
5145                                 ar->len--;
5146                         }
5147                 }
5148                 inquota = ar->len;
5149                 if (ar->len == 0) {
5150                         *errp = -EDQUOT;
5151                         goto out;
5152                 }
5153         }
5154
5155         ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
5156         if (!ac) {
5157                 ar->len = 0;
5158                 *errp = -ENOMEM;
5159                 goto out;
5160         }
5161
5162         *errp = ext4_mb_initialize_context(ac, ar);
5163         if (*errp) {
5164                 ar->len = 0;
5165                 goto out;
5166         }
5167
5168         ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
5169         seq = this_cpu_read(discard_pa_seq);
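             /*
              * the sequence snapshot above lets the retry path tell whether
              * preallocations were discarded concurrently while this
              * allocation was failing.
              */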
5170         if (!ext4_mb_use_preallocated(ac)) {
5171                 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
5172                 ext4_mb_normalize_request(ac, ar);
5173
5174                 *errp = ext4_mb_pa_alloc(ac);
5175                 if (*errp)
5176                         goto errout;
5177 repeat:
5178                 /* allocate space in core */
5179                 *errp = ext4_mb_regular_allocator(ac);
5180                 /*
5181                  * The pa allocated above is added to grp->bb_prealloc_list only
5182                  * when we were able to allocate some blocks, i.e. when
5183                  * ac->ac_status == AC_STATUS_FOUND.
5184                  * An error from above means ac->ac_status != AC_STATUS_FOUND,
5185                  * so we have to free this pa here ourselves.
5186                  */
5187                 if (*errp) {
5188                         ext4_mb_pa_free(ac);
5189                         ext4_discard_allocated_blocks(ac);
5190                         goto errout;
5191                 }
5192                 if (ac->ac_status == AC_STATUS_FOUND &&
5193                         ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
5194                         ext4_mb_pa_free(ac);
5195         }
5196         if (likely(ac->ac_status == AC_STATUS_FOUND)) {
5197                 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
5198                 if (*errp) {
5199                         ext4_discard_allocated_blocks(ac);
5200                         goto errout;
5201                 } else {
5202                         block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
5203                         ar->len = ac->ac_b_ex.fe_len;
5204                 }
5205         } else {
5206                 if (++retries < 3 &&
5207                     ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
5208                         goto repeat;
5209                 /*
5210                  * If block allocation fails then the pa allocated above
5211                  * needs to be freed right here.
5212                  */
5213                 ext4_mb_pa_free(ac);
5214                 *errp = -ENOSPC;
5215         }
5216
5217 errout:
5218         if (*errp) {
5219                 ac->ac_b_ex.fe_len = 0;
5220                 ar->len = 0;
5221                 ext4_mb_show_ac(ac);
5222         }
5223         ext4_mb_release_context(ac);
5224 out:
5225         if (ac)
5226                 kmem_cache_free(ext4_ac_cachep, ac);
5227         if (inquota && ar->len < inquota)
5228                 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
5229         if (!ar->len) {
5230                 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
5231                         /* release all the reserved blocks if non delalloc */
5232                         percpu_counter_sub(&sbi->s_dirtyclusters_counter,
5233                                                 reserv_clstrs);
5234         }
5235
5236         trace_ext4_allocate_blocks(ar, (unsigned long long)block);
5237
5238         return block;
5239 }
5240
5241 /*
5242  * We can merge two free data extents only if the physical blocks
5243  * are contiguous, AND the extents were freed by the same transaction,
5244  * AND the blocks are associated with the same group.
5245  */
5246 static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
5247                                         struct ext4_free_data *entry,
5248                                         struct ext4_free_data *new_entry,
5249                                         struct rb_root *entry_rb_root)
5250 {
5251         if ((entry->efd_tid != new_entry->efd_tid) ||
5252             (entry->efd_group != new_entry->efd_group))
5253                 return;
5254         if (entry->efd_start_cluster + entry->efd_count ==
5255             new_entry->efd_start_cluster) {
5256                 new_entry->efd_start_cluster = entry->efd_start_cluster;
5257                 new_entry->efd_count += entry->efd_count;
5258         } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
5259                    entry->efd_start_cluster) {
5260                 new_entry->efd_count += entry->efd_count;
5261         } else
5262                 return;
5263         spin_lock(&sbi->s_md_lock);
5264         list_del(&entry->efd_list);
5265         spin_unlock(&sbi->s_md_lock);
5266         rb_erase(&entry->efd_node, entry_rb_root);
5267         kmem_cache_free(ext4_free_data_cachep, entry);
5268 }
5269
5270 static noinline_for_stack int
5271 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
5272                       struct ext4_free_data *new_entry)
5273 {
5274         ext4_group_t group = e4b->bd_group;
5275         ext4_grpblk_t cluster;
5276         ext4_grpblk_t clusters = new_entry->efd_count;
5277         struct ext4_free_data *entry;
5278         struct ext4_group_info *db = e4b->bd_info;
5279         struct super_block *sb = e4b->bd_sb;
5280         struct ext4_sb_info *sbi = EXT4_SB(sb);
5281         struct rb_node **n = &db->bb_free_root.rb_node, *node;
5282         struct rb_node *parent = NULL, *new_node;
5283
5284         BUG_ON(!ext4_handle_valid(handle));
5285         BUG_ON(e4b->bd_bitmap_page == NULL);
5286         BUG_ON(e4b->bd_buddy_page == NULL);
5287
5288         new_node = &new_entry->efd_node;
5289         cluster = new_entry->efd_start_cluster;
5290
5291         if (!*n) {
5292                 /* first free block extent. We need to
5293                  * protect the buddy cache from being freed,
5294                  * otherwise we'll refresh it from the
5295                  * on-disk bitmap and lose not-yet-available
5296                  * blocks */
5297                 get_page(e4b->bd_buddy_page);
5298                 get_page(e4b->bd_bitmap_page);
5299         }
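             /*
              * standard rb-tree descent keyed by start cluster; overlapping
              * an existing entry means the block is already queued for
              * freeing, which is reported as filesystem corruption.
              */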
5300         while (*n) {
5301                 parent = *n;
5302                 entry = rb_entry(parent, struct ext4_free_data, efd_node);
5303                 if (cluster < entry->efd_start_cluster)
5304                         n = &(*n)->rb_left;
5305                 else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
5306                         n = &(*n)->rb_right;
5307                 else {
5308                         ext4_grp_locked_error(sb, group, 0,
5309                                 ext4_group_first_block_no(sb, group) +
5310                                 EXT4_C2B(sbi, cluster),
5311                                 "Block already on to-be-freed list");
5312                         kmem_cache_free(ext4_free_data_cachep, new_entry);
5313                         return 0;
5314                 }
5315         }
5316
5317         rb_link_node(new_node, parent, n);
5318         rb_insert_color(new_node, &db->bb_free_root);
5319
5320         /* Now try to see if the extent can be merged to the left and right */
5321         node = rb_prev(new_node);
5322         if (node) {
5323                 entry = rb_entry(node, struct ext4_free_data, efd_node);
5324                 ext4_try_merge_freed_extent(sbi, entry, new_entry,
5325                                             &(db->bb_free_root));
5326         }
5327
5328         node = rb_next(new_node);
5329         if (node) {
5330                 entry = rb_entry(node, struct ext4_free_data, efd_node);
5331                 ext4_try_merge_freed_extent(sbi, entry, new_entry,
5332                                             &(db->bb_free_root));
5333         }
5334
5335         spin_lock(&sbi->s_md_lock);
5336         list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
5337         sbi->s_mb_free_pending += clusters;
5338         spin_unlock(&sbi->s_md_lock);
5339         return 0;
5340 }
5341
5342 /*
5343  * Simple allocator for Ext4 fast commit replay path. It searches for blocks
5344  * linearly starting at the goal block and also excludes the blocks which
5345  * are going to be in use after fast commit replay.
5346  */
5347 static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
5348                                 struct ext4_allocation_request *ar, int *errp)
5349 {
5350         struct buffer_head *bitmap_bh;
5351         struct super_block *sb = ar->inode->i_sb;
5352         ext4_group_t group;
5353         ext4_grpblk_t blkoff;
5354         ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
5355         ext4_grpblk_t i = 0;
5356         ext4_fsblk_t goal, block;
5357         struct ext4_super_block *es = EXT4_SB(sb)->s_es;
5358
5359         goal = ar->goal;
5360         if (goal < le32_to_cpu(es->s_first_data_block) ||
5361                         goal >= ext4_blocks_count(es))
5362                 goal = le32_to_cpu(es->s_first_data_block);
5363
5364         ar->len = 0;
5365         ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
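             /*
              * linear scan: starting from the goal's group, pick the first
              * free bit in each block bitmap that is not excluded by fast
              * commit replay, and stop at the first usable block found.
              */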
5366         for (; group < ext4_get_groups_count(sb); group++) {
5367                 bitmap_bh = ext4_read_block_bitmap(sb, group);
5368                 if (IS_ERR(bitmap_bh)) {
5369                         *errp = PTR_ERR(bitmap_bh);
5370                         pr_warn("Failed to read block bitmap\n");
5371                         return 0;
5372                 }
5373
5374                 ext4_get_group_no_and_offset(sb,
5375                         max(ext4_group_first_block_no(sb, group), goal),
5376                         NULL, &blkoff);
5377                 while (1) {
5378                         i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
5379                                                 blkoff);
5380                         if (i >= max)
5381                                 break;
5382                         if (ext4_fc_replay_check_excluded(sb,
5383                                 ext4_group_first_block_no(sb, group) + i)) {
5384                                 blkoff = i + 1;
5385                         } else
5386                                 break;
5387                 }
5388                 brelse(bitmap_bh);
5389                 if (i < max)
5390                         break;
5391         }
5392
5393         if (group >= ext4_get_groups_count(sb) || i >= max) {
5394                 *errp = -ENOSPC;
5395                 return 0;
5396         }
5397
5398         block = ext4_group_first_block_no(sb, group) + i;
5399         ext4_mb_mark_bb(sb, block, 1, 1);
5400         ar->len = 1;
5401
5402         return block;
5403 }
5404
5405 static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
5406                                         unsigned long count)
5407 {
5408         struct buffer_head *bitmap_bh;
5409         struct super_block *sb = inode->i_sb;
5410         struct ext4_group_desc *gdp;
5411         struct buffer_head *gdp_bh;
5412         ext4_group_t group;
5413         ext4_grpblk_t blkoff;
5414         int already_freed = 0, err, i;
5415
5416         ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
5417         bitmap_bh = ext4_read_block_bitmap(sb, group);
5418         if (IS_ERR(bitmap_bh)) {
5419                 err = PTR_ERR(bitmap_bh);
5420                 pr_warn("Failed to read block bitmap\n");
5421                 return;
5422         }
5423         gdp = ext4_get_group_desc(sb, group, &gdp_bh);
5424         if (!gdp) {
                     /* don't leak the bitmap buffer on this error path */
                     brelse(bitmap_bh);
5425                 return;
             }
5426
5427         for (i = 0; i < count; i++) {
5428                 if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
5429                         already_freed++;
5430         }
5431         mb_clear_bits(bitmap_bh->b_data, blkoff, count);
5432         err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
5433         if (err) {
                     /* don't leak the bitmap buffer on this error path */
                     brelse(bitmap_bh);
5434                 return;
             }
5435         ext4_free_group_clusters_set(
5436                 sb, gdp, ext4_free_group_clusters(sb, gdp) +
5437                 count - already_freed);
5438         ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
5439         ext4_group_desc_csum_set(sb, group, gdp);
5440         ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
5441         sync_dirty_buffer(bitmap_bh);
5442         sync_dirty_buffer(gdp_bh);
5443         brelse(bitmap_bh);
5444 }
5445
5446 /**
5447  * ext4_mb_clear_bb() -- helper function for freeing blocks.
5448  *                      Used by ext4_free_blocks()
5449  * @handle:             handle for this transaction
5450  * @inode:              inode
5452  * @block:              starting physical block to be freed
5453  * @count:              number of blocks to be freed
5454  * @flags:              flags used by ext4_free_blocks
5455  */
5456 static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
5457                                ext4_fsblk_t block, unsigned long count,
5458                                int flags)
5459 {
5460         struct buffer_head *bitmap_bh = NULL;
5461         struct super_block *sb = inode->i_sb;
5462         struct ext4_group_desc *gdp;
5463         struct ext4_group_info *grp;
5464         unsigned int overflow;
5465         ext4_grpblk_t bit;
5466         struct buffer_head *gd_bh;
5467         ext4_group_t block_group;
5468         struct ext4_sb_info *sbi;
5469         struct ext4_buddy e4b;
5470         unsigned int count_clusters;
5471         int err = 0;
5472         int ret;
5473
5474         sbi = EXT4_SB(sb);
5475
5476         if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
5477             !ext4_inode_block_valid(inode, block, count)) {
5478                 ext4_error(sb, "Freeing blocks in system zone - "
5479                            "Block = %llu, count = %lu", block, count);
5480                 /* err = 0. ext4_std_error should be a no op */
5481                 goto error_return;
5482         }
5483         flags |= EXT4_FREE_BLOCKS_VALIDATED;
5484
5485 do_more:
5486         overflow = 0;
5487         ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
5488
5489         grp = ext4_get_group_info(sb, block_group);
5490         if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
5491                 return;
5492
5493         /*
5494          * Check to see if we are freeing blocks across a group
5495          * boundary.
5496          */
5497         if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
5498                 overflow = EXT4_C2B(sbi, bit) + count -
5499                         EXT4_BLOCKS_PER_GROUP(sb);
5500                 count -= overflow;
5501                 /* The range changed so it's no longer validated */
5502                 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
5503         }
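             /*
              * Any leftover blocks kept in "overflow" are freed by looping
              * back to do_more once this group has been processed.
              */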
5504         count_clusters = EXT4_NUM_B2C(sbi, count);
5505         bitmap_bh = ext4_read_block_bitmap(sb, block_group);
5506         if (IS_ERR(bitmap_bh)) {
5507                 err = PTR_ERR(bitmap_bh);
5508                 bitmap_bh = NULL;
5509                 goto error_return;
5510         }
5511         gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
5512         if (!gdp) {
5513                 err = -EIO;
5514                 goto error_return;
5515         }
5516
5517         if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
5518             !ext4_inode_block_valid(inode, block, count)) {
5519                 ext4_error(sb, "Freeing blocks in system zone - "
5520                            "Block = %llu, count = %lu", block, count);
5521                 /* err = 0. ext4_std_error should be a no op */
5522                 goto error_return;
5523         }
5524
5525         BUFFER_TRACE(bitmap_bh, "getting write access");
5526         err = ext4_journal_get_write_access(handle, bitmap_bh);
5527         if (err)
5528                 goto error_return;
5529
5530         /*
5531          * We are about to modify some metadata.  Call the journal APIs
5532          * to unshare ->b_data if a currently-committing transaction is
5533          * using it
5534          */
5535         BUFFER_TRACE(gd_bh, "get_write_access");
5536         err = ext4_journal_get_write_access(handle, gd_bh);
5537         if (err)
5538                 goto error_return;
5539 #ifdef AGGRESSIVE_CHECK
5540         {
5541                 int i;
5542                 for (i = 0; i < count_clusters; i++)
5543                         BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
5544         }
5545 #endif
5546         trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
5547
5548         /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
5549         err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
5550                                      GFP_NOFS|__GFP_NOFAIL);
5551         if (err)
5552                 goto error_return;
5553
5554         /*
5555          * We need to make sure we don't reuse the freed block until after the
5556          * transaction is committed. We make an exception if the inode is to be
5557          * written in writeback mode since writeback mode has weak data
5558          * consistency guarantees.
5559          */
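             /*
              * The deferred path below queues the extent on the freed-data
              * list via ext4_mb_free_metadata(); the blocks are returned to
              * the buddy only once the committing transaction reaches disk.
              * The else branch frees them in the buddy immediately,
              * optionally discarding them first.
              */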
5560         if (ext4_handle_valid(handle) &&
5561             ((flags & EXT4_FREE_BLOCKS_METADATA) ||
5562              !ext4_should_writeback_data(inode))) {
5563                 struct ext4_free_data *new_entry;
5564                 /*
5565                  * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
5566                  * to fail.
5567                  */
5568                 new_entry = kmem_cache_alloc(ext4_free_data_cachep,
5569                                 GFP_NOFS|__GFP_NOFAIL);
5570                 new_entry->efd_start_cluster = bit;
5571                 new_entry->efd_group = block_group;
5572                 new_entry->efd_count = count_clusters;
5573                 new_entry->efd_tid = handle->h_transaction->t_tid;
5574
5575                 ext4_lock_group(sb, block_group);
5576                 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
5577                 ext4_mb_free_metadata(handle, &e4b, new_entry);
5578         } else {
5579                 /* need to update group_info->bb_free and the bitmap
5580                  * with the group lock held; ext4_mb_generate_buddy()
5581                  * looks at them under the same lock.
5582                  */
5583                 if (test_opt(sb, DISCARD)) {
5584                         err = ext4_issue_discard(sb, block_group, bit,
5585                                                  count_clusters, NULL);
5586                         if (err && err != -EOPNOTSUPP)
5587                                 ext4_msg(sb, KERN_WARNING, "discard request in"
5588                                          " group:%u block:%d count:%lu failed"
5589                                          " with %d", block_group, bit, count,
5590                                          err);
5591                 } else
5592                         EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
5593
5594                 ext4_lock_group(sb, block_group);
5595                 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
5596                 mb_free_blocks(inode, &e4b, bit, count_clusters);
5597         }
5598
5599         ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
5600         ext4_free_group_clusters_set(sb, gdp, ret);
5601         ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
5602         ext4_group_desc_csum_set(sb, block_group, gdp);
5603         ext4_unlock_group(sb, block_group);
5604
5605         if (sbi->s_log_groups_per_flex) {
5606                 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
5607                 atomic64_add(count_clusters,
5608                              &sbi_array_rcu_deref(sbi, s_flex_groups,
5609                                                   flex_group)->free_clusters);
5610         }
5611
5612         /*
5613          * on a bigalloc file system, defer the s_freeclusters_counter
5614          * update to the caller (ext4_remove_space and friends) so they
5615          * can determine if a cluster freed here should be rereserved
5616          */
5617         if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
5618                 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
5619                         dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
5620                 percpu_counter_add(&sbi->s_freeclusters_counter,
5621                                    count_clusters);
5622         }
5623
5624         ext4_mb_unload_buddy(&e4b);
5625
5626         /* We dirtied the bitmap block */
5627         BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
5628         err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
5629
5630         /* And the group descriptor block */
5631         BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
5632         ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
5633         if (!err)
5634                 err = ret;
5635
5636         if (overflow && !err) {
5637                 block += count;
5638                 count = overflow;
5639                 put_bh(bitmap_bh);
5640                 /* The range changed so it's no longer validated */
5641                 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
5642                 goto do_more;
5643         }
5644 error_return:
5645         brelse(bitmap_bh);
5646         ext4_std_error(sb, err);
5647         return;
5648 }
5649
5650 /**
5651  * ext4_free_blocks() -- Free given blocks and update quota
5652  * @handle:             handle for this transaction
5653  * @inode:              inode
5654  * @bh:                 optional buffer of the block to be freed
5655  * @block:              starting physical block to be freed
5656  * @count:              number of blocks to be freed
5657  * @flags:              flags used by ext4_free_blocks
5658  */
5659 void ext4_free_blocks(handle_t *handle, struct inode *inode,
5660                       struct buffer_head *bh, ext4_fsblk_t block,
5661                       unsigned long count, int flags)
5662 {
5663         struct super_block *sb = inode->i_sb;
5664         unsigned int overflow;
5665         struct ext4_sb_info *sbi;
5666
5667         sbi = EXT4_SB(sb);
5668
5669         if (bh) {
5670                 if (block)
5671                         BUG_ON(block != bh->b_blocknr);
5672                 else
5673                         block = bh->b_blocknr;
5674         }
5675
5676         if (sbi->s_mount_state & EXT4_FC_REPLAY) {
5677                 ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count));
5678                 return;
5679         }
5680
5681         might_sleep();
5682
5683         if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
5684             !ext4_inode_block_valid(inode, block, count)) {
5685                 ext4_error(sb, "Freeing blocks not in datazone - "
5686                            "block = %llu, count = %lu", block, count);
5687                 return;
5688         }
5689         flags |= EXT4_FREE_BLOCKS_VALIDATED;
5690
5691         ext4_debug("freeing block %llu\n", block);
5692         trace_ext4_free_blocks(inode, block, count, flags);
5693
5694         if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
5695                 BUG_ON(count > 1);
5696
5697                 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
5698                             inode, bh, block);
5699         }
5700
5701         /*
5702          * If the extent to be freed does not begin on a cluster
5703          * boundary, we need to deal with partial clusters at the
5704          * beginning and end of the extent.  Normally we will free
5705          * blocks at the beginning or the end unless we are explicitly
5706          * requested to avoid doing so.
5707          */
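             /*
              * For example (hypothetical geometry, s_cluster_ratio == 16): a
              * request to free blocks 100..130 is widened to the whole
              * clusters covering 96..143; with both NOFREE_FIRST/LAST_CLUSTER
              * flags set, only the fully covered cluster 112..127 is freed.
              */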
5708         overflow = EXT4_PBLK_COFF(sbi, block);
5709         if (overflow) {
5710                 if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
5711                         overflow = sbi->s_cluster_ratio - overflow;
5712                         block += overflow;
5713                         if (count > overflow)
5714                                 count -= overflow;
5715                         else
5716                                 return;
5717                 } else {
5718                         block -= overflow;
5719                         count += overflow;
5720                 }
5721                 /* The range changed so it's no longer validated */
5722                 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
5723         }
5724         overflow = EXT4_LBLK_COFF(sbi, count);
5725         if (overflow) {
5726                 if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
5727                         if (count > overflow)
5728                                 count -= overflow;
5729                         else
5730                                 return;
5731                 } else
5732                         count += sbi->s_cluster_ratio - overflow;
5733                 /* The range changed so it's no longer validated */
5734                 flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
5735         }
5736
5737         if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
5738                 int i;
5739                 int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
5740
5741                 for (i = 0; i < count; i++) {
5742                         cond_resched();
5743                         if (is_metadata)
5744                                 bh = sb_find_get_block(inode->i_sb, block + i);
5745                         ext4_forget(handle, is_metadata, inode, bh, block + i);
5746                 }
5747         }
5748
5749         ext4_mb_clear_bb(handle, inode, block, count, flags);
5750         return;
5751 }
5752
5753 /**
5754  * ext4_group_add_blocks() -- Add given blocks to an existing group
5755  * @handle:                     handle to this transaction
5756  * @sb:                         super block
5757  * @block:                      start physical block to add to the block group
5758  * @count:                      number of blocks to add to the group
5759  *
5760  * This marks the blocks as free in the bitmap and buddy.
5761  */
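     /*
      * Note: this is typically called from the online resize path (e.g. when
      * extending the last group) to publish newly added blocks as free space.
      */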
5762 int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
5763                          ext4_fsblk_t block, unsigned long count)
5764 {
5765         struct buffer_head *bitmap_bh = NULL;
5766         struct buffer_head *gd_bh;
5767         ext4_group_t block_group;
5768         ext4_grpblk_t bit;
5769         unsigned int i;
5770         struct ext4_group_desc *desc;
5771         struct ext4_sb_info *sbi = EXT4_SB(sb);
5772         struct ext4_buddy e4b;
5773         int err = 0, ret, free_clusters_count;
5774         ext4_grpblk_t clusters_freed;
5775         ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
5776         ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
5777         unsigned long cluster_count = last_cluster - first_cluster + 1;
5778
5779         ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
5780
5781         if (count == 0)
5782                 return 0;
5783
5784         ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
5785         /*
5786          * Check to see if we are adding blocks across a group
5787          * boundary.
5788          */
5789         if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) {
5790                 ext4_warning(sb, "too many blocks added to group %u",
5791                              block_group);
5792                 err = -EINVAL;
5793                 goto error_return;
5794         }
5795
5796         bitmap_bh = ext4_read_block_bitmap(sb, block_group);
5797         if (IS_ERR(bitmap_bh)) {
5798                 err = PTR_ERR(bitmap_bh);
5799                 bitmap_bh = NULL;
5800                 goto error_return;
5801         }
5802
5803         desc = ext4_get_group_desc(sb, block_group, &gd_bh);
5804         if (!desc) {
5805                 err = -EIO;
5806                 goto error_return;
5807         }
5808
5809         if (!ext4_sb_block_valid(sb, NULL, block, count)) {
5810                 ext4_error(sb, "Adding blocks in system zones - "
5811                            "Block = %llu, count = %lu",
5812                            block, count);
5813                 err = -EINVAL;
5814                 goto error_return;
5815         }
5816
5817         BUFFER_TRACE(bitmap_bh, "getting write access");
5818         err = ext4_journal_get_write_access(handle, bitmap_bh);
5819         if (err)
5820                 goto error_return;
5821
5822         /*
5823          * We are about to modify some metadata.  Call the journal APIs
5824          * to unshare ->b_data if a currently-committing transaction is
5825          * using it
5826          */
5827         BUFFER_TRACE(gd_bh, "get_write_access");
5828         err = ext4_journal_get_write_access(handle, gd_bh);
5829         if (err)
5830                 goto error_return;
5831
5832         for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
5833                 BUFFER_TRACE(bitmap_bh, "clear bit");
5834                 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
5835                         ext4_error(sb, "bit already cleared for block %llu",
5836                                    (ext4_fsblk_t)(block + i));
5837                         BUFFER_TRACE(bitmap_bh, "bit already cleared");
5838                 } else {
5839                         clusters_freed++;
5840                 }
5841         }
5842
5843         err = ext4_mb_load_buddy(sb, block_group, &e4b);
5844         if (err)
5845                 goto error_return;
5846
5847         /*
5848          * need to update group_info->bb_free and the bitmap
5849          * with the group lock held; ext4_mb_generate_buddy()
5850          * looks at them under the same lock.
5851          */
5852         ext4_lock_group(sb, block_group);
5853         mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
5854         mb_free_blocks(NULL, &e4b, bit, cluster_count);
5855         free_clusters_count = clusters_freed +
5856                 ext4_free_group_clusters(sb, desc);
5857         ext4_free_group_clusters_set(sb, desc, free_clusters_count);
5858         ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
5859         ext4_group_desc_csum_set(sb, block_group, desc);
5860         ext4_unlock_group(sb, block_group);
5861         percpu_counter_add(&sbi->s_freeclusters_counter,
5862                            clusters_freed);
5863
5864         if (sbi->s_log_groups_per_flex) {
5865                 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
5866                 atomic64_add(clusters_freed,
5867                              &sbi_array_rcu_deref(sbi, s_flex_groups,
5868                                                   flex_group)->free_clusters);
5869         }
5870
5871         ext4_mb_unload_buddy(&e4b);
5872
5873         /* We dirtied the bitmap block */
5874         BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
5875         err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
5876
5877         /* And the group descriptor block */
5878         BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
5879         ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
5880         if (!err)
5881                 err = ret;
5882
5883 error_return:
5884         brelse(bitmap_bh);
5885         ext4_std_error(sb, err);
5886         return err;
5887 }
5888
5889 /**
5890  * ext4_trim_extent -- function to TRIM one single free extent in the group
5891  * @sb:         super block for the file system
5892  * @start:      starting block of the free extent in the alloc. group
5893  * @count:      number of blocks to TRIM
5894  * @e4b:        ext4 buddy for the group
5895  *
5896  * Trim "count" blocks starting at "start" in the "group". To assure that no
5897  * one will allocate those blocks, mark them as used in the buddy bitmap.
5898  * This must be called under the group lock.
5899  */
5900 static int ext4_trim_extent(struct super_block *sb,
5901                 int start, int count, struct ext4_buddy *e4b)
5902 __releases(bitlock)
5903 __acquires(bitlock)
5904 {
5905         struct ext4_free_extent ex;
5906         ext4_group_t group = e4b->bd_group;
5907         int ret = 0;
5908
5909         trace_ext4_trim_extent(sb, group, start, count);
5910
5911         assert_spin_locked(ext4_group_lock_ptr(sb, group));
5912
5913         ex.fe_start = start;
5914         ex.fe_group = group;
5915         ex.fe_len = count;
5916
5917         /*
5918          * Mark blocks used, so no one can reuse them while
5919          * being trimmed.
5920          */
5921         mb_mark_used(e4b, &ex);
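             /*
              * Drop the group lock while the (possibly blocking) discard is
              * issued; the extent stays marked as used in the buddy, so it
              * cannot be allocated in the meantime.
              */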
5922         ext4_unlock_group(sb, group);
5923         ret = ext4_issue_discard(sb, group, start, count, NULL);
5924         ext4_lock_group(sb, group);
5925         mb_free_blocks(NULL, e4b, start, ex.fe_len);
5926         return ret;
5927 }
5928
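     /*
      * Return the index of the last cluster in @grp; the last group of the
      * filesystem may contain fewer clusters than EXT4_CLUSTERS_PER_GROUP().
      */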
5929 static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
5930                                            ext4_group_t grp)
5931 {
5932         unsigned long nr_clusters_in_group;
5933
5934         if (grp < (ext4_get_groups_count(sb) - 1))
5935                 nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb);
5936         else
5937                 nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) -
5938                                         ext4_group_first_block_no(sb, grp))
5939                                        >> EXT4_CLUSTER_BITS(sb);
5940
5941         return nr_clusters_in_group - 1;
5942 }
5943
5944 static bool ext4_trim_interrupted(void)
5945 {
5946         return fatal_signal_pending(current) || freezing(current);
5947 }
5948
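     /*
      * Scan clusters [start, max] of the group's buddy bitmap and discard
      * every free extent of at least @minblocks clusters. Called, and
      * returns, with the group lock held; the lock is dropped temporarily
      * while discards are issued or when rescheduling. Returns the number of
      * clusters trimmed.
      */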
5949 static int ext4_try_to_trim_range(struct super_block *sb,
5950                 struct ext4_buddy *e4b, ext4_grpblk_t start,
5951                 ext4_grpblk_t max, ext4_grpblk_t minblocks)
5952 {
5953         ext4_grpblk_t next, count, free_count, last, origin_start;
5954         bool set_trimmed = false;
5955         void *bitmap;
5956
5957         last = ext4_last_grp_cluster(sb, e4b->bd_group);
5958         bitmap = e4b->bd_bitmap;
5959         if (start == 0 && max >= last)
5960                 set_trimmed = true;
5961         origin_start = start;
5962         start = max(e4b->bd_info->bb_first_free, start);
5963         count = 0;
5964         free_count = 0;
5965
5966         while (start <= max) {
5967                 start = mb_find_next_zero_bit(bitmap, max + 1, start);
5968                 if (start > max)
5969                         break;
5970
5971                 next = mb_find_next_bit(bitmap, last + 1, start);
5972                 if (origin_start == 0 && next >= last)
5973                         set_trimmed = true;
5974
5975                 if ((next - start) >= minblocks) {
5976                         int ret = ext4_trim_extent(sb, start, next - start, e4b);
5977
5978                         if (ret && ret != -EOPNOTSUPP)
5979                                 return count;
5980                         count += next - start;
5981                 }
5982                 free_count += next - start;
5983                 start = next + 1;
5984
5985                 if (ext4_trim_interrupted())
5986                         return count;
5987
5988                 if (need_resched()) {
5989                         ext4_unlock_group(sb, e4b->bd_group);
5990                         cond_resched();
5991                         ext4_lock_group(sb, e4b->bd_group);
5992                 }
5993
5994                 if ((e4b->bd_info->bb_free - free_count) < minblocks)
5995                         break;
5996         }
5997
5998         if (set_trimmed)
5999                 EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);
6000
6001         return count;
6002 }
6003
6004 /**
6005  * ext4_trim_all_free -- function to trim all free space in alloc. group
6006  * @sb:                 super block for file system
6007  * @group:              group to be trimmed
6008  * @start:              first group block to examine
6009  * @max:                last group block to examine
6010  * @minblocks:          minimum extent block count
6011  *
6012  * ext4_trim_all_free walks through the group's buddy bitmap searching for
6013  * free extents. When a free extent is found, ext4_trim_extent() marks it as
6014  * used in the buddy bitmap, issues a TRIM command on it, and then frees it
6015  * again in the buddy bitmap. This is repeated until the whole group has
6016  * been scanned.
6021  */
6022 static ext4_grpblk_t
6023 ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
6024                    ext4_grpblk_t start, ext4_grpblk_t max,
6025                    ext4_grpblk_t minblocks)
6026 {
6027         struct ext4_buddy e4b;
6028         int ret;
6029
6030         trace_ext4_trim_all_free(sb, group, start, max);
6031
6032         ret = ext4_mb_load_buddy(sb, group, &e4b);
6033         if (ret) {
6034                 ext4_warning(sb, "Error %d loading buddy information for %u",
6035                              ret, group);
6036                 return ret;
6037         }
6038
6039         ext4_lock_group(sb, group);
6040
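             /*
              * Skip the scan if this group has already been trimmed with an
              * equal or smaller minimum length; EXT4_MB_GRP_TRIMMED is
              * cleared again whenever blocks are freed in the group without
              * being discarded.
              */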
6041         if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
6042             minblocks < EXT4_SB(sb)->s_last_trim_minblks)
6043                 ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
6044         else
6045                 ret = 0;
6046
6047         ext4_unlock_group(sb, group);
6048         ext4_mb_unload_buddy(&e4b);
6049
6050         ext4_debug("trimmed %d blocks in the group %d\n",
6051                 ret, group);
6052
6053         return ret;
6054 }
6055
6056 /**
6057  * ext4_trim_fs() -- trim ioctl handler function
6058  * @sb:                 superblock for filesystem
6059  * @range:              fstrim_range structure
6060  *
6061  * start:       first byte to trim
6062  * len:         number of bytes to trim from start
6063  * minlen:      minimum extent length in bytes
6064  * ext4_trim_fs goes through all allocation groups containing bytes from
6065  * start to start+len. For each such group, ext4_trim_all_free() is invoked
6066  * to trim all free space.
6067  */
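     /*
      * This is the backend of the FITRIM ioctl: @range comes from userspace
      * (e.g. fstrim(8)), and range->len is updated on return with the number
      * of bytes actually discarded.
      */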
6068 int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
6069 {
6070         struct request_queue *q = bdev_get_queue(sb->s_bdev);
6071         struct ext4_group_info *grp;
6072         ext4_group_t group, first_group, last_group;
6073         ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
6074         uint64_t start, end, minlen, trimmed = 0;
6075         ext4_fsblk_t first_data_blk =
6076                         le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
6077         ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
6078         int ret = 0;
6079
6080         start = range->start >> sb->s_blocksize_bits;
6081         end = start + (range->len >> sb->s_blocksize_bits) - 1;
6082         minlen = EXT4_NUM_B2C(EXT4_SB(sb),
6083                               range->minlen >> sb->s_blocksize_bits);
6084
6085         if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
6086             start >= max_blks ||
6087             range->len < sb->s_blocksize)
6088                 return -EINVAL;
6089         /* No point in trying to trim less than the discard granularity */
6090         if (range->minlen < q->limits.discard_granularity) {
6091                 minlen = EXT4_NUM_B2C(EXT4_SB(sb),
6092                         q->limits.discard_granularity >> sb->s_blocksize_bits);
6093                 if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
6094                         goto out;
6095         }
6096         if (end >= max_blks - 1)
6097                 end = max_blks - 1;
6098         if (end <= first_data_blk)
6099                 goto out;
6100         if (start < first_data_blk)
6101                 start = first_data_blk;
6102
6103         /* Determine first and last group to examine based on start and end */
6104         ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
6105                                      &first_group, &first_cluster);
6106         ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
6107                                      &last_group, &last_cluster);
6108
6109         /* end now represents the last cluster to discard in this group */
6110         end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
6111
6112         for (group = first_group; group <= last_group; group++) {
6113                 if (ext4_trim_interrupted())
6114                         break;
6115                 grp = ext4_get_group_info(sb, group);
6116                 if (!grp)
6117                         continue;
6118                 /* We only do this if the grp has never been initialized */
6119                 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
6120                         ret = ext4_mb_init_group(sb, group, GFP_NOFS);
6121                         if (ret)
6122                                 break;
6123                 }
6124
6125                 /*
6126                  * For all the groups except the last one, last cluster will
6127                  * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
6128                  * change it for the last group, note that last_cluster is
6129                  * already computed earlier by ext4_get_group_no_and_offset()
6130                  */
6131                 if (group == last_group)
6132                         end = last_cluster;
6133                 if (grp->bb_free >= minlen) {
6134                         cnt = ext4_trim_all_free(sb, group, first_cluster,
6135                                                  end, minlen);
6136                         if (cnt < 0) {
6137                                 ret = cnt;
6138                                 break;
6139                         }
6140                         trimmed += cnt;
6141                 }
6142
6143                 /*
6144                  * For every group except the first one, we are sure
6145                  * that the first cluster to discard will be cluster #0.
6146                  */
6147                 first_cluster = 0;
6148         }
6149
6150         if (!ret)
6151                 EXT4_SB(sb)->s_last_trim_minblks = minlen;
6152
6153 out:
6154         range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
6155         return ret;
6156 }
6157
6158 /* Iterate all the free extents in the group. */
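     /*
      * @formatter is invoked once per free extent with the group lock
      * dropped, so it may sleep; it is used e.g. by the ext4 GETFSMAP
      * implementation to report free-space records.
      */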
6159 int
6160 ext4_mballoc_query_range(
6161         struct super_block              *sb,
6162         ext4_group_t                    group,
6163         ext4_grpblk_t                   start,
6164         ext4_grpblk_t                   end,
6165         ext4_mballoc_query_range_fn     formatter,
6166         void                            *priv)
6167 {
6168         void                            *bitmap;
6169         ext4_grpblk_t                   next;
6170         struct ext4_buddy               e4b;
6171         int                             error;
6172
6173         error = ext4_mb_load_buddy(sb, group, &e4b);
6174         if (error)
6175                 return error;
6176         bitmap = e4b.bd_bitmap;
6177
6178         ext4_lock_group(sb, group);
6179
6180         start = max(e4b.bd_info->bb_first_free, start);
6181         if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
6182                 end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
6183
6184         while (start <= end) {
6185                 start = mb_find_next_zero_bit(bitmap, end + 1, start);
6186                 if (start > end)
6187                         break;
6188                 next = mb_find_next_bit(bitmap, end + 1, start);
6189
6190                 ext4_unlock_group(sb, group);
6191                 error = formatter(sb, group, start, next - start, priv);
6192                 if (error)
6193                         goto out_unload;
6194                 ext4_lock_group(sb, group);
6195
6196                 start = next + 1;
6197         }
6198
6199         ext4_unlock_group(sb, group);
6200 out_unload:
6201         ext4_mb_unload_buddy(&e4b);
6202
6203         return error;
6204 }