GNU Linux-libre 4.9.308-gnu1
[releases.git] fs/btrfs/free-space-cache.c
1 /*
2  * Copyright (C) 2008 Red Hat.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/pagemap.h>
20 #include <linux/sched.h>
21 #include <linux/slab.h>
22 #include <linux/math64.h>
23 #include <linux/ratelimit.h>
24 #include "ctree.h"
25 #include "free-space-cache.h"
26 #include "transaction.h"
27 #include "disk-io.h"
28 #include "extent_io.h"
29 #include "inode-map.h"
30 #include "volumes.h"
31
32 #define BITS_PER_BITMAP         (PAGE_SIZE * 8UL)
33 #define MAX_CACHE_BYTES_PER_GIG SZ_32K
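/*
 * Rough sizing note, inferred from the helpers in this file: each bitmap
 * entry is backed by a single page and therefore covers PAGE_SIZE * 8 units
 * of ctl->unit bytes, while MAX_CACHE_BYTES_PER_GIG bounds how much memory
 * the bitmaps may consume per GiB of block group space when the bitmap
 * thresholds are recalculated.
 */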
34
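/*
 * A range that has been removed from the free space rbtree while it is being
 * trimmed.  Such ranges stay on ctl->trimming_ranges so that a concurrent
 * cache write-out still records the space; see the trimming_ranges walk in
 * write_cache_extent_entries() below.
 */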
35 struct btrfs_trim_range {
36         u64 start;
37         u64 bytes;
38         struct list_head list;
39 };
40
41 static int link_free_space(struct btrfs_free_space_ctl *ctl,
42                            struct btrfs_free_space *info);
43 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
44                               struct btrfs_free_space *info);
45
46 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
47                                                struct btrfs_path *path,
48                                                u64 offset)
49 {
50         struct btrfs_key key;
51         struct btrfs_key location;
52         struct btrfs_disk_key disk_key;
53         struct btrfs_free_space_header *header;
54         struct extent_buffer *leaf;
55         struct inode *inode = NULL;
56         int ret;
57
58         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
59         key.offset = offset;
60         key.type = 0;
61
62         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
63         if (ret < 0)
64                 return ERR_PTR(ret);
65         if (ret > 0) {
66                 btrfs_release_path(path);
67                 return ERR_PTR(-ENOENT);
68         }
69
70         leaf = path->nodes[0];
71         header = btrfs_item_ptr(leaf, path->slots[0],
72                                 struct btrfs_free_space_header);
73         btrfs_free_space_key(leaf, header, &disk_key);
74         btrfs_disk_key_to_cpu(&location, &disk_key);
75         btrfs_release_path(path);
76
77         inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
78         if (!inode)
79                 return ERR_PTR(-ENOENT);
80         if (IS_ERR(inode))
81                 return inode;
82         if (is_bad_inode(inode)) {
83                 iput(inode);
84                 return ERR_PTR(-ENOENT);
85         }
86
87         mapping_set_gfp_mask(inode->i_mapping,
88                         mapping_gfp_constraint(inode->i_mapping,
89                         ~(__GFP_FS | __GFP_HIGHMEM)));
90
91         return inode;
92 }
93
94 struct inode *lookup_free_space_inode(struct btrfs_root *root,
95                                       struct btrfs_block_group_cache
96                                       *block_group, struct btrfs_path *path)
97 {
98         struct inode *inode = NULL;
99         u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
100
101         spin_lock(&block_group->lock);
102         if (block_group->inode)
103                 inode = igrab(block_group->inode);
104         spin_unlock(&block_group->lock);
105         if (inode)
106                 return inode;
107
108         inode = __lookup_free_space_inode(root, path,
109                                           block_group->key.objectid);
110         if (IS_ERR(inode))
111                 return inode;
112
113         spin_lock(&block_group->lock);
114         if (!((BTRFS_I(inode)->flags & flags) == flags)) {
115                 btrfs_info(root->fs_info,
116                         "Old style space inode found, converting.");
117                 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
118                         BTRFS_INODE_NODATACOW;
119                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
120         }
121
122         if (!block_group->iref) {
123                 block_group->inode = igrab(inode);
124                 block_group->iref = 1;
125         }
126         spin_unlock(&block_group->lock);
127
128         return inode;
129 }
130
131 static int __create_free_space_inode(struct btrfs_root *root,
132                                      struct btrfs_trans_handle *trans,
133                                      struct btrfs_path *path,
134                                      u64 ino, u64 offset)
135 {
136         struct btrfs_key key;
137         struct btrfs_disk_key disk_key;
138         struct btrfs_free_space_header *header;
139         struct btrfs_inode_item *inode_item;
140         struct extent_buffer *leaf;
141         u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
142         int ret;
143
144         ret = btrfs_insert_empty_inode(trans, root, path, ino);
145         if (ret)
146                 return ret;
147
148         /* We inline crcs for the free disk space cache */
149         if (ino != BTRFS_FREE_INO_OBJECTID)
150                 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
151
152         leaf = path->nodes[0];
153         inode_item = btrfs_item_ptr(leaf, path->slots[0],
154                                     struct btrfs_inode_item);
155         btrfs_item_key(leaf, &disk_key, path->slots[0]);
156         memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
157                              sizeof(*inode_item));
158         btrfs_set_inode_generation(leaf, inode_item, trans->transid);
159         btrfs_set_inode_size(leaf, inode_item, 0);
160         btrfs_set_inode_nbytes(leaf, inode_item, 0);
161         btrfs_set_inode_uid(leaf, inode_item, 0);
162         btrfs_set_inode_gid(leaf, inode_item, 0);
163         btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
164         btrfs_set_inode_flags(leaf, inode_item, flags);
165         btrfs_set_inode_nlink(leaf, inode_item, 1);
166         btrfs_set_inode_transid(leaf, inode_item, trans->transid);
167         btrfs_set_inode_block_group(leaf, inode_item, offset);
168         btrfs_mark_buffer_dirty(leaf);
169         btrfs_release_path(path);
170
171         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
172         key.offset = offset;
173         key.type = 0;
174         ret = btrfs_insert_empty_item(trans, root, path, &key,
175                                       sizeof(struct btrfs_free_space_header));
176         if (ret < 0) {
177                 btrfs_release_path(path);
178                 return ret;
179         }
180
181         leaf = path->nodes[0];
182         header = btrfs_item_ptr(leaf, path->slots[0],
183                                 struct btrfs_free_space_header);
184         memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
185         btrfs_set_free_space_key(leaf, header, &disk_key);
186         btrfs_mark_buffer_dirty(leaf);
187         btrfs_release_path(path);
188
189         return 0;
190 }
191
192 int create_free_space_inode(struct btrfs_root *root,
193                             struct btrfs_trans_handle *trans,
194                             struct btrfs_block_group_cache *block_group,
195                             struct btrfs_path *path)
196 {
197         int ret;
198         u64 ino;
199
200         ret = btrfs_find_free_objectid(root, &ino);
201         if (ret < 0)
202                 return ret;
203
204         return __create_free_space_inode(root, trans, path, ino,
205                                          block_group->key.objectid);
206 }
207
208 int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
209                                        struct btrfs_block_rsv *rsv)
210 {
211         u64 needed_bytes;
212         int ret;
213
214         /* 1 for slack space, 1 for updating the inode */
215         needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
216                 btrfs_calc_trans_metadata_size(root, 1);
217
218         spin_lock(&rsv->lock);
219         if (rsv->reserved < needed_bytes)
220                 ret = -ENOSPC;
221         else
222                 ret = 0;
223         spin_unlock(&rsv->lock);
224         return ret;
225 }
226
227 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
228                                     struct btrfs_trans_handle *trans,
229                                     struct btrfs_block_group_cache *block_group,
230                                     struct inode *inode)
231 {
232         int ret = 0;
233         struct btrfs_path *path = btrfs_alloc_path();
234         bool locked = false;
235
236         if (!path) {
237                 ret = -ENOMEM;
238                 goto fail;
239         }
240
241         if (block_group) {
242                 locked = true;
243                 mutex_lock(&trans->transaction->cache_write_mutex);
244                 if (!list_empty(&block_group->io_list)) {
245                         list_del_init(&block_group->io_list);
246
247                         btrfs_wait_cache_io(root, trans, block_group,
248                                             &block_group->io_ctl, path,
249                                             block_group->key.objectid);
250                         btrfs_put_block_group(block_group);
251                 }
252
253                 /*
254                  * now that we've truncated the cache away, it's no longer
255                  * set up or written
256                  */
257                 spin_lock(&block_group->lock);
258                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
259                 spin_unlock(&block_group->lock);
260         }
261         btrfs_free_path(path);
262
263         btrfs_i_size_write(inode, 0);
264         truncate_pagecache(inode, 0);
265
266         /*
267          * We don't need an orphan item because truncating the free space cache
268          * will never be split across transactions.
269          * We don't need to check for -EAGAIN because we're a free space
270          * cache inode
271          */
272         ret = btrfs_truncate_inode_items(trans, root, inode,
273                                          0, BTRFS_EXTENT_DATA_KEY);
274         if (ret)
275                 goto fail;
276
277         ret = btrfs_update_inode(trans, root, inode);
278
279 fail:
280         if (locked)
281                 mutex_unlock(&trans->transaction->cache_write_mutex);
282         if (ret)
283                 btrfs_abort_transaction(trans, ret);
284
285         return ret;
286 }
287
288 static int readahead_cache(struct inode *inode)
289 {
290         struct file_ra_state *ra;
291         unsigned long last_index;
292
293         ra = kzalloc(sizeof(*ra), GFP_NOFS);
294         if (!ra)
295                 return -ENOMEM;
296
297         file_ra_state_init(ra, inode->i_mapping);
298         last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
299
300         page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
301
302         kfree(ra);
303
304         return 0;
305 }
306
307 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
308                        struct btrfs_root *root, int write)
309 {
310         int num_pages;
311         int check_crcs = 0;
312
313         num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
314
315         if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
316                 check_crcs = 1;
317
318         /* Make sure we can fit our crcs into the first page */
319         if (write && check_crcs &&
320             (num_pages * sizeof(u32)) >= PAGE_SIZE)
321                 return -ENOSPC;
322
323         memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
324
325         io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
326         if (!io_ctl->pages)
327                 return -ENOMEM;
328
329         io_ctl->num_pages = num_pages;
330         io_ctl->root = root;
331         io_ctl->check_crcs = check_crcs;
332         io_ctl->inode = inode;
333
334         return 0;
335 }
336
337 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
338 {
339         kfree(io_ctl->pages);
340         io_ctl->pages = NULL;
341 }
342
343 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
344 {
345         if (io_ctl->cur) {
346                 io_ctl->cur = NULL;
347                 io_ctl->orig = NULL;
348         }
349 }
350
351 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
352 {
353         ASSERT(io_ctl->index < io_ctl->num_pages);
354         io_ctl->page = io_ctl->pages[io_ctl->index++];
355         io_ctl->cur = page_address(io_ctl->page);
356         io_ctl->orig = io_ctl->cur;
357         io_ctl->size = PAGE_SIZE;
358         if (clear)
359                 memset(io_ctl->cur, 0, PAGE_SIZE);
360 }
361
362 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
363 {
364         int i;
365
366         io_ctl_unmap_page(io_ctl);
367
368         for (i = 0; i < io_ctl->num_pages; i++) {
369                 if (io_ctl->pages[i]) {
370                         ClearPageChecked(io_ctl->pages[i]);
371                         unlock_page(io_ctl->pages[i]);
372                         put_page(io_ctl->pages[i]);
373                 }
374         }
375 }
376
377 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
378                                 int uptodate)
379 {
380         struct page *page;
381         gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
382         int i;
383
384         for (i = 0; i < io_ctl->num_pages; i++) {
385                 page = find_or_create_page(inode->i_mapping, i, mask);
386                 if (!page) {
387                         io_ctl_drop_pages(io_ctl);
388                         return -ENOMEM;
389                 }
390                 io_ctl->pages[i] = page;
391                 if (uptodate && !PageUptodate(page)) {
392                         btrfs_readpage(NULL, page);
393                         lock_page(page);
394                         if (page->mapping != inode->i_mapping) {
395                                 btrfs_err(BTRFS_I(inode)->root->fs_info,
396                                           "free space cache page truncated");
397                                 io_ctl_drop_pages(io_ctl);
398                                 return -EIO;
399                         }
400                         if (!PageUptodate(page)) {
401                                 btrfs_err(BTRFS_I(inode)->root->fs_info,
402                                            "error reading free space cache");
403                                 io_ctl_drop_pages(io_ctl);
404                                 return -EIO;
405                         }
406                 }
407         }
408
409         for (i = 0; i < io_ctl->num_pages; i++) {
410                 clear_page_dirty_for_io(io_ctl->pages[i]);
411                 set_page_extent_mapped(io_ctl->pages[i]);
412         }
413
414         return 0;
415 }
416
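/*
 * Rough on-disk layout of a space cache file, as implied by the io_ctl
 * helpers below: when crcs are in use, the first page begins with an array
 * of one u32 crc per page followed by a u64 generation, then tightly packed
 * btrfs_free_space_entry records; each bitmap occupies a full page of its
 * own after the entries.
 */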
417 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
418 {
419         __le64 *val;
420
421         io_ctl_map_page(io_ctl, 1);
422
423         /*
424          * Skip the csum areas.  If we don't check crcs then we just have a
425          * 64bit chunk at the front of the first page.
426          */
427         if (io_ctl->check_crcs) {
428                 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
429                 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
430         } else {
431                 io_ctl->cur += sizeof(u64);
432                 io_ctl->size -= sizeof(u64) * 2;
433         }
434
435         val = io_ctl->cur;
436         *val = cpu_to_le64(generation);
437         io_ctl->cur += sizeof(u64);
438 }
439
440 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
441 {
442         __le64 *gen;
443
444         /*
445          * Skip the crc area.  If we don't check crcs then we just have a 64bit
446          * chunk at the front of the first page.
447          */
448         if (io_ctl->check_crcs) {
449                 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
450                 io_ctl->size -= sizeof(u64) +
451                         (sizeof(u32) * io_ctl->num_pages);
452         } else {
453                 io_ctl->cur += sizeof(u64);
454                 io_ctl->size -= sizeof(u64) * 2;
455         }
456
457         gen = io_ctl->cur;
458         if (le64_to_cpu(*gen) != generation) {
459                 btrfs_err_rl(io_ctl->root->fs_info,
460                         "space cache generation (%llu) does not match inode (%llu)",
461                                 *gen, generation);
462                 io_ctl_unmap_page(io_ctl);
463                 return -EIO;
464         }
465         io_ctl->cur += sizeof(u64);
466         return 0;
467 }
468
469 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
470 {
471         u32 *tmp;
472         u32 crc = ~(u32)0;
473         unsigned offset = 0;
474
475         if (!io_ctl->check_crcs) {
476                 io_ctl_unmap_page(io_ctl);
477                 return;
478         }
479
480         if (index == 0)
481                 offset = sizeof(u32) * io_ctl->num_pages;
482
483         crc = btrfs_csum_data(io_ctl->orig + offset, crc,
484                               PAGE_SIZE - offset);
485         btrfs_csum_final(crc, (char *)&crc);
486         io_ctl_unmap_page(io_ctl);
487         tmp = page_address(io_ctl->pages[0]);
488         tmp += index;
489         *tmp = crc;
490 }
491
492 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
493 {
494         u32 *tmp, val;
495         u32 crc = ~(u32)0;
496         unsigned offset = 0;
497
498         if (!io_ctl->check_crcs) {
499                 io_ctl_map_page(io_ctl, 0);
500                 return 0;
501         }
502
503         if (index == 0)
504                 offset = sizeof(u32) * io_ctl->num_pages;
505
506         tmp = page_address(io_ctl->pages[0]);
507         tmp += index;
508         val = *tmp;
509
510         io_ctl_map_page(io_ctl, 0);
511         crc = btrfs_csum_data(io_ctl->orig + offset, crc,
512                               PAGE_SIZE - offset);
513         btrfs_csum_final(crc, (char *)&crc);
514         if (val != crc) {
515                 btrfs_err_rl(io_ctl->root->fs_info,
516                         "csum mismatch on free space cache");
517                 io_ctl_unmap_page(io_ctl);
518                 return -EIO;
519         }
520
521         return 0;
522 }
523
524 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
525                             void *bitmap)
526 {
527         struct btrfs_free_space_entry *entry;
528
529         if (!io_ctl->cur)
530                 return -ENOSPC;
531
532         entry = io_ctl->cur;
533         entry->offset = cpu_to_le64(offset);
534         entry->bytes = cpu_to_le64(bytes);
535         entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
536                 BTRFS_FREE_SPACE_EXTENT;
537         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
538         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
539
540         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
541                 return 0;
542
543         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
544
545         /* No more pages to map */
546         if (io_ctl->index >= io_ctl->num_pages)
547                 return 0;
548
549         /* map the next page */
550         io_ctl_map_page(io_ctl, 1);
551         return 0;
552 }
553
554 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
555 {
556         if (!io_ctl->cur)
557                 return -ENOSPC;
558
559         /*
560          * If we aren't at the start of the current page, unmap this one and
561          * map the next one if there is any left.
562          */
563         if (io_ctl->cur != io_ctl->orig) {
564                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
565                 if (io_ctl->index >= io_ctl->num_pages)
566                         return -ENOSPC;
567                 io_ctl_map_page(io_ctl, 0);
568         }
569
570         memcpy(io_ctl->cur, bitmap, PAGE_SIZE);
571         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
572         if (io_ctl->index < io_ctl->num_pages)
573                 io_ctl_map_page(io_ctl, 0);
574         return 0;
575 }
576
577 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
578 {
579         /*
580          * If we're not on the boundary we know we've modified the page and we
581          * need to crc the page.
582          */
583         if (io_ctl->cur != io_ctl->orig)
584                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
585         else
586                 io_ctl_unmap_page(io_ctl);
587
588         while (io_ctl->index < io_ctl->num_pages) {
589                 io_ctl_map_page(io_ctl, 1);
590                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
591         }
592 }
593
594 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
595                             struct btrfs_free_space *entry, u8 *type)
596 {
597         struct btrfs_free_space_entry *e;
598         int ret;
599
600         if (!io_ctl->cur) {
601                 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
602                 if (ret)
603                         return ret;
604         }
605
606         e = io_ctl->cur;
607         entry->offset = le64_to_cpu(e->offset);
608         entry->bytes = le64_to_cpu(e->bytes);
609         *type = e->type;
610         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
611         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
612
613         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
614                 return 0;
615
616         io_ctl_unmap_page(io_ctl);
617
618         return 0;
619 }
620
621 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
622                               struct btrfs_free_space *entry)
623 {
624         int ret;
625
626         ret = io_ctl_check_crc(io_ctl, io_ctl->index);
627         if (ret)
628                 return ret;
629
630         memcpy(entry->bitmap, io_ctl->cur, PAGE_SIZE);
631         io_ctl_unmap_page(io_ctl);
632
633         return 0;
634 }
635
636 /*
637  * Since we attach pinned extents after the fact we can have contiguous sections
638  * of free space that are split up in entries.  This poses a problem with the
639  * tree logging stuff since it could have allocated across what appears to be 2
640  * entries since we would have merged the entries when adding the pinned extents
641  * back to the free space cache.  So run through the space cache that we just
642  * loaded and merge contiguous entries.  This will make the log replay stuff not
643  * blow up and it will make for nicer allocator behavior.
644  */
645 static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
646 {
647         struct btrfs_free_space *e, *prev = NULL;
648         struct rb_node *n;
649
650 again:
651         spin_lock(&ctl->tree_lock);
652         for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
653                 e = rb_entry(n, struct btrfs_free_space, offset_index);
654                 if (!prev)
655                         goto next;
656                 if (e->bitmap || prev->bitmap)
657                         goto next;
658                 if (prev->offset + prev->bytes == e->offset) {
659                         unlink_free_space(ctl, prev);
660                         unlink_free_space(ctl, e);
661                         prev->bytes += e->bytes;
662                         kmem_cache_free(btrfs_free_space_cachep, e);
663                         link_free_space(ctl, prev);
664                         prev = NULL;
665                         spin_unlock(&ctl->tree_lock);
666                         goto again;
667                 }
668 next:
669                 prev = e;
670         }
671         spin_unlock(&ctl->tree_lock);
672 }
673
674 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
675                                    struct btrfs_free_space_ctl *ctl,
676                                    struct btrfs_path *path, u64 offset)
677 {
678         struct btrfs_free_space_header *header;
679         struct extent_buffer *leaf;
680         struct btrfs_io_ctl io_ctl;
681         struct btrfs_key key;
682         struct btrfs_free_space *e, *n;
683         LIST_HEAD(bitmaps);
684         u64 num_entries;
685         u64 num_bitmaps;
686         u64 generation;
687         u8 type;
688         int ret = 0;
689
690         /* Nothing in the space cache, goodbye */
691         if (!i_size_read(inode))
692                 return 0;
693
694         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
695         key.offset = offset;
696         key.type = 0;
697
698         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
699         if (ret < 0)
700                 return 0;
701         else if (ret > 0) {
702                 btrfs_release_path(path);
703                 return 0;
704         }
705
706         ret = -1;
707
708         leaf = path->nodes[0];
709         header = btrfs_item_ptr(leaf, path->slots[0],
710                                 struct btrfs_free_space_header);
711         num_entries = btrfs_free_space_entries(leaf, header);
712         num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
713         generation = btrfs_free_space_generation(leaf, header);
714         btrfs_release_path(path);
715
716         if (!BTRFS_I(inode)->generation) {
717                 btrfs_info(root->fs_info,
718                            "the free space cache file (%llu) is invalid, skipping it",
719                            offset);
720                 return 0;
721         }
722
723         if (BTRFS_I(inode)->generation != generation) {
724                 btrfs_err(root->fs_info,
725                         "free space inode generation (%llu) did not match free space cache generation (%llu)",
726                         BTRFS_I(inode)->generation, generation);
727                 return 0;
728         }
729
730         if (!num_entries)
731                 return 0;
732
733         ret = io_ctl_init(&io_ctl, inode, root, 0);
734         if (ret)
735                 return ret;
736
737         ret = readahead_cache(inode);
738         if (ret)
739                 goto out;
740
741         ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
742         if (ret)
743                 goto out;
744
745         ret = io_ctl_check_crc(&io_ctl, 0);
746         if (ret)
747                 goto free_cache;
748
749         ret = io_ctl_check_generation(&io_ctl, generation);
750         if (ret)
751                 goto free_cache;
752
753         while (num_entries) {
754                 e = kmem_cache_zalloc(btrfs_free_space_cachep,
755                                       GFP_NOFS);
756                 if (!e) {
757                         ret = -ENOMEM;
758                         goto free_cache;
759                 }
760
761                 ret = io_ctl_read_entry(&io_ctl, e, &type);
762                 if (ret) {
763                         kmem_cache_free(btrfs_free_space_cachep, e);
764                         goto free_cache;
765                 }
766
767                 if (!e->bytes) {
768                         ret = -1;
769                         kmem_cache_free(btrfs_free_space_cachep, e);
770                         goto free_cache;
771                 }
772
773                 if (type == BTRFS_FREE_SPACE_EXTENT) {
774                         spin_lock(&ctl->tree_lock);
775                         ret = link_free_space(ctl, e);
776                         spin_unlock(&ctl->tree_lock);
777                         if (ret) {
778                                 btrfs_err(root->fs_info,
779                                         "Duplicate entries in free space cache, dumping");
780                                 kmem_cache_free(btrfs_free_space_cachep, e);
781                                 goto free_cache;
782                         }
783                 } else {
784                         ASSERT(num_bitmaps);
785                         num_bitmaps--;
786                         e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
787                         if (!e->bitmap) {
788                                 ret = -ENOMEM;
789                                 kmem_cache_free(
790                                         btrfs_free_space_cachep, e);
791                                 goto free_cache;
792                         }
793                         spin_lock(&ctl->tree_lock);
794                         ret = link_free_space(ctl, e);
795                         ctl->total_bitmaps++;
796                         ctl->op->recalc_thresholds(ctl);
797                         spin_unlock(&ctl->tree_lock);
798                         if (ret) {
799                                 btrfs_err(root->fs_info,
800                                         "Duplicate entries in free space cache, dumping");
801                                 kmem_cache_free(btrfs_free_space_cachep, e);
802                                 goto free_cache;
803                         }
804                         list_add_tail(&e->list, &bitmaps);
805                 }
806
807                 num_entries--;
808         }
809
810         io_ctl_unmap_page(&io_ctl);
811
812         /*
813          * The bitmap data is stored after the entries on disk, so now
814          * that the entries have been read we fill in each queued bitmap.
815          */
816         list_for_each_entry_safe(e, n, &bitmaps, list) {
817                 list_del_init(&e->list);
818                 ret = io_ctl_read_bitmap(&io_ctl, e);
819                 if (ret)
820                         goto free_cache;
821         }
822
823         io_ctl_drop_pages(&io_ctl);
824         merge_space_tree(ctl);
825         ret = 1;
826 out:
827         io_ctl_free(&io_ctl);
828         return ret;
829 free_cache:
830         io_ctl_drop_pages(&io_ctl);
831         __btrfs_remove_free_space_cache(ctl);
832         goto out;
833 }
834
835 int load_free_space_cache(struct btrfs_fs_info *fs_info,
836                           struct btrfs_block_group_cache *block_group)
837 {
838         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
839         struct btrfs_root *root = fs_info->tree_root;
840         struct inode *inode;
841         struct btrfs_path *path;
842         int ret = 0;
843         bool matched;
844         u64 used = btrfs_block_group_used(&block_group->item);
845
846         /*
847          * If this block group has been marked to be cleared for one reason or
848          * another then we can't trust the on disk cache, so just return.
849          */
850         spin_lock(&block_group->lock);
851         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
852                 spin_unlock(&block_group->lock);
853                 return 0;
854         }
855         spin_unlock(&block_group->lock);
856
857         path = btrfs_alloc_path();
858         if (!path)
859                 return 0;
860         path->search_commit_root = 1;
861         path->skip_locking = 1;
862
863         inode = lookup_free_space_inode(root, block_group, path);
864         if (IS_ERR(inode)) {
865                 btrfs_free_path(path);
866                 return 0;
867         }
868
869         /* We may have converted the inode and made the cache invalid. */
870         spin_lock(&block_group->lock);
871         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
872                 spin_unlock(&block_group->lock);
873                 btrfs_free_path(path);
874                 goto out;
875         }
876         spin_unlock(&block_group->lock);
877
878         ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
879                                       path, block_group->key.objectid);
880         btrfs_free_path(path);
881         if (ret <= 0)
882                 goto out;
883
884         spin_lock(&ctl->tree_lock);
885         matched = (ctl->free_space == (block_group->key.offset - used -
886                                        block_group->bytes_super));
887         spin_unlock(&ctl->tree_lock);
888
889         if (!matched) {
890                 __btrfs_remove_free_space_cache(ctl);
891                 btrfs_warn(fs_info,
892                            "block group %llu has wrong amount of free space",
893                            block_group->key.objectid);
894                 ret = -1;
895         }
896 out:
897         if (ret < 0) {
898                 /* This cache is bogus, make sure it gets cleared */
899                 spin_lock(&block_group->lock);
900                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
901                 spin_unlock(&block_group->lock);
902                 ret = 0;
903
904                 btrfs_warn(fs_info,
905                            "failed to load free space cache for block group %llu, rebuilding it now",
906                            block_group->key.objectid);
907         }
908
909         iput(inode);
910         return ret;
911 }
912
913 static noinline_for_stack
914 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
915                               struct btrfs_free_space_ctl *ctl,
916                               struct btrfs_block_group_cache *block_group,
917                               int *entries, int *bitmaps,
918                               struct list_head *bitmap_list)
919 {
920         int ret;
921         struct btrfs_free_cluster *cluster = NULL;
922         struct btrfs_free_cluster *cluster_locked = NULL;
923         struct rb_node *node = rb_first(&ctl->free_space_offset);
924         struct btrfs_trim_range *trim_entry;
925
926         /* Get the cluster for this block_group if it exists */
927         if (block_group && !list_empty(&block_group->cluster_list)) {
928                 cluster = list_entry(block_group->cluster_list.next,
929                                      struct btrfs_free_cluster,
930                                      block_group_list);
931         }
932
933         if (!node && cluster) {
934                 cluster_locked = cluster;
935                 spin_lock(&cluster_locked->lock);
936                 node = rb_first(&cluster->root);
937                 cluster = NULL;
938         }
939
940         /* Write out the extent entries */
941         while (node) {
942                 struct btrfs_free_space *e;
943
944                 e = rb_entry(node, struct btrfs_free_space, offset_index);
945                 *entries += 1;
946
947                 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
948                                        e->bitmap);
949                 if (ret)
950                         goto fail;
951
952                 if (e->bitmap) {
953                         list_add_tail(&e->list, bitmap_list);
954                         *bitmaps += 1;
955                 }
956                 node = rb_next(node);
957                 if (!node && cluster) {
958                         node = rb_first(&cluster->root);
959                         cluster_locked = cluster;
960                         spin_lock(&cluster_locked->lock);
961                         cluster = NULL;
962                 }
963         }
964         if (cluster_locked) {
965                 spin_unlock(&cluster_locked->lock);
966                 cluster_locked = NULL;
967         }
968
969         /*
970          * Make sure we don't miss any range that was removed from our rbtree
971          * because trimming is running. Otherwise after a umount+mount (or crash
972          * after committing the transaction) we would leak free space and get
973          * an inconsistent free space cache report from fsck.
974          */
975         list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
976                 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
977                                        trim_entry->bytes, NULL);
978                 if (ret)
979                         goto fail;
980                 *entries += 1;
981         }
982
983         return 0;
984 fail:
985         if (cluster_locked)
986                 spin_unlock(&cluster_locked->lock);
987         return -ENOSPC;
988 }
989
990 static noinline_for_stack int
991 update_cache_item(struct btrfs_trans_handle *trans,
992                   struct btrfs_root *root,
993                   struct inode *inode,
994                   struct btrfs_path *path, u64 offset,
995                   int entries, int bitmaps)
996 {
997         struct btrfs_key key;
998         struct btrfs_free_space_header *header;
999         struct extent_buffer *leaf;
1000         int ret;
1001
1002         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1003         key.offset = offset;
1004         key.type = 0;
1005
1006         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1007         if (ret < 0) {
1008                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1009                                  EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
1010                                  GFP_NOFS);
1011                 goto fail;
1012         }
1013         leaf = path->nodes[0];
1014         if (ret > 0) {
1015                 struct btrfs_key found_key;
1016                 ASSERT(path->slots[0]);
1017                 path->slots[0]--;
1018                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1019                 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1020                     found_key.offset != offset) {
1021                         clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1022                                          inode->i_size - 1,
1023                                          EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
1024                                          NULL, GFP_NOFS);
1025                         btrfs_release_path(path);
1026                         goto fail;
1027                 }
1028         }
1029
1030         BTRFS_I(inode)->generation = trans->transid;
1031         header = btrfs_item_ptr(leaf, path->slots[0],
1032                                 struct btrfs_free_space_header);
1033         btrfs_set_free_space_entries(leaf, header, entries);
1034         btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1035         btrfs_set_free_space_generation(leaf, header, trans->transid);
1036         btrfs_mark_buffer_dirty(leaf);
1037         btrfs_release_path(path);
1038
1039         return 0;
1040
1041 fail:
1042         return -1;
1043 }
1044
1045 static noinline_for_stack int
1046 write_pinned_extent_entries(struct btrfs_root *root,
1047                             struct btrfs_block_group_cache *block_group,
1048                             struct btrfs_io_ctl *io_ctl,
1049                             int *entries)
1050 {
1051         u64 start, extent_start, extent_end, len;
1052         struct extent_io_tree *unpin = NULL;
1053         int ret;
1054
1055         if (!block_group)
1056                 return 0;
1057
1058         /*
1059          * We want to add any pinned extents to our free space cache
1060          * so we don't leak the space
1061          *
1062          * We shouldn't have switched the pinned extents yet so this is the
1063          * right one
1064          */
1065         unpin = root->fs_info->pinned_extents;
1066
1067         start = block_group->key.objectid;
1068
1069         while (start < block_group->key.objectid + block_group->key.offset) {
1070                 ret = find_first_extent_bit(unpin, start,
1071                                             &extent_start, &extent_end,
1072                                             EXTENT_DIRTY, NULL);
1073                 if (ret)
1074                         return 0;
1075
1076                 /* This pinned extent is out of our range */
1077                 if (extent_start >= block_group->key.objectid +
1078                     block_group->key.offset)
1079                         return 0;
1080
1081                 extent_start = max(extent_start, start);
1082                 extent_end = min(block_group->key.objectid +
1083                                  block_group->key.offset, extent_end + 1);
1084                 len = extent_end - extent_start;
1085
1086                 *entries += 1;
1087                 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1088                 if (ret)
1089                         return -ENOSPC;
1090
1091                 start = extent_end;
1092         }
1093
1094         return 0;
1095 }
1096
1097 static noinline_for_stack int
1098 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1099 {
1100         struct btrfs_free_space *entry, *next;
1101         int ret;
1102
1103         /* Write out the bitmaps */
1104         list_for_each_entry_safe(entry, next, bitmap_list, list) {
1105                 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1106                 if (ret)
1107                         return -ENOSPC;
1108                 list_del_init(&entry->list);
1109         }
1110
1111         return 0;
1112 }
1113
1114 static int flush_dirty_cache(struct inode *inode)
1115 {
1116         int ret;
1117
1118         ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1119         if (ret)
1120                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1121                                  EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
1122                                  GFP_NOFS);
1123
1124         return ret;
1125 }
1126
1127 static void noinline_for_stack
1128 cleanup_bitmap_list(struct list_head *bitmap_list)
1129 {
1130         struct btrfs_free_space *entry, *next;
1131
1132         list_for_each_entry_safe(entry, next, bitmap_list, list)
1133                 list_del_init(&entry->list);
1134 }
1135
1136 static void noinline_for_stack
1137 cleanup_write_cache_enospc(struct inode *inode,
1138                            struct btrfs_io_ctl *io_ctl,
1139                            struct extent_state **cached_state,
1140                            struct list_head *bitmap_list)
1141 {
1142         io_ctl_drop_pages(io_ctl);
1143         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1144                              i_size_read(inode) - 1, cached_state,
1145                              GFP_NOFS);
1146 }
1147
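/*
 * Second half of the two-phase cache write-out: __btrfs_write_out_cache()
 * below starts the page writeback and stashes the entry/bitmap counts in the
 * io_ctl; this function waits for that IO and then updates the free space
 * header item so the cache is treated as valid on the next mount.
 */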
1148 int btrfs_wait_cache_io(struct btrfs_root *root,
1149                         struct btrfs_trans_handle *trans,
1150                         struct btrfs_block_group_cache *block_group,
1151                         struct btrfs_io_ctl *io_ctl,
1152                         struct btrfs_path *path, u64 offset)
1153 {
1154         int ret;
1155         struct inode *inode = io_ctl->inode;
1156
1157         if (!inode)
1158                 return 0;
1159
1160         if (block_group)
1161                 root = root->fs_info->tree_root;
1162
1163         /* Flush the dirty pages in the cache file. */
1164         ret = flush_dirty_cache(inode);
1165         if (ret)
1166                 goto out;
1167
1168         /* Update the cache item to tell everyone this cache file is valid. */
1169         ret = update_cache_item(trans, root, inode, path, offset,
1170                                 io_ctl->entries, io_ctl->bitmaps);
1171 out:
1172         if (ret) {
1173                 invalidate_inode_pages2(inode->i_mapping);
1174                 BTRFS_I(inode)->generation = 0;
1175                 if (block_group) {
1176 #ifdef DEBUG
1177                         btrfs_err(root->fs_info,
1178                                 "failed to write free space cache for block group %llu",
1179                                 block_group->key.objectid);
1180 #endif
1181                 }
1182         }
1183         btrfs_update_inode(trans, root, inode);
1184
1185         if (block_group) {
1186                 /* the dirty list is protected by the dirty_bgs_lock */
1187                 spin_lock(&trans->transaction->dirty_bgs_lock);
1188
1189                 /* the disk_cache_state is protected by the block group lock */
1190                 spin_lock(&block_group->lock);
1191
1192                 /*
1193                  * only mark this as written if we didn't get put back on
1194                  * the dirty list while waiting for IO.   Otherwise our
1195                  * cache state won't be right, and we won't get written again
1196                  */
1197                 if (!ret && list_empty(&block_group->dirty_list))
1198                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1199                 else if (ret)
1200                         block_group->disk_cache_state = BTRFS_DC_ERROR;
1201
1202                 spin_unlock(&block_group->lock);
1203                 spin_unlock(&trans->transaction->dirty_bgs_lock);
1204                 io_ctl->inode = NULL;
1205                 iput(inode);
1206         }
1207
1208         return ret;
1209
1210 }
1211
1212 /**
1213  * __btrfs_write_out_cache - write out cached info to an inode
1214  * @root: the root the inode belongs to
1215  * @ctl: the free space cache we are going to write out
1216  * @block_group: the block_group for this cache if it belongs to a block_group
1217  * @trans: the trans handle
1218  * @path: the path to use
1219  * @offset: the offset for the key we'll insert
1220  *
1221  * This function writes out a free space cache struct to disk for quick recovery
1222  * on mount.  This will return 0 if it was successful in writing the cache out,
1223  * or an errno if it was not.
1224  */
1225 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1226                                    struct btrfs_free_space_ctl *ctl,
1227                                    struct btrfs_block_group_cache *block_group,
1228                                    struct btrfs_io_ctl *io_ctl,
1229                                    struct btrfs_trans_handle *trans,
1230                                    struct btrfs_path *path, u64 offset)
1231 {
1232         struct extent_state *cached_state = NULL;
1233         LIST_HEAD(bitmap_list);
1234         int entries = 0;
1235         int bitmaps = 0;
1236         int ret;
1237         int must_iput = 0;
1238
1239         if (!i_size_read(inode))
1240                 return -EIO;
1241
1242         WARN_ON(io_ctl->pages);
1243         ret = io_ctl_init(io_ctl, inode, root, 1);
1244         if (ret)
1245                 return ret;
1246
1247         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1248                 down_write(&block_group->data_rwsem);
1249                 spin_lock(&block_group->lock);
1250                 if (block_group->delalloc_bytes) {
1251                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1252                         spin_unlock(&block_group->lock);
1253                         up_write(&block_group->data_rwsem);
1254                         BTRFS_I(inode)->generation = 0;
1255                         ret = 0;
1256                         must_iput = 1;
1257                         goto out;
1258                 }
1259                 spin_unlock(&block_group->lock);
1260         }
1261
1262         /* Lock all pages first so we can lock the extent safely. */
1263         ret = io_ctl_prepare_pages(io_ctl, inode, 0);
1264         if (ret)
1265                 goto out_unlock;
1266
1267         lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1268                          &cached_state);
1269
1270         io_ctl_set_generation(io_ctl, trans->transid);
1271
1272         mutex_lock(&ctl->cache_writeout_mutex);
1273         /* Write out the extent entries in the free space cache */
1274         spin_lock(&ctl->tree_lock);
1275         ret = write_cache_extent_entries(io_ctl, ctl,
1276                                          block_group, &entries, &bitmaps,
1277                                          &bitmap_list);
1278         if (ret)
1279                 goto out_nospc_locked;
1280
1281         /*
1282          * Some space that is freed in the current transaction is pinned;
1283          * it will be added back to the free space cache after the
1284          * transaction is committed, so we must not lose it.
1285          *
1286          * If this changes while we are working we'll get added back to
1287          * the dirty list and redo it.  No locking needed
1288          */
1289         ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
1290         if (ret)
1291                 goto out_nospc_locked;
1292
1293         /*
1294          * At last, we write out all the bitmaps and keep cache_writeout_mutex
1295          * locked while doing it because a concurrent trim can be manipulating
1296          * or freeing the bitmap.
1297          */
1298         ret = write_bitmap_entries(io_ctl, &bitmap_list);
1299         spin_unlock(&ctl->tree_lock);
1300         mutex_unlock(&ctl->cache_writeout_mutex);
1301         if (ret)
1302                 goto out_nospc;
1303
1304         /* Zero out the rest of the pages just to make sure */
1305         io_ctl_zero_remaining_pages(io_ctl);
1306
1307         /* Everything is written out, now we dirty the pages in the file. */
1308         ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
1309                                 0, i_size_read(inode), &cached_state);
1310         if (ret)
1311                 goto out_nospc;
1312
1313         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1314                 up_write(&block_group->data_rwsem);
1315         /*
1316          * Release the pages and unlock the extent, we will flush
1317          * them out later
1318          */
1319         io_ctl_drop_pages(io_ctl);
1320         io_ctl_free(io_ctl);
1321
1322         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1323                              i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1324
1325         /*
1326          * At this point the pages are under IO and we're happy.  The
1327          * caller is responsible for waiting on them and updating the
1328          * cache and the inode.
1329          */
1330         io_ctl->entries = entries;
1331         io_ctl->bitmaps = bitmaps;
1332
1333         ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1334         if (ret)
1335                 goto out;
1336
1337         return 0;
1338
1339 out:
1340         io_ctl->inode = NULL;
1341         io_ctl_free(io_ctl);
1342         if (ret) {
1343                 invalidate_inode_pages2(inode->i_mapping);
1344                 BTRFS_I(inode)->generation = 0;
1345         }
1346         btrfs_update_inode(trans, root, inode);
1347         if (must_iput)
1348                 iput(inode);
1349         return ret;
1350
1351 out_nospc_locked:
1352         cleanup_bitmap_list(&bitmap_list);
1353         spin_unlock(&ctl->tree_lock);
1354         mutex_unlock(&ctl->cache_writeout_mutex);
1355
1356 out_nospc:
1357         cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
1358
1359 out_unlock:
1360         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1361                 up_write(&block_group->data_rwsem);
1362
1363         goto out;
1364 }
1365
1366 int btrfs_write_out_cache(struct btrfs_root *root,
1367                           struct btrfs_trans_handle *trans,
1368                           struct btrfs_block_group_cache *block_group,
1369                           struct btrfs_path *path)
1370 {
1371         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1372         struct inode *inode;
1373         int ret = 0;
1374
1375         root = root->fs_info->tree_root;
1376
1377         spin_lock(&block_group->lock);
1378         if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1379                 spin_unlock(&block_group->lock);
1380                 return 0;
1381         }
1382         spin_unlock(&block_group->lock);
1383
1384         inode = lookup_free_space_inode(root, block_group, path);
1385         if (IS_ERR(inode))
1386                 return 0;
1387
1388         ret = __btrfs_write_out_cache(root, inode, ctl, block_group,
1389                                       &block_group->io_ctl, trans,
1390                                       path, block_group->key.objectid);
1391         if (ret) {
1392 #ifdef DEBUG
1393                 btrfs_err(root->fs_info,
1394                         "failed to write free space cache for block group %llu",
1395                         block_group->key.objectid);
1396 #endif
1397                 spin_lock(&block_group->lock);
1398                 block_group->disk_cache_state = BTRFS_DC_ERROR;
1399                 spin_unlock(&block_group->lock);
1400
1401                 block_group->io_ctl.inode = NULL;
1402                 iput(inode);
1403         }
1404
1405         /*
1406          * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1407          * to wait for IO and put the inode
1408          */
1409
1410         return ret;
1411 }
1412
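/*
 * Bitmap geometry helpers.  Offsets are converted to bit indices in units of
 * ctl->unit bytes (the sectorsize for block group caches), and
 * offset_to_bitmap() rounds an offset down to the start of the bitmap that
 * covers it.
 */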
1413 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1414                                           u64 offset)
1415 {
1416         ASSERT(offset >= bitmap_start);
1417         offset -= bitmap_start;
1418         return (unsigned long)(div_u64(offset, unit));
1419 }
1420
1421 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1422 {
1423         return (unsigned long)(div_u64(bytes, unit));
1424 }
1425
1426 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1427                                    u64 offset)
1428 {
1429         u64 bitmap_start;
1430         u64 bytes_per_bitmap;
1431
1432         bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1433         bitmap_start = offset - ctl->start;
1434         bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1435         bitmap_start *= bytes_per_bitmap;
1436         bitmap_start += ctl->start;
1437
1438         return bitmap_start;
1439 }
1440
1441 static int tree_insert_offset(struct rb_root *root, u64 offset,
1442                               struct rb_node *node, int bitmap)
1443 {
1444         struct rb_node **p = &root->rb_node;
1445         struct rb_node *parent = NULL;
1446         struct btrfs_free_space *info;
1447
1448         while (*p) {
1449                 parent = *p;
1450                 info = rb_entry(parent, struct btrfs_free_space, offset_index);
1451
1452                 if (offset < info->offset) {
1453                         p = &(*p)->rb_left;
1454                 } else if (offset > info->offset) {
1455                         p = &(*p)->rb_right;
1456                 } else {
1457                         /*
1458                          * we could have a bitmap entry and an extent entry
1459                          * share the same offset.  If this is the case, we want
1460                          * the extent entry to always be found first if we do a
1461                          * linear search through the tree, since we want to have
1462                          * the quickest allocation time, and allocating from an
1463                          * extent is faster than allocating from a bitmap.  So
1464                          * if we're inserting a bitmap and we find an entry at
1465                          * this offset, we want to go right, or after this entry
1466                          * logically.  If we are inserting an extent and we've
1467                          * found a bitmap, we want to go left, or before
1468                          * logically.
1469                          */
1470                         if (bitmap) {
1471                                 if (info->bitmap) {
1472                                         WARN_ON_ONCE(1);
1473                                         return -EEXIST;
1474                                 }
1475                                 p = &(*p)->rb_right;
1476                         } else {
1477                                 if (!info->bitmap) {
1478                                         WARN_ON_ONCE(1);
1479                                         return -EEXIST;
1480                                 }
1481                                 p = &(*p)->rb_left;
1482                         }
1483                 }
1484         }
1485
1486         rb_link_node(node, parent, p);
1487         rb_insert_color(node, root);
1488
1489         return 0;
1490 }
1491
1492 /*
1493  * searches the tree for the given offset.
1494  *
1495  * fuzzy - If this is set, then we are trying to make an allocation, and we just
1496  * want a section that is at least 'bytes' in size and comes at or after the given
1497  * offset.
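 *
 * A summary of the lookup modes implemented below:
 *   bitmap_only - return the bitmap entry whose offset is exactly 'offset'
 *                 (skipping an extent entry that shares the offset), or NULL.
 *   exact       - return the entry (extent or bitmap) whose range covers
 *                 'offset', or NULL.
 *   fuzzy       - as above, but if nothing covers 'offset', return the next
 *                 entry whose range extends past it, if any.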
1498  */
1499 static struct btrfs_free_space *
1500 tree_search_offset(struct btrfs_free_space_ctl *ctl,
1501                    u64 offset, int bitmap_only, int fuzzy)
1502 {
1503         struct rb_node *n = ctl->free_space_offset.rb_node;
1504         struct btrfs_free_space *entry, *prev = NULL;
1505
1506         /* find entry that is closest to the 'offset' */
1507         while (1) {
1508                 if (!n) {
1509                         entry = NULL;
1510                         break;
1511                 }
1512
1513                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1514                 prev = entry;
1515
1516                 if (offset < entry->offset)
1517                         n = n->rb_left;
1518                 else if (offset > entry->offset)
1519                         n = n->rb_right;
1520                 else
1521                         break;
1522         }
1523
1524         if (bitmap_only) {
1525                 if (!entry)
1526                         return NULL;
1527                 if (entry->bitmap)
1528                         return entry;
1529
1530                 /*
1531                  * a bitmap entry and an extent entry may share the same offset;
1532                  * in that case, the bitmap entry comes after the extent entry.
1533                  */
1534                 n = rb_next(n);
1535                 if (!n)
1536                         return NULL;
1537                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1538                 if (entry->offset != offset)
1539                         return NULL;
1540
1541                 WARN_ON(!entry->bitmap);
1542                 return entry;
1543         } else if (entry) {
1544                 if (entry->bitmap) {
1545                         /*
1546                          * if previous extent entry covers the offset,
1547                          * we should return it instead of the bitmap entry
1548                          */
1549                         n = rb_prev(&entry->offset_index);
1550                         if (n) {
1551                                 prev = rb_entry(n, struct btrfs_free_space,
1552                                                 offset_index);
1553                                 if (!prev->bitmap &&
1554                                     prev->offset + prev->bytes > offset)
1555                                         entry = prev;
1556                         }
1557                 }
1558                 return entry;
1559         }
1560
1561         if (!prev)
1562                 return NULL;
1563
1564         /* find last entry before the 'offset' */
1565         entry = prev;
1566         if (entry->offset > offset) {
1567                 n = rb_prev(&entry->offset_index);
1568                 if (n) {
1569                         entry = rb_entry(n, struct btrfs_free_space,
1570                                         offset_index);
1571                         ASSERT(entry->offset <= offset);
1572                 } else {
1573                         if (fuzzy)
1574                                 return entry;
1575                         else
1576                                 return NULL;
1577                 }
1578         }
1579
1580         if (entry->bitmap) {
1581                 n = rb_prev(&entry->offset_index);
1582                 if (n) {
1583                         prev = rb_entry(n, struct btrfs_free_space,
1584                                         offset_index);
1585                         if (!prev->bitmap &&
1586                             prev->offset + prev->bytes > offset)
1587                                 return prev;
1588                 }
1589                 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1590                         return entry;
1591         } else if (entry->offset + entry->bytes > offset)
1592                 return entry;
1593
1594         if (!fuzzy)
1595                 return NULL;
1596
1597         while (1) {
1598                 if (entry->bitmap) {
1599                         if (entry->offset + BITS_PER_BITMAP *
1600                             ctl->unit > offset)
1601                                 break;
1602                 } else {
1603                         if (entry->offset + entry->bytes > offset)
1604                                 break;
1605                 }
1606
1607                 n = rb_next(&entry->offset_index);
1608                 if (!n)
1609                         return NULL;
1610                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1611         }
1612         return entry;
1613 }
1614
1615 static inline void
1616 __unlink_free_space(struct btrfs_free_space_ctl *ctl,
1617                     struct btrfs_free_space *info)
1618 {
1619         rb_erase(&info->offset_index, &ctl->free_space_offset);
1620         ctl->free_extents--;
1621 }
1622
1623 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1624                               struct btrfs_free_space *info)
1625 {
1626         __unlink_free_space(ctl, info);
1627         ctl->free_space -= info->bytes;
1628 }
1629
1630 static int link_free_space(struct btrfs_free_space_ctl *ctl,
1631                            struct btrfs_free_space *info)
1632 {
1633         int ret = 0;
1634
1635         ASSERT(info->bytes || info->bitmap);
1636         ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1637                                  &info->offset_index, (info->bitmap != NULL));
1638         if (ret)
1639                 return ret;
1640
1641         ctl->free_space += info->bytes;
1642         ctl->free_extents++;
1643         return ret;
1644 }
1645
1646 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1647 {
1648         struct btrfs_block_group_cache *block_group = ctl->private;
1649         u64 max_bytes;
1650         u64 bitmap_bytes;
1651         u64 extent_bytes;
1652         u64 size = block_group->key.offset;
1653         u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1654         u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1655
1656         max_bitmaps = max_t(u64, max_bitmaps, 1);
1657
1658         ASSERT(ctl->total_bitmaps <= max_bitmaps);
1659
1660         /*
1661          * The goal is to keep the total amount of memory used per 1GiB of space
1662          * at or below 32KiB, so we need to adjust how much memory we allow to be
1663          * used by extent-based free space tracking
1664          */
1665         if (size < SZ_1G)
1666                 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1667         else
1668                 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
1669
1670         /*
1671          * we want to account for 1 more bitmap than what we have so we can make
1672          * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
1673          * we add more bitmaps.
1674          */
1675         bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit;
1676
1677         if (bitmap_bytes >= max_bytes) {
1678                 ctl->extents_thresh = 0;
1679                 return;
1680         }
1681
1682         /*
1683          * we want the extent entry threshold to be whatever is left after the
1684          * bitmaps, capped at 1/2 of the max bytes we can have.
1685          */
1686         extent_bytes = max_bytes - bitmap_bytes;
1687         extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
1688
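        /*
         * A sketch of the math above for a fresh 1GiB block group with a
         * 4KiB unit and no bitmaps yet:
         *
         *   max_bytes    = 32KiB (MAX_CACHE_BYTES_PER_GIG)
         *   bitmap_bytes = (0 + 1) * 4KiB = 4KiB
         *   extent_bytes = min(32KiB - 4KiB, 32KiB / 2) = 16KiB
         *
         * so roughly 16KiB worth of struct btrfs_free_space entries are
         * allowed before we start preferring bitmaps.
         */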
1689         ctl->extents_thresh =
1690                 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
1691 }
1692
1693 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1694                                        struct btrfs_free_space *info,
1695                                        u64 offset, u64 bytes)
1696 {
1697         unsigned long start, count;
1698
1699         start = offset_to_bit(info->offset, ctl->unit, offset);
1700         count = bytes_to_bits(bytes, ctl->unit);
1701         ASSERT(start + count <= BITS_PER_BITMAP);
1702
1703         bitmap_clear(info->bitmap, start, count);
1704
1705         info->bytes -= bytes;
1706         if (info->max_extent_size > ctl->unit)
1707                 info->max_extent_size = 0;
1708 }
1709
1710 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1711                               struct btrfs_free_space *info, u64 offset,
1712                               u64 bytes)
1713 {
1714         __bitmap_clear_bits(ctl, info, offset, bytes);
1715         ctl->free_space -= bytes;
1716 }
1717
1718 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1719                             struct btrfs_free_space *info, u64 offset,
1720                             u64 bytes)
1721 {
1722         unsigned long start, count;
1723
1724         start = offset_to_bit(info->offset, ctl->unit, offset);
1725         count = bytes_to_bits(bytes, ctl->unit);
1726         ASSERT(start + count <= BITS_PER_BITMAP);
1727
1728         bitmap_set(info->bitmap, start, count);
1729
1730         info->bytes += bytes;
1731         ctl->free_space += bytes;
1732 }
1733
1734 /*
1735  * If we cannot find a suitable extent, we use *bytes to record
1736  * the size of the largest free extent we did find.
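 *
 * For example, a search for *bytes = 1MiB with a 4KiB unit looks for a run of
 * at least 256 contiguous set bits at or after *offset; on success *offset and
 * *bytes describe the run that was found, on failure *bytes reports the
 * largest run seen, which is also cached in max_extent_size.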
1737  */
1738 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1739                          struct btrfs_free_space *bitmap_info, u64 *offset,
1740                          u64 *bytes, bool for_alloc)
1741 {
1742         unsigned long found_bits = 0;
1743         unsigned long max_bits = 0;
1744         unsigned long bits, i;
1745         unsigned long next_zero;
1746         unsigned long extent_bits;
1747
1748         /*
1749          * Skip searching the bitmap if we don't have a contiguous section that
1750          * is large enough for this allocation.
1751          */
1752         if (for_alloc &&
1753             bitmap_info->max_extent_size &&
1754             bitmap_info->max_extent_size < *bytes) {
1755                 *bytes = bitmap_info->max_extent_size;
1756                 return -1;
1757         }
1758
1759         i = offset_to_bit(bitmap_info->offset, ctl->unit,
1760                           max_t(u64, *offset, bitmap_info->offset));
1761         bits = bytes_to_bits(*bytes, ctl->unit);
1762
1763         for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1764                 if (for_alloc && bits == 1) {
1765                         found_bits = 1;
1766                         break;
1767                 }
1768                 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1769                                                BITS_PER_BITMAP, i);
1770                 extent_bits = next_zero - i;
1771                 if (extent_bits >= bits) {
1772                         found_bits = extent_bits;
1773                         break;
1774                 } else if (extent_bits > max_bits) {
1775                         max_bits = extent_bits;
1776                 }
1777                 i = next_zero;
1778         }
1779
1780         if (found_bits) {
1781                 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1782                 *bytes = (u64)(found_bits) * ctl->unit;
1783                 return 0;
1784         }
1785
1786         *bytes = (u64)(max_bits) * ctl->unit;
1787         bitmap_info->max_extent_size = *bytes;
1788         return -1;
1789 }
1790
1791 static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
1792 {
1793         if (entry->bitmap)
1794                 return entry->max_extent_size;
1795         return entry->bytes;
1796 }
1797
1798 /* Cache the size of the max extent in bytes */
1799 static struct btrfs_free_space *
1800 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1801                 unsigned long align, u64 *max_extent_size)
1802 {
1803         struct btrfs_free_space *entry;
1804         struct rb_node *node;
1805         u64 tmp;
1806         u64 align_off;
1807         int ret;
1808
1809         if (!ctl->free_space_offset.rb_node)
1810                 goto out;
1811
1812         entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1813         if (!entry)
1814                 goto out;
1815
1816         for (node = &entry->offset_index; node; node = rb_next(node)) {
1817                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1818                 if (entry->bytes < *bytes) {
1819                         *max_extent_size = max(get_max_extent_size(entry),
1820                                                *max_extent_size);
1821                         continue;
1822                 }
1823
1824                 /* make sure the space returned is big enough
1825                  * to match our requested alignment
1826                  */
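                /*
                 * Hypothetical numbers for the round-up below: with
                 * ctl->start = 0, align = 64KiB and entry->offset = 100KiB,
                 * tmp becomes 128KiB and align_off = 28KiB, so the entry must
                 * hold at least *bytes + 28KiB to be usable.
                 */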
1827                 if (*bytes >= align) {
1828                         tmp = entry->offset - ctl->start + align - 1;
1829                         tmp = div64_u64(tmp, align);
1830                         tmp = tmp * align + ctl->start;
1831                         align_off = tmp - entry->offset;
1832                 } else {
1833                         align_off = 0;
1834                         tmp = entry->offset;
1835                 }
1836
1837                 if (entry->bytes < *bytes + align_off) {
1838                         *max_extent_size = max(get_max_extent_size(entry),
1839                                                *max_extent_size);
1840                         continue;
1841                 }
1842
1843                 if (entry->bitmap) {
1844                         u64 size = *bytes;
1845
1846                         ret = search_bitmap(ctl, entry, &tmp, &size, true);
1847                         if (!ret) {
1848                                 *offset = tmp;
1849                                 *bytes = size;
1850                                 return entry;
1851                         } else {
1852                                 *max_extent_size =
1853                                         max(get_max_extent_size(entry),
1854                                             *max_extent_size);
1855                         }
1856                         continue;
1857                 }
1858
1859                 *offset = tmp;
1860                 *bytes = entry->bytes - align_off;
1861                 return entry;
1862         }
1863 out:
1864         return NULL;
1865 }
1866
1867 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1868                            struct btrfs_free_space *info, u64 offset)
1869 {
1870         info->offset = offset_to_bitmap(ctl, offset);
1871         info->bytes = 0;
1872         INIT_LIST_HEAD(&info->list);
1873         link_free_space(ctl, info);
1874         ctl->total_bitmaps++;
1875
1876         ctl->op->recalc_thresholds(ctl);
1877 }
1878
1879 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1880                         struct btrfs_free_space *bitmap_info)
1881 {
1882         unlink_free_space(ctl, bitmap_info);
1883         kfree(bitmap_info->bitmap);
1884         kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1885         ctl->total_bitmaps--;
1886         ctl->op->recalc_thresholds(ctl);
1887 }
1888
1889 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1890                               struct btrfs_free_space *bitmap_info,
1891                               u64 *offset, u64 *bytes)
1892 {
1893         u64 end;
1894         u64 search_start, search_bytes;
1895         int ret;
1896
1897 again:
1898         end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1899
1900         /*
1901          * We need to search for bits in this bitmap.  We could only cover some
1902          * of the extent in this bitmap thanks to how we add space, so we need
1903          * to search for as much of it as we can and clear that amount, and then
1904          * go searching for the next bit.
1905          */
1906         search_start = *offset;
1907         search_bytes = ctl->unit;
1908         search_bytes = min(search_bytes, end - search_start + 1);
1909         ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
1910                             false);
1911         if (ret < 0 || search_start != *offset)
1912                 return -EINVAL;
1913
1914         /* We may have found more bits than what we need */
1915         search_bytes = min(search_bytes, *bytes);
1916
1917         /* Cannot clear past the end of the bitmap */
1918         search_bytes = min(search_bytes, end - search_start + 1);
1919
1920         bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
1921         *offset += search_bytes;
1922         *bytes -= search_bytes;
1923
1924         if (*bytes) {
1925                 struct rb_node *next = rb_next(&bitmap_info->offset_index);
1926                 if (!bitmap_info->bytes)
1927                         free_bitmap(ctl, bitmap_info);
1928
1929                 /*
1930                  * no entry after this bitmap, but we still have bytes to
1931                  * remove, so something has gone wrong.
1932                  */
1933                 if (!next)
1934                         return -EINVAL;
1935
1936                 bitmap_info = rb_entry(next, struct btrfs_free_space,
1937                                        offset_index);
1938
1939                 /*
1940                  * if the next entry isn't a bitmap we need to return to let the
1941                  * extent stuff do its work.
1942                  */
1943                 if (!bitmap_info->bitmap)
1944                         return -EAGAIN;
1945
1946                 /*
1947                  * Ok the next item is a bitmap, but it may not actually hold
1948                  * the information for the rest of this free space stuff, so
1949                  * look for it, and if we don't find it return so we can try
1950                  * everything over again.
1951                  */
1952                 search_start = *offset;
1953                 search_bytes = ctl->unit;
1954                 ret = search_bitmap(ctl, bitmap_info, &search_start,
1955                                     &search_bytes, false);
1956                 if (ret < 0 || search_start != *offset)
1957                         return -EAGAIN;
1958
1959                 goto again;
1960         } else if (!bitmap_info->bytes)
1961                 free_bitmap(ctl, bitmap_info);
1962
1963         return 0;
1964 }
1965
1966 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1967                                struct btrfs_free_space *info, u64 offset,
1968                                u64 bytes)
1969 {
1970         u64 bytes_to_set = 0;
1971         u64 end;
1972
1973         end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1974
1975         bytes_to_set = min(end - offset, bytes);
1976
1977         bitmap_set_bits(ctl, info, offset, bytes_to_set);
1978
1979         /*
1980          * We set some bytes, so we no longer have any idea what the max
1981          * extent size is.
1982          */
1983         info->max_extent_size = 0;
1984
1985         return bytes_to_set;
1986
1987 }
1988
1989 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1990                       struct btrfs_free_space *info)
1991 {
1992         struct btrfs_block_group_cache *block_group = ctl->private;
1993         bool forced = false;
1994
1995 #ifdef CONFIG_BTRFS_DEBUG
1996         if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root,
1997                                              block_group))
1998                 forced = true;
1999 #endif
2000
2001         /*
2002          * If we are below the extents threshold then we can add this as an
2003          * extent, and don't have to deal with the bitmap
2004          */
2005         if (!forced && ctl->free_extents < ctl->extents_thresh) {
2006                 /*
2007                  * If this block group has some small extents we don't want to
2008                  * use up all of our free slots in the cache with them, we want
2009                  * to reserve them to larger extents, however if we have plenty
2010                  * to reserve them for larger extents, however if we have plenty
2011                  * of cache left then go ahead and add them, no sense in adding
2012                  */
2013                 if (info->bytes <= block_group->sectorsize * 4) {
2014                         if (ctl->free_extents * 2 <= ctl->extents_thresh)
2015                                 return false;
2016                 } else {
2017                         return false;
2018                 }
2019         }
2020
2021         /*
2022          * The original block groups from mkfs can be really small, like 8
2023          * megabytes, so don't bother with a bitmap for those entries.  However
2024          * some block groups can be smaller than what a bitmap would cover but
2025          * are still large enough that they could overflow the 32k memory limit,
2026          * so allow those block groups to still be allowed to have a bitmap
2027          * so still allow those block groups to have a bitmap
2028          */
2029         if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
2030                 return false;
2031
2032         return true;
2033 }
2034
2035 static const struct btrfs_free_space_op free_space_op = {
2036         .recalc_thresholds      = recalculate_thresholds,
2037         .use_bitmap             = use_bitmap,
2038 };
2039
2040 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2041                               struct btrfs_free_space *info)
2042 {
2043         struct btrfs_free_space *bitmap_info;
2044         struct btrfs_block_group_cache *block_group = NULL;
2045         int added = 0;
2046         u64 bytes, offset, bytes_added;
2047         int ret;
2048
2049         bytes = info->bytes;
2050         offset = info->offset;
2051
2052         if (!ctl->op->use_bitmap(ctl, info))
2053                 return 0;
2054
2055         if (ctl->op == &free_space_op)
2056                 block_group = ctl->private;
2057 again:
2058         /*
2059          * Since we link bitmaps right into the cluster we need to see if we
2060          * have a cluster here, and if so and it has our bitmap we need to add
2061          * the free space to that bitmap.
2062          */
2063         if (block_group && !list_empty(&block_group->cluster_list)) {
2064                 struct btrfs_free_cluster *cluster;
2065                 struct rb_node *node;
2066                 struct btrfs_free_space *entry;
2067
2068                 cluster = list_entry(block_group->cluster_list.next,
2069                                      struct btrfs_free_cluster,
2070                                      block_group_list);
2071                 spin_lock(&cluster->lock);
2072                 node = rb_first(&cluster->root);
2073                 if (!node) {
2074                         spin_unlock(&cluster->lock);
2075                         goto no_cluster_bitmap;
2076                 }
2077
2078                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2079                 if (!entry->bitmap) {
2080                         spin_unlock(&cluster->lock);
2081                         goto no_cluster_bitmap;
2082                 }
2083
2084                 if (entry->offset == offset_to_bitmap(ctl, offset)) {
2085                         bytes_added = add_bytes_to_bitmap(ctl, entry,
2086                                                           offset, bytes);
2087                         bytes -= bytes_added;
2088                         offset += bytes_added;
2089                 }
2090                 spin_unlock(&cluster->lock);
2091                 if (!bytes) {
2092                         ret = 1;
2093                         goto out;
2094                 }
2095         }
2096
2097 no_cluster_bitmap:
2098         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2099                                          1, 0);
2100         if (!bitmap_info) {
2101                 ASSERT(added == 0);
2102                 goto new_bitmap;
2103         }
2104
2105         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
2106         bytes -= bytes_added;
2107         offset += bytes_added;
2108         added = 0;
2109
2110         if (!bytes) {
2111                 ret = 1;
2112                 goto out;
2113         } else
2114                 goto again;
2115
2116 new_bitmap:
2117         if (info && info->bitmap) {
2118                 add_new_bitmap(ctl, info, offset);
2119                 added = 1;
2120                 info = NULL;
2121                 goto again;
2122         } else {
2123                 spin_unlock(&ctl->tree_lock);
2124
2125                 /* no pre-allocated info, allocate a new one */
2126                 if (!info) {
2127                         info = kmem_cache_zalloc(btrfs_free_space_cachep,
2128                                                  GFP_NOFS);
2129                         if (!info) {
2130                                 spin_lock(&ctl->tree_lock);
2131                                 ret = -ENOMEM;
2132                                 goto out;
2133                         }
2134                 }
2135
2136                 /* allocate the bitmap */
2137                 info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
2138                 spin_lock(&ctl->tree_lock);
2139                 if (!info->bitmap) {
2140                         ret = -ENOMEM;
2141                         goto out;
2142                 }
2143                 goto again;
2144         }
2145
2146 out:
2147         if (info) {
2148                 if (info->bitmap)
2149                         kfree(info->bitmap);
2150                 kmem_cache_free(btrfs_free_space_cachep, info);
2151         }
2152
2153         return ret;
2154 }
2155
2156 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2157                           struct btrfs_free_space *info, bool update_stat)
2158 {
2159         struct btrfs_free_space *left_info = NULL;
2160         struct btrfs_free_space *right_info;
2161         bool merged = false;
2162         u64 offset = info->offset;
2163         u64 bytes = info->bytes;
2164
2165         /*
2166          * first we want to see if there is free space adjacent to the range we
2167          * are adding; if there is, remove that struct and add a new one to
2168          * cover the entire range
2169          */
2170         right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2171         if (right_info && rb_prev(&right_info->offset_index))
2172                 left_info = rb_entry(rb_prev(&right_info->offset_index),
2173                                      struct btrfs_free_space, offset_index);
2174         else if (!right_info)
2175                 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2176
2177         if (right_info && !right_info->bitmap) {
2178                 if (update_stat)
2179                         unlink_free_space(ctl, right_info);
2180                 else
2181                         __unlink_free_space(ctl, right_info);
2182                 info->bytes += right_info->bytes;
2183                 kmem_cache_free(btrfs_free_space_cachep, right_info);
2184                 merged = true;
2185         }
2186
2187         if (left_info && !left_info->bitmap &&
2188             left_info->offset + left_info->bytes == offset) {
2189                 if (update_stat)
2190                         unlink_free_space(ctl, left_info);
2191                 else
2192                         __unlink_free_space(ctl, left_info);
2193                 info->offset = left_info->offset;
2194                 info->bytes += left_info->bytes;
2195                 kmem_cache_free(btrfs_free_space_cachep, left_info);
2196                 merged = true;
2197         }
2198
2199         return merged;
2200 }
2201
2202 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2203                                      struct btrfs_free_space *info,
2204                                      bool update_stat)
2205 {
2206         struct btrfs_free_space *bitmap;
2207         unsigned long i;
2208         unsigned long j;
2209         const u64 end = info->offset + info->bytes;
2210         const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2211         u64 bytes;
2212
2213         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2214         if (!bitmap)
2215                 return false;
2216
2217         i = offset_to_bit(bitmap->offset, ctl->unit, end);
2218         j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2219         if (j == i)
2220                 return false;
2221         bytes = (j - i) * ctl->unit;
2222         info->bytes += bytes;
2223
2224         if (update_stat)
2225                 bitmap_clear_bits(ctl, bitmap, end, bytes);
2226         else
2227                 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2228
2229         if (!bitmap->bytes)
2230                 free_bitmap(ctl, bitmap);
2231
2232         return true;
2233 }
2234
2235 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2236                                        struct btrfs_free_space *info,
2237                                        bool update_stat)
2238 {
2239         struct btrfs_free_space *bitmap;
2240         u64 bitmap_offset;
2241         unsigned long i;
2242         unsigned long j;
2243         unsigned long prev_j;
2244         u64 bytes;
2245
2246         bitmap_offset = offset_to_bitmap(ctl, info->offset);
2247         /* If we're on a boundary, try the previous logical bitmap. */
2248         if (bitmap_offset == info->offset) {
2249                 if (info->offset == 0)
2250                         return false;
2251                 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2252         }
2253
2254         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2255         if (!bitmap)
2256                 return false;
2257
2258         i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2259         j = 0;
2260         prev_j = (unsigned long)-1;
2261         for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2262                 if (j > i)
2263                         break;
2264                 prev_j = j;
2265         }
2266         if (prev_j == i)
2267                 return false;
2268
2269         if (prev_j == (unsigned long)-1)
2270                 bytes = (i + 1) * ctl->unit;
2271         else
2272                 bytes = (i - prev_j) * ctl->unit;
2273
2274         info->offset -= bytes;
2275         info->bytes += bytes;
2276
2277         if (update_stat)
2278                 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2279         else
2280                 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2281
2282         if (!bitmap->bytes)
2283                 free_bitmap(ctl, bitmap);
2284
2285         return true;
2286 }
2287
2288 /*
2289  * We prefer always to allocate from extent entries, both for clustered and
2290  * non-clustered allocation requests. So when attempting to add a new extent
2291  * entry, try to see if there's adjacent free space in bitmap entries, and if
2292  * there is, migrate that space from the bitmaps to the extent.
2293  * Like this we get better chances of satisfying space allocation requests
2294  * because we attempt to satisfy them based on a single cache entry, and never
2295  * on 2 or more entries - even if the entries represent a contiguous free space
2296  * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2297  * ends).
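 *
 * For example, when adding a new extent entry [X, X + 64KiB) while a
 * neighbouring bitmap has bits set for [X + 64KiB, X + 96KiB) and for
 * [X - 32KiB, X), the helpers above clear those bits and the entry grows to
 * [X - 32KiB, X + 96KiB) before it is linked into the tree.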
2298  */
2299 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2300                               struct btrfs_free_space *info,
2301                               bool update_stat)
2302 {
2303         /*
2304          * Only work with disconnected entries, as we can change their offset,
2305          * and they must be extent entries.
2306          */
2307         ASSERT(!info->bitmap);
2308         ASSERT(RB_EMPTY_NODE(&info->offset_index));
2309
2310         if (ctl->total_bitmaps > 0) {
2311                 bool stole_end;
2312                 bool stole_front = false;
2313
2314                 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2315                 if (ctl->total_bitmaps > 0)
2316                         stole_front = steal_from_bitmap_to_front(ctl, info,
2317                                                                  update_stat);
2318
2319                 if (stole_end || stole_front)
2320                         try_merge_free_space(ctl, info, update_stat);
2321         }
2322 }
2323
2324 int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
2325                            struct btrfs_free_space_ctl *ctl,
2326                            u64 offset, u64 bytes)
2327 {
2328         struct btrfs_free_space *info;
2329         int ret = 0;
2330
2331         info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2332         if (!info)
2333                 return -ENOMEM;
2334
2335         info->offset = offset;
2336         info->bytes = bytes;
2337         RB_CLEAR_NODE(&info->offset_index);
2338
2339         spin_lock(&ctl->tree_lock);
2340
2341         if (try_merge_free_space(ctl, info, true))
2342                 goto link;
2343
2344         /*
2345          * There was no extent directly to the left or right of this new
2346          * extent, so we know we're going to have to allocate a new extent;
2347          * before we do that, see if we need to drop this into a bitmap
2348          */
2349         ret = insert_into_bitmap(ctl, info);
2350         if (ret < 0) {
2351                 goto out;
2352         } else if (ret) {
2353                 ret = 0;
2354                 goto out;
2355         }
2356 link:
2357         /*
2358          * Only steal free space from adjacent bitmaps if we're sure we're not
2359          * going to add the new free space to existing bitmap entries - because
2360          * that would mean unnecessary work that would be reverted. Therefore
2361          * attempt to steal space from bitmaps if we're adding an extent entry.
2362          */
2363         steal_from_bitmap(ctl, info, true);
2364
2365         ret = link_free_space(ctl, info);
2366         if (ret)
2367                 kmem_cache_free(btrfs_free_space_cachep, info);
2368 out:
2369         spin_unlock(&ctl->tree_lock);
2370
2371         if (ret) {
2372                 btrfs_crit(fs_info, "unable to add free space: %d", ret);
2373                 ASSERT(ret != -EEXIST);
2374         }
2375
2376         return ret;
2377 }
2378
2379 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2380                             u64 offset, u64 bytes)
2381 {
2382         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2383         struct btrfs_free_space *info;
2384         int ret;
2385         bool re_search = false;
2386
2387         spin_lock(&ctl->tree_lock);
2388
2389 again:
2390         ret = 0;
2391         if (!bytes)
2392                 goto out_lock;
2393
2394         info = tree_search_offset(ctl, offset, 0, 0);
2395         if (!info) {
2396                 /*
2397                  * oops didn't find an extent that matched the space we wanted
2398                  * to remove, look for a bitmap instead
2399                  */
2400                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2401                                           1, 0);
2402                 if (!info) {
2403                         /*
2404                          * If we found a partial bit of our free space in a
2405                          * bitmap but then couldn't find the other part this may
2406                          * be a problem, so WARN about it.
2407                          */
2408                         WARN_ON(re_search);
2409                         goto out_lock;
2410                 }
2411         }
2412
2413         re_search = false;
2414         if (!info->bitmap) {
2415                 unlink_free_space(ctl, info);
2416                 if (offset == info->offset) {
2417                         u64 to_free = min(bytes, info->bytes);
2418
2419                         info->bytes -= to_free;
2420                         info->offset += to_free;
2421                         if (info->bytes) {
2422                                 ret = link_free_space(ctl, info);
2423                                 WARN_ON(ret);
2424                         } else {
2425                                 kmem_cache_free(btrfs_free_space_cachep, info);
2426                         }
2427
2428                         offset += to_free;
2429                         bytes -= to_free;
2430                         goto again;
2431                 } else {
2432                         u64 old_end = info->bytes + info->offset;
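                        /*
                         * The range being removed starts inside this extent
                         * entry: keep the front part [info->offset, offset),
                         * and re-add any tail past offset + bytes further
                         * below.
                         */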
2433
2434                         info->bytes = offset - info->offset;
2435                         ret = link_free_space(ctl, info);
2436                         WARN_ON(ret);
2437                         if (ret)
2438                                 goto out_lock;
2439
2440                         /* Not enough bytes in this entry to satisfy us */
2441                         if (old_end < offset + bytes) {
2442                                 bytes -= old_end - offset;
2443                                 offset = old_end;
2444                                 goto again;
2445                         } else if (old_end == offset + bytes) {
2446                                 /* all done */
2447                                 goto out_lock;
2448                         }
2449                         spin_unlock(&ctl->tree_lock);
2450
2451                         ret = btrfs_add_free_space(block_group, offset + bytes,
2452                                                    old_end - (offset + bytes));
2453                         WARN_ON(ret);
2454                         goto out;
2455                 }
2456         }
2457
2458         ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2459         if (ret == -EAGAIN) {
2460                 re_search = true;
2461                 goto again;
2462         }
2463 out_lock:
2464         spin_unlock(&ctl->tree_lock);
2465 out:
2466         return ret;
2467 }
2468
2469 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2470                            u64 bytes)
2471 {
2472         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2473         struct btrfs_free_space *info;
2474         struct rb_node *n;
2475         int count = 0;
2476
2477         spin_lock(&ctl->tree_lock);
2478         for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2479                 info = rb_entry(n, struct btrfs_free_space, offset_index);
2480                 if (info->bytes >= bytes && !block_group->ro)
2481                         count++;
2482                 btrfs_crit(block_group->fs_info,
2483                            "entry offset %llu, bytes %llu, bitmap %s",
2484                            info->offset, info->bytes,
2485                        (info->bitmap) ? "yes" : "no");
2486         }
2487         spin_unlock(&ctl->tree_lock);
2488         btrfs_info(block_group->fs_info, "block group has cluster?: %s",
2489                list_empty(&block_group->cluster_list) ? "no" : "yes");
2490         btrfs_info(block_group->fs_info,
2491                    "%d blocks of free space at or bigger than %llu bytes", count, bytes);
2492 }
2493
2494 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
2495 {
2496         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2497
2498         spin_lock_init(&ctl->tree_lock);
2499         ctl->unit = block_group->sectorsize;
2500         ctl->start = block_group->key.objectid;
2501         ctl->private = block_group;
2502         ctl->op = &free_space_op;
2503         INIT_LIST_HEAD(&ctl->trimming_ranges);
2504         mutex_init(&ctl->cache_writeout_mutex);
2505
2506         /*
2507          * we only want to have 32KiB of RAM per block group for keeping
2508          * track of free space, and if we pass 1/2 of that we want to
2509          * start converting things over to using bitmaps
2510          */
2511         ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2512 }
2513
2514 /*
2515  * for a given cluster, put all of its extents back into the free
2516  * space cache.  If the block group passed doesn't match the block group
2517  * pointed to by the cluster, someone else raced in and freed the
2518  * cluster already.  In that case, we just return without changing anything
2519  */
2520 static int
2521 __btrfs_return_cluster_to_free_space(
2522                              struct btrfs_block_group_cache *block_group,
2523                              struct btrfs_free_cluster *cluster)
2524 {
2525         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2526         struct btrfs_free_space *entry;
2527         struct rb_node *node;
2528
2529         spin_lock(&cluster->lock);
2530         if (cluster->block_group != block_group)
2531                 goto out;
2532
2533         cluster->block_group = NULL;
2534         cluster->window_start = 0;
2535         list_del_init(&cluster->block_group_list);
2536
2537         node = rb_first(&cluster->root);
2538         while (node) {
2539                 bool bitmap;
2540
2541                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2542                 node = rb_next(&entry->offset_index);
2543                 rb_erase(&entry->offset_index, &cluster->root);
2544                 RB_CLEAR_NODE(&entry->offset_index);
2545
2546                 bitmap = (entry->bitmap != NULL);
2547                 if (!bitmap) {
2548                         try_merge_free_space(ctl, entry, false);
2549                         steal_from_bitmap(ctl, entry, false);
2550                 }
2551                 tree_insert_offset(&ctl->free_space_offset,
2552                                    entry->offset, &entry->offset_index, bitmap);
2553         }
2554         cluster->root = RB_ROOT;
2555
2556 out:
2557         spin_unlock(&cluster->lock);
2558         btrfs_put_block_group(block_group);
2559         return 0;
2560 }
2561
2562 static void __btrfs_remove_free_space_cache_locked(
2563                                 struct btrfs_free_space_ctl *ctl)
2564 {
2565         struct btrfs_free_space *info;
2566         struct rb_node *node;
2567
2568         while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2569                 info = rb_entry(node, struct btrfs_free_space, offset_index);
2570                 if (!info->bitmap) {
2571                         unlink_free_space(ctl, info);
2572                         kmem_cache_free(btrfs_free_space_cachep, info);
2573                 } else {
2574                         free_bitmap(ctl, info);
2575                 }
2576
2577                 cond_resched_lock(&ctl->tree_lock);
2578         }
2579 }
2580
2581 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2582 {
2583         spin_lock(&ctl->tree_lock);
2584         __btrfs_remove_free_space_cache_locked(ctl);
2585         spin_unlock(&ctl->tree_lock);
2586 }
2587
2588 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
2589 {
2590         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2591         struct btrfs_free_cluster *cluster;
2592         struct list_head *head;
2593
2594         spin_lock(&ctl->tree_lock);
2595         while ((head = block_group->cluster_list.next) !=
2596                &block_group->cluster_list) {
2597                 cluster = list_entry(head, struct btrfs_free_cluster,
2598                                      block_group_list);
2599
2600                 WARN_ON(cluster->block_group != block_group);
2601                 __btrfs_return_cluster_to_free_space(block_group, cluster);
2602
2603                 cond_resched_lock(&ctl->tree_lock);
2604         }
2605         __btrfs_remove_free_space_cache_locked(ctl);
2606         spin_unlock(&ctl->tree_lock);
2607
2608 }
2609
2610 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2611                                u64 offset, u64 bytes, u64 empty_size,
2612                                u64 *max_extent_size)
2613 {
2614         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2615         struct btrfs_free_space *entry = NULL;
2616         u64 bytes_search = bytes + empty_size;
2617         u64 ret = 0;
2618         u64 align_gap = 0;
2619         u64 align_gap_len = 0;
2620
2621         spin_lock(&ctl->tree_lock);
2622         entry = find_free_space(ctl, &offset, &bytes_search,
2623                                 block_group->full_stripe_len, max_extent_size);
2624         if (!entry)
2625                 goto out;
2626
2627         ret = offset;
2628         if (entry->bitmap) {
2629                 bitmap_clear_bits(ctl, entry, offset, bytes);
2630                 if (!entry->bytes)
2631                         free_bitmap(ctl, entry);
2632         } else {
2633                 unlink_free_space(ctl, entry);
2634                 align_gap_len = offset - entry->offset;
2635                 align_gap = entry->offset;
2636
2637                 entry->offset = offset + bytes;
2638                 WARN_ON(entry->bytes < bytes + align_gap_len);
2639
2640                 entry->bytes -= bytes + align_gap_len;
2641                 if (!entry->bytes)
2642                         kmem_cache_free(btrfs_free_space_cachep, entry);
2643                 else
2644                         link_free_space(ctl, entry);
2645         }
2646 out:
2647         spin_unlock(&ctl->tree_lock);
2648
2649         if (align_gap_len)
2650                 __btrfs_add_free_space(block_group->fs_info, ctl,
2651                                        align_gap, align_gap_len);
2652         return ret;
2653 }
2654
2655 /*
2656  * given a cluster, put all of its extents back into the free space
2657  * cache.  If a block group is passed, this function will only free
2658  * a cluster that belongs to the passed block group.
2659  *
2660  * Otherwise, it'll get a reference on the block group pointed to by the
2661  * cluster and remove the cluster from it.
2662  */
2663 int btrfs_return_cluster_to_free_space(
2664                                struct btrfs_block_group_cache *block_group,
2665                                struct btrfs_free_cluster *cluster)
2666 {
2667         struct btrfs_free_space_ctl *ctl;
2668         int ret;
2669
2670         /* first, get a safe pointer to the block group */
2671         spin_lock(&cluster->lock);
2672         if (!block_group) {
2673                 block_group = cluster->block_group;
2674                 if (!block_group) {
2675                         spin_unlock(&cluster->lock);
2676                         return 0;
2677                 }
2678         } else if (cluster->block_group != block_group) {
2679                 /* someone else has already freed it, don't redo their work */
2680                 spin_unlock(&cluster->lock);
2681                 return 0;
2682         }
2683         atomic_inc(&block_group->count);
2684         spin_unlock(&cluster->lock);
2685
2686         ctl = block_group->free_space_ctl;
2687
2688         /* now return any extents the cluster had on it */
2689         spin_lock(&ctl->tree_lock);
2690         ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
2691         spin_unlock(&ctl->tree_lock);
2692
2693         /* finally drop our ref */
2694         btrfs_put_block_group(block_group);
2695         return ret;
2696 }
2697
2698 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
2699                                    struct btrfs_free_cluster *cluster,
2700                                    struct btrfs_free_space *entry,
2701                                    u64 bytes, u64 min_start,
2702                                    u64 *max_extent_size)
2703 {
2704         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2705         int err;
2706         u64 search_start = cluster->window_start;
2707         u64 search_bytes = bytes;
2708         u64 ret = 0;
2709
2710         search_start = min_start;
2711         search_bytes = bytes;
2712
2713         err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
2714         if (err) {
2715                 *max_extent_size = max(get_max_extent_size(entry),
2716                                        *max_extent_size);
2717                 return 0;
2718         }
2719
2720         ret = search_start;
2721         __bitmap_clear_bits(ctl, entry, ret, bytes);
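        /*
         * Note: only the bits are cleared here; the ctl->free_space
         * accounting for these bytes is done by the caller,
         * btrfs_alloc_from_cluster(), under ctl->tree_lock.
         */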
2722
2723         return ret;
2724 }
2725
2726 /*
2727  * given a cluster, try to allocate 'bytes' from it, returns 0
2728  * if it couldn't find anything suitably large, or a logical disk offset
2729  * if things worked out
2730  */
2731 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2732                              struct btrfs_free_cluster *cluster, u64 bytes,
2733                              u64 min_start, u64 *max_extent_size)
2734 {
2735         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2736         struct btrfs_free_space *entry = NULL;
2737         struct rb_node *node;
2738         u64 ret = 0;
2739
2740         spin_lock(&cluster->lock);
2741         if (bytes > cluster->max_size)
2742                 goto out;
2743
2744         if (cluster->block_group != block_group)
2745                 goto out;
2746
2747         node = rb_first(&cluster->root);
2748         if (!node)
2749                 goto out;
2750
2751         entry = rb_entry(node, struct btrfs_free_space, offset_index);
2752         while (1) {
2753                 if (entry->bytes < bytes)
2754                         *max_extent_size = max(get_max_extent_size(entry),
2755                                                *max_extent_size);
2756
2757                 if (entry->bytes < bytes ||
2758                     (!entry->bitmap && entry->offset < min_start)) {
2759                         node = rb_next(&entry->offset_index);
2760                         if (!node)
2761                                 break;
2762                         entry = rb_entry(node, struct btrfs_free_space,
2763                                          offset_index);
2764                         continue;
2765                 }
2766
2767                 if (entry->bitmap) {
2768                         ret = btrfs_alloc_from_bitmap(block_group,
2769                                                       cluster, entry, bytes,
2770                                                       cluster->window_start,
2771                                                       max_extent_size);
2772                         if (ret == 0) {
2773                                 node = rb_next(&entry->offset_index);
2774                                 if (!node)
2775                                         break;
2776                                 entry = rb_entry(node, struct btrfs_free_space,
2777                                                  offset_index);
2778                                 continue;
2779                         }
2780                         cluster->window_start += bytes;
2781                 } else {
2782                         ret = entry->offset;
2783
2784                         entry->offset += bytes;
2785                         entry->bytes -= bytes;
2786                 }
2787
2788                 if (entry->bytes == 0)
2789                         rb_erase(&entry->offset_index, &cluster->root);
2790                 break;
2791         }
2792 out:
2793         spin_unlock(&cluster->lock);
2794
2795         if (!ret)
2796                 return 0;
2797
2798         spin_lock(&ctl->tree_lock);
2799
2800         ctl->free_space -= bytes;
2801         if (entry->bytes == 0) {
2802                 ctl->free_extents--;
2803                 if (entry->bitmap) {
2804                         kfree(entry->bitmap);
2805                         ctl->total_bitmaps--;
2806                         ctl->op->recalc_thresholds(ctl);
2807                 }
2808                 kmem_cache_free(btrfs_free_space_cachep, entry);
2809         }
2810
2811         spin_unlock(&ctl->tree_lock);
2812
2813         return ret;
2814 }
2815
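/*
 * Try to carve a cluster window out of a single bitmap entry: scan the bitmap
 * for runs of at least min_bytes, accumulating them until we have found at
 * least 'bytes' in total and one run of at least cont1_bytes, then move the
 * bitmap entry from the free space tree into the cluster's rbtree.
 */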
2816 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2817                                 struct btrfs_free_space *entry,
2818                                 struct btrfs_free_cluster *cluster,
2819                                 u64 offset, u64 bytes,
2820                                 u64 cont1_bytes, u64 min_bytes)
2821 {
2822         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2823         unsigned long next_zero;
2824         unsigned long i;
2825         unsigned long want_bits;
2826         unsigned long min_bits;
2827         unsigned long found_bits;
2828         unsigned long max_bits = 0;
2829         unsigned long start = 0;
2830         unsigned long total_found = 0;
2831         int ret;
2832
2833         i = offset_to_bit(entry->offset, ctl->unit,
2834                           max_t(u64, offset, entry->offset));
2835         want_bits = bytes_to_bits(bytes, ctl->unit);
2836         min_bits = bytes_to_bits(min_bytes, ctl->unit);
2837
2838         /*
2839          * Don't bother looking for a cluster in this bitmap if it's heavily
2840          * fragmented.
2841          */
2842         if (entry->max_extent_size &&
2843             entry->max_extent_size < cont1_bytes)
2844                 return -ENOSPC;
2845 again:
2846         found_bits = 0;
2847         for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2848                 next_zero = find_next_zero_bit(entry->bitmap,
2849                                                BITS_PER_BITMAP, i);
2850                 if (next_zero - i >= min_bits) {
2851                         found_bits = next_zero - i;
2852                         if (found_bits > max_bits)
2853                                 max_bits = found_bits;
2854                         break;
2855                 }
2856                 if (next_zero - i > max_bits)
2857                         max_bits = next_zero - i;
2858                 i = next_zero;
2859         }
2860
2861         if (!found_bits) {
2862                 entry->max_extent_size = (u64)max_bits * ctl->unit;
2863                 return -ENOSPC;
2864         }
2865
2866         if (!total_found) {
2867                 start = i;
2868                 cluster->max_size = 0;
2869         }
2870
2871         total_found += found_bits;
2872
2873         if (cluster->max_size < found_bits * ctl->unit)
2874                 cluster->max_size = found_bits * ctl->unit;
2875
2876         if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2877                 i = next_zero + 1;
2878                 goto again;
2879         }
2880
2881         cluster->window_start = start * ctl->unit + entry->offset;
2882         rb_erase(&entry->offset_index, &ctl->free_space_offset);
2883         ret = tree_insert_offset(&cluster->root, entry->offset,
2884                                  &entry->offset_index, 1);
2885         ASSERT(!ret); /* -EEXIST; Logic error */
2886
2887         trace_btrfs_setup_cluster(block_group, cluster,
2888                                   total_found * ctl->unit, 1);
2889         return 0;
2890 }
2891
2892 /*
2893  * This searches the block group for just extents to fill the cluster with.
2894  * Try to find a cluster with at least bytes total bytes, at least one
2895  * extent of cont1_bytes, and other extents of at least min_bytes.
2896  */
2897 static noinline int
2898 setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2899                         struct btrfs_free_cluster *cluster,
2900                         struct list_head *bitmaps, u64 offset, u64 bytes,
2901                         u64 cont1_bytes, u64 min_bytes)
2902 {
2903         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2904         struct btrfs_free_space *first = NULL;
2905         struct btrfs_free_space *entry = NULL;
2906         struct btrfs_free_space *last;
2907         struct rb_node *node;
2908         u64 window_free;
2909         u64 max_extent;
2910         u64 total_size = 0;
2911
2912         entry = tree_search_offset(ctl, offset, 0, 1);
2913         if (!entry)
2914                 return -ENOSPC;
2915
2916         /*
2917          * We don't want bitmaps, so just move along until we find a normal
2918          * extent entry.
2919          */
2920         while (entry->bitmap || entry->bytes < min_bytes) {
2921                 if (entry->bitmap && list_empty(&entry->list))
2922                         list_add_tail(&entry->list, bitmaps);
2923                 node = rb_next(&entry->offset_index);
2924                 if (!node)
2925                         return -ENOSPC;
2926                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2927         }
2928
2929         window_free = entry->bytes;
2930         max_extent = entry->bytes;
2931         first = entry;
2932         last = entry;
2933
2934         for (node = rb_next(&entry->offset_index); node;
2935              node = rb_next(&entry->offset_index)) {
2936                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2937
2938                 if (entry->bitmap) {
2939                         if (list_empty(&entry->list))
2940                                 list_add_tail(&entry->list, bitmaps);
2941                         continue;
2942                 }
2943
2944                 if (entry->bytes < min_bytes)
2945                         continue;
2946
2947                 last = entry;
2948                 window_free += entry->bytes;
2949                 if (entry->bytes > max_extent)
2950                         max_extent = entry->bytes;
2951         }
2952
2953         if (window_free < bytes || max_extent < cont1_bytes)
2954                 return -ENOSPC;
2955
2956         cluster->window_start = first->offset;
2957
2958         node = &first->offset_index;
2959
2960         /*
2961          * now we've found our entries, pull them out of the free space
2962          * cache and put them into the cluster rbtree
2963          */
2964         do {
2965                 int ret;
2966
2967                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2968                 node = rb_next(&entry->offset_index);
2969                 if (entry->bitmap || entry->bytes < min_bytes)
2970                         continue;
2971
2972                 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2973                 ret = tree_insert_offset(&cluster->root, entry->offset,
2974                                          &entry->offset_index, 0);
2975                 total_size += entry->bytes;
2976                 ASSERT(!ret); /* -EEXIST; Logic error */
2977         } while (node && entry != last);
2978
2979         cluster->max_size = max_extent;
2980         trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2981         return 0;
2982 }
2983
2984 /*
2985  * This specifically looks for bitmaps that may work in the cluster, we assume
2986  * that we have already failed to find extents that will work.
2987  */
2988 static noinline int
2989 setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2990                      struct btrfs_free_cluster *cluster,
2991                      struct list_head *bitmaps, u64 offset, u64 bytes,
2992                      u64 cont1_bytes, u64 min_bytes)
2993 {
2994         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2995         struct btrfs_free_space *entry = NULL;
2996         int ret = -ENOSPC;
2997         u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2998
2999         if (ctl->total_bitmaps == 0)
3000                 return -ENOSPC;
3001
3002         /*
3003          * The bitmap that covers offset won't be in the list unless offset
3004          * is just its start offset.
3005          */
3006         if (!list_empty(bitmaps))
3007                 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3008
3009         if (!entry || entry->offset != bitmap_offset) {
3010                 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3011                 if (entry && list_empty(&entry->list))
3012                         list_add(&entry->list, bitmaps);
3013         }
3014
3015         list_for_each_entry(entry, bitmaps, list) {
3016                 if (entry->bytes < bytes)
3017                         continue;
3018                 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3019                                            bytes, cont1_bytes, min_bytes);
3020                 if (!ret)
3021                         return 0;
3022         }
3023
3024         /*
3025          * The bitmaps list has all the bitmaps that record free space
3026          * starting after offset, so no more search is required.
3027          */
3028         return -ENOSPC;
3029 }
3030
3031 /*
3032  * Here we try to find a cluster of blocks in a block group.  The goal
3033  * is to find at least bytes + empty_size.
3034  * We might not find them all in one contiguous area.
3035  *
3036  * Returns zero and sets up the cluster if things worked out, otherwise
3037  * it returns -ENOSPC.
3038  */
3039 int btrfs_find_space_cluster(struct btrfs_root *root,
3040                              struct btrfs_block_group_cache *block_group,
3041                              struct btrfs_free_cluster *cluster,
3042                              u64 offset, u64 bytes, u64 empty_size)
3043 {
3044         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3045         struct btrfs_free_space *entry, *tmp;
3046         LIST_HEAD(bitmaps);
3047         u64 min_bytes;
3048         u64 cont1_bytes;
3049         int ret;
3050
3051         /*
3052          * Choose the minimum extent size we'll require for this
3053          * cluster.  For SSD_SPREAD, don't allow any fragmentation.
3054  * For metadata, allow allocations with smaller extents.  For
3055          * data, keep it dense.
3056          */
3057         if (btrfs_test_opt(root->fs_info, SSD_SPREAD)) {
3058                 cont1_bytes = min_bytes = bytes + empty_size;
3059         } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3060                 cont1_bytes = bytes;
3061                 min_bytes = block_group->sectorsize;
3062         } else {
3063                 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3064                 min_bytes = block_group->sectorsize;
3065         }
3066
3067         spin_lock(&ctl->tree_lock);
3068
3069         /*
3070          * If we know we don't have enough space to make a cluster don't even
3071          * bother doing all the work to try and find one.
3072          */
3073         if (ctl->free_space < bytes) {
3074                 spin_unlock(&ctl->tree_lock);
3075                 return -ENOSPC;
3076         }
3077
3078         spin_lock(&cluster->lock);
3079
3080         /* someone already found a cluster, hooray */
3081         if (cluster->block_group) {
3082                 ret = 0;
3083                 goto out;
3084         }
3085
3086         trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3087                                  min_bytes);
3088
3089         ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3090                                       bytes + empty_size,
3091                                       cont1_bytes, min_bytes);
3092         if (ret)
3093                 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3094                                            offset, bytes + empty_size,
3095                                            cont1_bytes, min_bytes);
3096
3097         /* Clear our temporary list */
3098         list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3099                 list_del_init(&entry->list);
3100
3101         if (!ret) {
3102                 atomic_inc(&block_group->count);
3103                 list_add_tail(&cluster->block_group_list,
3104                               &block_group->cluster_list);
3105                 cluster->block_group = block_group;
3106         } else {
3107                 trace_btrfs_failed_cluster_setup(block_group);
3108         }
3109 out:
3110         spin_unlock(&cluster->lock);
3111         spin_unlock(&ctl->tree_lock);
3112
3113         return ret;
3114 }
3115
3116 /*
3117  * simple code to zero out a cluster
3118  */
3119 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3120 {
3121         spin_lock_init(&cluster->lock);
3122         spin_lock_init(&cluster->refill_lock);
3123         cluster->root = RB_ROOT;
3124         cluster->max_size = 0;
3125         cluster->fragmented = false;
3126         INIT_LIST_HEAD(&cluster->block_group_list);
3127         cluster->block_group = NULL;
3128 }
3129
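     /*
      * Discard a range that has already been removed from the free space
      * cache.  The range is accounted as reserved while the discard is in
      * flight (unless the block group is read-only), then re-added to the
      * free space cache, dropped from the trimming_ranges list, and the
      * temporary reservation is released again.
      */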
3130 static int do_trimming(struct btrfs_block_group_cache *block_group,
3131                        u64 *total_trimmed, u64 start, u64 bytes,
3132                        u64 reserved_start, u64 reserved_bytes,
3133                        struct btrfs_trim_range *trim_entry)
3134 {
3135         struct btrfs_space_info *space_info = block_group->space_info;
3136         struct btrfs_fs_info *fs_info = block_group->fs_info;
3137         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3138         int ret;
3139         int update = 0;
3140         u64 trimmed = 0;
3141
3142         spin_lock(&space_info->lock);
3143         spin_lock(&block_group->lock);
3144         if (!block_group->ro) {
3145                 block_group->reserved += reserved_bytes;
3146                 space_info->bytes_reserved += reserved_bytes;
3147                 update = 1;
3148         }
3149         spin_unlock(&block_group->lock);
3150         spin_unlock(&space_info->lock);
3151
3152         ret = btrfs_discard_extent(fs_info->extent_root,
3153                                    start, bytes, &trimmed);
3154         if (!ret)
3155                 *total_trimmed += trimmed;
3156
3157         mutex_lock(&ctl->cache_writeout_mutex);
3158         btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
3159         list_del(&trim_entry->list);
3160         mutex_unlock(&ctl->cache_writeout_mutex);
3161
3162         if (update) {
3163                 spin_lock(&space_info->lock);
3164                 spin_lock(&block_group->lock);
3165                 if (block_group->ro)
3166                         space_info->bytes_readonly += reserved_bytes;
3167                 block_group->reserved -= reserved_bytes;
3168                 space_info->bytes_reserved -= reserved_bytes;
3169                 spin_unlock(&space_info->lock);
3170                 spin_unlock(&block_group->lock);
3171         }
3172
3173         return ret;
3174 }
3175
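     /*
      * Trim the extent (non-bitmap) entries overlapping [start, end).  Each
      * entry whose overlap with the range is at least minlen bytes is
      * unlinked from the free space cache, queued on trimming_ranges and its
      * overlapping part discarded via do_trimming(), which re-adds the
      * entry's space afterwards.  Stops early on a fatal signal.
      */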
3176 static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
3177                           u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3178 {
3179         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3180         struct btrfs_free_space *entry;
3181         struct rb_node *node;
3182         int ret = 0;
3183         u64 extent_start;
3184         u64 extent_bytes;
3185         u64 bytes;
3186
3187         while (start < end) {
3188                 struct btrfs_trim_range trim_entry;
3189
3190                 mutex_lock(&ctl->cache_writeout_mutex);
3191                 spin_lock(&ctl->tree_lock);
3192
3193                 if (ctl->free_space < minlen) {
3194                         spin_unlock(&ctl->tree_lock);
3195                         mutex_unlock(&ctl->cache_writeout_mutex);
3196                         break;
3197                 }
3198
3199                 entry = tree_search_offset(ctl, start, 0, 1);
3200                 if (!entry) {
3201                         spin_unlock(&ctl->tree_lock);
3202                         mutex_unlock(&ctl->cache_writeout_mutex);
3203                         break;
3204                 }
3205
3206                 /* skip bitmaps */
3207                 while (entry->bitmap) {
3208                         node = rb_next(&entry->offset_index);
3209                         if (!node) {
3210                                 spin_unlock(&ctl->tree_lock);
3211                                 mutex_unlock(&ctl->cache_writeout_mutex);
3212                                 goto out;
3213                         }
3214                         entry = rb_entry(node, struct btrfs_free_space,
3215                                          offset_index);
3216                 }
3217
3218                 if (entry->offset >= end) {
3219                         spin_unlock(&ctl->tree_lock);
3220                         mutex_unlock(&ctl->cache_writeout_mutex);
3221                         break;
3222                 }
3223
3224                 extent_start = entry->offset;
3225                 extent_bytes = entry->bytes;
3226                 start = max(start, extent_start);
3227                 bytes = min(extent_start + extent_bytes, end) - start;
3228                 if (bytes < minlen) {
3229                         spin_unlock(&ctl->tree_lock);
3230                         mutex_unlock(&ctl->cache_writeout_mutex);
3231                         goto next;
3232                 }
3233
3234                 unlink_free_space(ctl, entry);
3235                 kmem_cache_free(btrfs_free_space_cachep, entry);
3236
3237                 spin_unlock(&ctl->tree_lock);
3238                 trim_entry.start = extent_start;
3239                 trim_entry.bytes = extent_bytes;
3240                 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3241                 mutex_unlock(&ctl->cache_writeout_mutex);
3242
3243                 ret = do_trimming(block_group, total_trimmed, start, bytes,
3244                                   extent_start, extent_bytes, &trim_entry);
3245                 if (ret)
3246                         break;
3247 next:
3248                 start += bytes;
3249
3250                 if (fatal_signal_pending(current)) {
3251                         ret = -ERESTARTSYS;
3252                         break;
3253                 }
3254
3255                 cond_resched();
3256         }
3257 out:
3258         return ret;
3259 }
3260
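     /*
      * Trim free space recorded in bitmap entries within [start, end),
      * walking one bitmap window at a time.  Free runs of at least minlen
      * bytes are cleared from the bitmap, queued on trimming_ranges and
      * discarded via do_trimming(), which re-adds them afterwards.  Stops
      * early on a fatal signal.
      */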
3261 static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3262                         u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3263 {
3264         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3265         struct btrfs_free_space *entry;
3266         int ret = 0;
3267         int ret2;
3268         u64 bytes;
3269         u64 offset = offset_to_bitmap(ctl, start);
3270
3271         while (offset < end) {
3272                 bool next_bitmap = false;
3273                 struct btrfs_trim_range trim_entry;
3274
3275                 mutex_lock(&ctl->cache_writeout_mutex);
3276                 spin_lock(&ctl->tree_lock);
3277
3278                 if (ctl->free_space < minlen) {
3279                         spin_unlock(&ctl->tree_lock);
3280                         mutex_unlock(&ctl->cache_writeout_mutex);
3281                         break;
3282                 }
3283
3284                 entry = tree_search_offset(ctl, offset, 1, 0);
3285                 if (!entry) {
3286                         spin_unlock(&ctl->tree_lock);
3287                         mutex_unlock(&ctl->cache_writeout_mutex);
3288                         next_bitmap = true;
3289                         goto next;
3290                 }
3291
3292                 bytes = minlen;
3293                 ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3294                 if (ret2 || start >= end) {
3295                         spin_unlock(&ctl->tree_lock);
3296                         mutex_unlock(&ctl->cache_writeout_mutex);
3297                         next_bitmap = true;
3298                         goto next;
3299                 }
3300
3301                 bytes = min(bytes, end - start);
3302                 if (bytes < minlen) {
3303                         spin_unlock(&ctl->tree_lock);
3304                         mutex_unlock(&ctl->cache_writeout_mutex);
3305                         goto next;
3306                 }
3307
3308                 bitmap_clear_bits(ctl, entry, start, bytes);
3309                 if (entry->bytes == 0)
3310                         free_bitmap(ctl, entry);
3311
3312                 spin_unlock(&ctl->tree_lock);
3313                 trim_entry.start = start;
3314                 trim_entry.bytes = bytes;
3315                 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3316                 mutex_unlock(&ctl->cache_writeout_mutex);
3317
3318                 ret = do_trimming(block_group, total_trimmed, start, bytes,
3319                                   start, bytes, &trim_entry);
3320                 if (ret)
3321                         break;
3322 next:
3323                 if (next_bitmap) {
3324                         offset += BITS_PER_BITMAP * ctl->unit;
3325                 } else {
3326                         start += bytes;
3327                         if (start >= offset + BITS_PER_BITMAP * ctl->unit)
3328                                 offset += BITS_PER_BITMAP * ctl->unit;
3329                 }
3330
3331                 if (fatal_signal_pending(current)) {
3332                         ret = -ERESTARTSYS;
3333                         break;
3334                 }
3335
3336                 cond_resched();
3337         }
3338
3339         return ret;
3340 }
3341
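     /*
      * Trimming reference on a block group.  While it is held, cleanup of a
      * removed block group's chunk extent map and leftover free space entries
      * is deferred; the final btrfs_put_block_group_trimming() on a removed
      * group performs that cleanup.
      */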
3342 void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache)
3343 {
3344         atomic_inc(&cache->trimming);
3345 }
3346
3347 void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
3348 {
3349         struct extent_map_tree *em_tree;
3350         struct extent_map *em;
3351         bool cleanup;
3352
3353         spin_lock(&block_group->lock);
3354         cleanup = (atomic_dec_and_test(&block_group->trimming) &&
3355                    block_group->removed);
3356         spin_unlock(&block_group->lock);
3357
3358         if (cleanup) {
3359                 lock_chunks(block_group->fs_info->chunk_root);
3360                 em_tree = &block_group->fs_info->mapping_tree.map_tree;
3361                 write_lock(&em_tree->lock);
3362                 em = lookup_extent_mapping(em_tree, block_group->key.objectid,
3363                                            1);
3364                 BUG_ON(!em); /* logic error, can't happen */
3365                 /*
3366                  * remove_extent_mapping() will delete us from the pinned_chunks
3367                  * list, which is protected by the chunk mutex.
3368                  */
3369                 remove_extent_mapping(em_tree, em);
3370                 write_unlock(&em_tree->lock);
3371                 unlock_chunks(block_group->fs_info->chunk_root);
3372
3373                 /* once for us and once for the tree */
3374                 free_extent_map(em);
3375                 free_extent_map(em);
3376
3377                 /*
3378                  * We've left one free space entry and other tasks trimming
3379                  * this block group have each left one entry. Free them.
3380                  */
3381                 __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3382         }
3383 }
3384
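     /*
      * Discard the free space in [start, end) of this block group, first the
      * extent entries and then the bitmaps, skipping pieces smaller than
      * minlen.  *trimmed returns the number of bytes actually discarded.
      */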
3385 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
3386                            u64 *trimmed, u64 start, u64 end, u64 minlen)
3387 {
3388         int ret;
3389
3390         *trimmed = 0;
3391
3392         spin_lock(&block_group->lock);
3393         if (block_group->removed) {
3394                 spin_unlock(&block_group->lock);
3395                 return 0;
3396         }
3397         btrfs_get_block_group_trimming(block_group);
3398         spin_unlock(&block_group->lock);
3399
3400         ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
3401         if (ret)
3402                 goto out;
3403
3404         ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
3405 out:
3406         btrfs_put_block_group_trimming(block_group);
3407         return ret;
3408 }
3409
3410 /*
3411  * Find the left-most item in the cache tree, and then return the
3412  * smallest inode number in the item.
3413  *
3414  * Note: the returned inode number may not be the smallest one in
3415  * the tree, if the left-most item is a bitmap.
3416  */
3417 u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
3418 {
3419         struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
3420         struct btrfs_free_space *entry = NULL;
3421         u64 ino = 0;
3422
3423         spin_lock(&ctl->tree_lock);
3424
3425         if (RB_EMPTY_ROOT(&ctl->free_space_offset))
3426                 goto out;
3427
3428         entry = rb_entry(rb_first(&ctl->free_space_offset),
3429                          struct btrfs_free_space, offset_index);
3430
3431         if (!entry->bitmap) {
3432                 ino = entry->offset;
3433
3434                 unlink_free_space(ctl, entry);
3435                 entry->offset++;
3436                 entry->bytes--;
3437                 if (!entry->bytes)
3438                         kmem_cache_free(btrfs_free_space_cachep, entry);
3439                 else
3440                         link_free_space(ctl, entry);
3441         } else {
3442                 u64 offset = 0;
3443                 u64 count = 1;
3444                 int ret;
3445
3446                 ret = search_bitmap(ctl, entry, &offset, &count, true);
3447                 /* Logic error; Should be empty if it can't find anything */
3448                 ASSERT(!ret);
3449
3450                 ino = offset;
3451                 bitmap_clear_bits(ctl, entry, offset, 1);
3452                 if (entry->bytes == 0)
3453                         free_bitmap(ctl, entry);
3454         }
3455 out:
3456         spin_unlock(&ctl->tree_lock);
3457
3458         return ino;
3459 }
3460
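     /*
      * Return the inode that backs this root's free inode number cache,
      * using the cached ino_cache_inode when possible and looking it up on
      * disk otherwise (the result is cached unless the filesystem is
      * closing).
      */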
3461 struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3462                                     struct btrfs_path *path)
3463 {
3464         struct inode *inode = NULL;
3465
3466         spin_lock(&root->ino_cache_lock);
3467         if (root->ino_cache_inode)
3468                 inode = igrab(root->ino_cache_inode);
3469         spin_unlock(&root->ino_cache_lock);
3470         if (inode)
3471                 return inode;
3472
3473         inode = __lookup_free_space_inode(root, path, 0);
3474         if (IS_ERR(inode))
3475                 return inode;
3476
3477         spin_lock(&root->ino_cache_lock);
3478         if (!btrfs_fs_closing(root->fs_info))
3479                 root->ino_cache_inode = igrab(inode);
3480         spin_unlock(&root->ino_cache_lock);
3481
3482         return inode;
3483 }
3484
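     /*
      * Create the inode (BTRFS_FREE_INO_OBJECTID) that backs the free inode
      * number cache for this root.
      */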
3485 int create_free_ino_inode(struct btrfs_root *root,
3486                           struct btrfs_trans_handle *trans,
3487                           struct btrfs_path *path)
3488 {
3489         return __create_free_space_inode(root, trans, path,
3490                                          BTRFS_FREE_INO_OBJECTID, 0);
3491 }
3492
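     /*
      * Load the free inode number cache from disk, but only if the
      * inode_cache option is enabled, the filesystem isn't being unmounted,
      * and the cache inode's generation still matches the root's generation.
      */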
3493 int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3494 {
3495         struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3496         struct btrfs_path *path;
3497         struct inode *inode;
3498         int ret = 0;
3499         u64 root_gen = btrfs_root_generation(&root->root_item);
3500
3501         if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
3502                 return 0;
3503
3504         /*
3505          * If we're unmounting then just return, since this does a search on the
3506          * normal root and not the commit root and we could deadlock.
3507          */
3508         if (btrfs_fs_closing(fs_info))
3509                 return 0;
3510
3511         path = btrfs_alloc_path();
3512         if (!path)
3513                 return 0;
3514
3515         inode = lookup_free_ino_inode(root, path);
3516         if (IS_ERR(inode))
3517                 goto out;
3518
3519         if (root_gen != BTRFS_I(inode)->generation)
3520                 goto out_put;
3521
3522         ret = __load_free_space_cache(root, inode, ctl, path, 0);
3523
3524         if (ret < 0)
3525                 btrfs_err(fs_info,
3526                         "failed to load free ino cache for root %llu",
3527                         root->root_key.objectid);
3528 out_put:
3529         iput(inode);
3530 out:
3531         btrfs_free_path(path);
3532         return ret;
3533 }
3534
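     /*
      * Write this root's in-memory free inode number cache back to its cache
      * inode.  Once writeback has been started the metadata reservation is
      * released by ordered io completion, so it is only released here if
      * writing out the cache failed before that point.
      */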
3535 int btrfs_write_out_ino_cache(struct btrfs_root *root,
3536                               struct btrfs_trans_handle *trans,
3537                               struct btrfs_path *path,
3538                               struct inode *inode)
3539 {
3540         struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3541         int ret;
3542         struct btrfs_io_ctl io_ctl;
3543         bool release_metadata = true;
3544
3545         if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
3546                 return 0;
3547
3548         memset(&io_ctl, 0, sizeof(io_ctl));
3549         ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
3550                                       trans, path, 0);
3551         if (!ret) {
3552                 /*
3553                  * At this point writepages() didn't error out, so our metadata
3554                  * reservation is released when the writeback finishes, at
3555                  * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
3556                  * with or without an error.
3557                  */
3558                 release_metadata = false;
3559                 ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
3560         }
3561
3562         if (ret) {
3563                 if (release_metadata)
3564                         btrfs_delalloc_release_metadata(inode, inode->i_size);
3565 #ifdef DEBUG
3566                 btrfs_err(root->fs_info,
3567                         "failed to write free ino cache for root %llu",
3568                         root->root_key.objectid);
3569 #endif
3570         }
3571
3572         return ret;
3573 }
3574
3575 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3576 /*
3577  * Use this if you need to make a bitmap or extent entry specifically.  It
3578  * doesn't do any of the merging that add_free_space does; this acts a lot like
3579  * how the free space cache loading works, so you can get really weird
3580  * configurations.
3581  */
3582 int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
3583                               u64 offset, u64 bytes, bool bitmap)
3584 {
3585         struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3586         struct btrfs_free_space *info = NULL, *bitmap_info;
3587         void *map = NULL;
3588         u64 bytes_added;
3589         int ret;
3590
3591 again:
3592         if (!info) {
3593                 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
3594                 if (!info)
3595                         return -ENOMEM;
3596         }
3597
3598         if (!bitmap) {
3599                 spin_lock(&ctl->tree_lock);
3600                 info->offset = offset;
3601                 info->bytes = bytes;
3602                 info->max_extent_size = 0;
3603                 ret = link_free_space(ctl, info);
3604                 spin_unlock(&ctl->tree_lock);
3605                 if (ret)
3606                         kmem_cache_free(btrfs_free_space_cachep, info);
3607                 return ret;
3608         }
3609
3610         if (!map) {
3611                 map = kzalloc(PAGE_SIZE, GFP_NOFS);
3612                 if (!map) {
3613                         kmem_cache_free(btrfs_free_space_cachep, info);
3614                         return -ENOMEM;
3615                 }
3616         }
3617
3618         spin_lock(&ctl->tree_lock);
3619         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3620                                          1, 0);
3621         if (!bitmap_info) {
3622                 info->bitmap = map;
3623                 map = NULL;
3624                 add_new_bitmap(ctl, info, offset);
3625                 bitmap_info = info;
3626                 info = NULL;
3627         }
3628
3629         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
3630
3631         bytes -= bytes_added;
3632         offset += bytes_added;
3633         spin_unlock(&ctl->tree_lock);
3634
3635         if (bytes)
3636                 goto again;
3637
3638         if (info)
3639                 kmem_cache_free(btrfs_free_space_cachep, info);
3640         if (map)
3641                 kfree(map);
3642         return 0;
3643 }
3644
3645 /*
3646  * Checks to see if the given range is in the free space cache.  This is really
3647  * just used to check the absence of space, so if there is free space in the
3648  * range at all we will return 1.
3649  */
3650 int test_check_exists(struct btrfs_block_group_cache *cache,
3651                       u64 offset, u64 bytes)
3652 {
3653         struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3654         struct btrfs_free_space *info;
3655         int ret = 0;
3656
3657         spin_lock(&ctl->tree_lock);
3658         info = tree_search_offset(ctl, offset, 0, 0);
3659         if (!info) {
3660                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3661                                           1, 0);
3662                 if (!info)
3663                         goto out;
3664         }
3665
3666 have_info:
3667         if (info->bitmap) {
3668                 u64 bit_off, bit_bytes;
3669                 struct rb_node *n;
3670                 struct btrfs_free_space *tmp;
3671
3672                 bit_off = offset;
3673                 bit_bytes = ctl->unit;
3674                 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
3675                 if (!ret) {
3676                         if (bit_off == offset) {
3677                                 ret = 1;
3678                                 goto out;
3679                         } else if (bit_off > offset &&
3680                                    offset + bytes > bit_off) {
3681                                 ret = 1;
3682                                 goto out;
3683                         }
3684                 }
3685
3686                 n = rb_prev(&info->offset_index);
3687                 while (n) {
3688                         tmp = rb_entry(n, struct btrfs_free_space,
3689                                        offset_index);
3690                         if (tmp->offset + tmp->bytes < offset)
3691                                 break;
3692                         if (offset + bytes < tmp->offset) {
3693                                 n = rb_prev(&tmp->offset_index);
3694                                 continue;
3695                         }
3696                         info = tmp;
3697                         goto have_info;
3698                 }
3699
3700                 n = rb_next(&info->offset_index);
3701                 while (n) {
3702                         tmp = rb_entry(n, struct btrfs_free_space,
3703                                        offset_index);
3704                         if (offset + bytes < tmp->offset)
3705                                 break;
3706                         if (tmp->offset + tmp->bytes < offset) {
3707                                 n = rb_next(&tmp->offset_index);
3708                                 continue;
3709                         }
3710                         info = tmp;
3711                         goto have_info;
3712                 }
3713
3714                 ret = 0;
3715                 goto out;
3716         }
3717
3718         if (info->offset == offset) {
3719                 ret = 1;
3720                 goto out;
3721         }
3722
3723         if (offset > info->offset && offset < info->offset + info->bytes)
3724                 ret = 1;
3725 out:
3726         spin_unlock(&ctl->tree_lock);
3727         return ret;
3728 }
3729 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */