GNU Linux-libre 4.9.292-gnu1
fs/btrfs/inode.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
23 #include <linux/fs.h>
24 #include <linux/pagemap.h>
25 #include <linux/highmem.h>
26 #include <linux/time.h>
27 #include <linux/init.h>
28 #include <linux/string.h>
29 #include <linux/backing-dev.h>
30 #include <linux/mpage.h>
31 #include <linux/swap.h>
32 #include <linux/writeback.h>
33 #include <linux/statfs.h>
34 #include <linux/compat.h>
35 #include <linux/bit_spinlock.h>
36 #include <linux/xattr.h>
37 #include <linux/posix_acl.h>
38 #include <linux/falloc.h>
39 #include <linux/slab.h>
40 #include <linux/ratelimit.h>
41 #include <linux/mount.h>
42 #include <linux/btrfs.h>
43 #include <linux/blkdev.h>
44 #include <linux/posix_acl_xattr.h>
45 #include <linux/uio.h>
46 #include "ctree.h"
47 #include "disk-io.h"
48 #include "transaction.h"
49 #include "btrfs_inode.h"
50 #include "print-tree.h"
51 #include "ordered-data.h"
52 #include "xattr.h"
53 #include "tree-log.h"
54 #include "volumes.h"
55 #include "compression.h"
56 #include "locking.h"
57 #include "free-space-cache.h"
58 #include "inode-map.h"
59 #include "backref.h"
60 #include "hash.h"
61 #include "props.h"
62 #include "qgroup.h"
63 #include "dedupe.h"
64
65 struct btrfs_iget_args {
66         struct btrfs_key *location;
67         struct btrfs_root *root;
68 };
69
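/*
 * Per-call state for the O_DIRECT path (a hedged summary; the direct IO
 * code that fills these in appears later in this file): 'outstanding_extents'
 * and 'reserve' track the space reservation taken for the DIO range, while
 * 'unsubmitted_oe_range_start'/'_end' mark the sub-range whose ordered
 * extents were created but whose bios were never submitted, so the error
 * paths can clean them up.
 */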
70 struct btrfs_dio_data {
71         u64 outstanding_extents;
72         u64 reserve;
73         u64 unsubmitted_oe_range_start;
74         u64 unsubmitted_oe_range_end;
75 };
76
77 static const struct inode_operations btrfs_dir_inode_operations;
78 static const struct inode_operations btrfs_symlink_inode_operations;
79 static const struct inode_operations btrfs_dir_ro_inode_operations;
80 static const struct inode_operations btrfs_special_inode_operations;
81 static const struct inode_operations btrfs_file_inode_operations;
82 static const struct address_space_operations btrfs_aops;
83 static const struct address_space_operations btrfs_symlink_aops;
84 static const struct file_operations btrfs_dir_file_operations;
85 static const struct extent_io_ops btrfs_extent_io_ops;
86
87 static struct kmem_cache *btrfs_inode_cachep;
88 struct kmem_cache *btrfs_trans_handle_cachep;
89 struct kmem_cache *btrfs_transaction_cachep;
90 struct kmem_cache *btrfs_path_cachep;
91 struct kmem_cache *btrfs_free_space_cachep;
92
93 #define S_SHIFT 12
94 static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
95         [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
96         [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
97         [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
98         [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
99         [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
100         [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
101         [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
102 };
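
/*
 * A worked example of the table above: for a regular file, mode & S_IFMT
 * is S_IFREG (0100000 octal, i.e. 0x8000).  Shifting right by S_SHIFT (12)
 * yields index 8, whose entry is BTRFS_FT_REG_FILE, the directory item
 * type btrfs stores on disk for that inode.
 */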
103
104 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
105 static int btrfs_truncate(struct inode *inode);
106 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
107 static noinline int cow_file_range(struct inode *inode,
108                                    struct page *locked_page,
109                                    u64 start, u64 end, u64 delalloc_end,
110                                    int *page_started, unsigned long *nr_written,
111                                    int unlock, struct btrfs_dedupe_hash *hash);
112 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
113                                            u64 len, u64 orig_start,
114                                            u64 block_start, u64 block_len,
115                                            u64 orig_block_len, u64 ram_bytes,
116                                            int type);
117
118 static int btrfs_dirty_inode(struct inode *inode);
119
120 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
121 void btrfs_test_inode_set_ops(struct inode *inode)
122 {
123         BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
124 }
125 #endif
126
127 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
128                                      struct inode *inode,  struct inode *dir,
129                                      const struct qstr *qstr)
130 {
131         int err;
132
133         err = btrfs_init_acl(trans, inode, dir);
134         if (!err)
135                 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
136         return err;
137 }
138
139 /*
140  * This does all the hard work of inserting an inline extent into
141  * the btree.  The caller should have called btrfs_drop_extents() so
142  * that no overlapping inline items exist in the btree.
143  */
144 static int insert_inline_extent(struct btrfs_trans_handle *trans,
145                                 struct btrfs_path *path, int extent_inserted,
146                                 struct btrfs_root *root, struct inode *inode,
147                                 u64 start, size_t size, size_t compressed_size,
148                                 int compress_type,
149                                 struct page **compressed_pages)
150 {
151         struct extent_buffer *leaf;
152         struct page *page = NULL;
153         char *kaddr;
154         unsigned long ptr;
155         struct btrfs_file_extent_item *ei;
156         int err = 0;
157         int ret;
158         size_t cur_size = size;
159         unsigned long offset;
160
161         if (compressed_size && compressed_pages)
162                 cur_size = compressed_size;
163
164         inode_add_bytes(inode, size);
165
166         if (!extent_inserted) {
167                 struct btrfs_key key;
168                 size_t datasize;
169
170                 key.objectid = btrfs_ino(inode);
171                 key.offset = start;
172                 key.type = BTRFS_EXTENT_DATA_KEY;
173
174                 datasize = btrfs_file_extent_calc_inline_size(cur_size);
175                 path->leave_spinning = 1;
176                 ret = btrfs_insert_empty_item(trans, root, path, &key,
177                                               datasize);
178                 if (ret) {
179                         err = ret;
180                         goto fail;
181                 }
182         }
183         leaf = path->nodes[0];
184         ei = btrfs_item_ptr(leaf, path->slots[0],
185                             struct btrfs_file_extent_item);
186         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
187         btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
188         btrfs_set_file_extent_encryption(leaf, ei, 0);
189         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
190         btrfs_set_file_extent_ram_bytes(leaf, ei, size);
191         ptr = btrfs_file_extent_inline_start(ei);
192
193         if (compress_type != BTRFS_COMPRESS_NONE) {
194                 struct page *cpage;
195                 int i = 0;
196                 while (compressed_size > 0) {
197                         cpage = compressed_pages[i];
198                         cur_size = min_t(unsigned long, compressed_size,
199                                        PAGE_SIZE);
200
201                         kaddr = kmap_atomic(cpage);
202                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
203                         kunmap_atomic(kaddr);
204
205                         i++;
206                         ptr += cur_size;
207                         compressed_size -= cur_size;
208                 }
209                 btrfs_set_file_extent_compression(leaf, ei,
210                                                   compress_type);
211         } else {
212                 page = find_get_page(inode->i_mapping,
213                                      start >> PAGE_SHIFT);
214                 btrfs_set_file_extent_compression(leaf, ei, 0);
215                 kaddr = kmap_atomic(page);
216                 offset = start & (PAGE_SIZE - 1);
217                 write_extent_buffer(leaf, kaddr + offset, ptr, size);
218                 kunmap_atomic(kaddr);
219                 put_page(page);
220         }
221         btrfs_mark_buffer_dirty(leaf);
222         btrfs_release_path(path);
223
224         /*
225          * we're an inline extent, so nobody can
226          * extend the file past i_size without locking
227          * a page we already have locked.
228          *
229          * We must do any isize and inode updates
230          * before we unlock the pages.  Otherwise we
231          * could end up racing with unlink.
232          */
233         BTRFS_I(inode)->disk_i_size = inode->i_size;
234         ret = btrfs_update_inode(trans, root, inode);
235
236         return ret;
237 fail:
238         return err;
239 }
240
241
242 /*
243  * conditionally insert an inline extent into the file.  This
244  * does the checks required to make sure the data is small enough
245  * to fit as an inline extent.
246  */
247 static noinline int cow_file_range_inline(struct btrfs_root *root,
248                                           struct inode *inode, u64 start,
249                                           u64 end, size_t compressed_size,
250                                           int compress_type,
251                                           struct page **compressed_pages)
252 {
253         struct btrfs_trans_handle *trans;
254         u64 isize = i_size_read(inode);
255         u64 actual_end = min(end + 1, isize);
256         u64 inline_len = actual_end - start;
257         u64 aligned_end = ALIGN(end, root->sectorsize);
258         u64 data_len = inline_len;
259         int ret;
260         struct btrfs_path *path;
261         int extent_inserted = 0;
262         u32 extent_item_size;
263
264         if (compressed_size)
265                 data_len = compressed_size;
266
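        /*
         * The check below, restated: data is inlined only when it starts
         * at file offset 0, ends within the first sector, reaches the end
         * of the file (end + 1 >= isize), is not an uncompressed range
         * ending exactly on a sector boundary, and fits under both the
         * inline item limit and the max_inline mount option.
         */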
267         if (start > 0 ||
268             actual_end > root->sectorsize ||
269             data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
270             (!compressed_size &&
271             (actual_end & (root->sectorsize - 1)) == 0) ||
272             end + 1 < isize ||
273             data_len > root->fs_info->max_inline) {
274                 return 1;
275         }
276
277         path = btrfs_alloc_path();
278         if (!path)
279                 return -ENOMEM;
280
281         trans = btrfs_join_transaction(root);
282         if (IS_ERR(trans)) {
283                 btrfs_free_path(path);
284                 return PTR_ERR(trans);
285         }
286         trans->block_rsv = &root->fs_info->delalloc_block_rsv;
287
288         if (compressed_size && compressed_pages)
289                 extent_item_size = btrfs_file_extent_calc_inline_size(
290                    compressed_size);
291         else
292                 extent_item_size = btrfs_file_extent_calc_inline_size(
293                     inline_len);
294
295         ret = __btrfs_drop_extents(trans, root, inode, path,
296                                    start, aligned_end, NULL,
297                                    1, 1, extent_item_size, &extent_inserted);
298         if (ret) {
299                 btrfs_abort_transaction(trans, ret);
300                 goto out;
301         }
302
303         if (isize > actual_end)
304                 inline_len = min_t(u64, isize, actual_end);
305         ret = insert_inline_extent(trans, path, extent_inserted,
306                                    root, inode, start,
307                                    inline_len, compressed_size,
308                                    compress_type, compressed_pages);
309         if (ret && ret != -ENOSPC) {
310                 btrfs_abort_transaction(trans, ret);
311                 goto out;
312         } else if (ret == -ENOSPC) {
313                 ret = 1;
314                 goto out;
315         }
316
317         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
318         btrfs_delalloc_release_metadata(inode, end + 1 - start);
319         btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
320 out:
321         /*
322          * Don't forget to free the reserved space: an inlined extent
323          * doesn't count as a data extent, so free the reservation
324          * directly here.  At reserve time the space is always aligned
325          * to the page size, so just free one page here.
326          */
327         btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
328         btrfs_free_path(path);
329         btrfs_end_transaction(trans, root);
330         return ret;
331 }
332
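/*
 * async_extent and async_cow tie the two compression phases together:
 * compress_file_range() (phase one) queues one async_extent per range it
 * handled onto async_cow->extents, and submit_compressed_extents()
 * (phase two) walks that list in queue order and issues the actual IO.
 */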
333 struct async_extent {
334         u64 start;
335         u64 ram_size;
336         u64 compressed_size;
337         struct page **pages;
338         unsigned long nr_pages;
339         int compress_type;
340         struct list_head list;
341 };
342
343 struct async_cow {
344         struct inode *inode;
345         struct btrfs_root *root;
346         struct page *locked_page;
347         u64 start;
348         u64 end;
349         struct list_head extents;
350         struct btrfs_work work;
351 };
352
353 static noinline int add_async_extent(struct async_cow *cow,
354                                      u64 start, u64 ram_size,
355                                      u64 compressed_size,
356                                      struct page **pages,
357                                      unsigned long nr_pages,
358                                      int compress_type)
359 {
360         struct async_extent *async_extent;
361
362         async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
363         BUG_ON(!async_extent); /* -ENOMEM */
364         async_extent->start = start;
365         async_extent->ram_size = ram_size;
366         async_extent->compressed_size = compressed_size;
367         async_extent->pages = pages;
368         async_extent->nr_pages = nr_pages;
369         async_extent->compress_type = compress_type;
370         list_add_tail(&async_extent->list, &cow->extents);
371         return 0;
372 }
373
374 static inline int inode_need_compress(struct inode *inode)
375 {
376         struct btrfs_root *root = BTRFS_I(inode)->root;
377
378         /* force compress */
379         if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
380                 return 1;
381         /* bad compression ratios */
382         if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
383                 return 0;
384         if (btrfs_test_opt(root->fs_info, COMPRESS) ||
385             BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
386             BTRFS_I(inode)->force_compress)
387                 return 1;
388         return 0;
389 }
390
391 /*
392  * we create compressed extents in two phases.  The first
393  * phase compresses a range of pages that have already been
394  * locked (both pages and state bits are locked).
395  *
396  * This is done inside an ordered work queue, and the compression
397  * is spread across many cpus.  The actual IO submission is step
398  * two, and the ordered work queue takes care of making sure that
399  * happens in the same order things were put onto the queue by
400  * writepages and friends.
401  *
402  * If this code finds it can't get good compression, it puts an
403  * entry onto the work queue to write the uncompressed bytes.  This
404  * makes sure that both compressed inodes and uncompressed inodes
405  * are written in the same order that the flusher thread sent them
406  * down.
407  */
408 static noinline void compress_file_range(struct inode *inode,
409                                         struct page *locked_page,
410                                         u64 start, u64 end,
411                                         struct async_cow *async_cow,
412                                         int *num_added)
413 {
414         struct btrfs_root *root = BTRFS_I(inode)->root;
415         u64 num_bytes;
416         u64 blocksize = root->sectorsize;
417         u64 actual_end;
418         u64 isize = i_size_read(inode);
419         int ret = 0;
420         struct page **pages = NULL;
421         unsigned long nr_pages;
422         unsigned long nr_pages_ret = 0;
423         unsigned long total_compressed = 0;
424         unsigned long total_in = 0;
425         unsigned long max_compressed = SZ_128K;
426         unsigned long max_uncompressed = SZ_128K;
427         int i;
428         int will_compress;
429         int compress_type = root->fs_info->compress_type;
430         int redirty = 0;
431
432         /* if this is a small write inside eof, kick off a defrag */
433         if ((end - start + 1) < SZ_16K &&
434             (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
435                 btrfs_add_inode_defrag(NULL, inode);
436
437         actual_end = min_t(u64, isize, end + 1);
438 again:
439         will_compress = 0;
440         nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
441         nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE);
442
443         /*
444          * we don't want to send crud past the end of i_size through
445          * compression; that's just a waste of CPU time.  So, if the
446          * end of the file is before the start of our current
447          * requested range of bytes, we bail out to the uncompressed
448          * cleanup code that can deal with all of this.
449          *
450          * It isn't really the fastest way to fix things, but this is a
451          * very uncommon corner.
452          */
453         if (actual_end <= start)
454                 goto cleanup_and_bail_uncompressed;
455
456         total_compressed = actual_end - start;
457
458         /*
459          * skip compression for a small file range (<= blocksize) that
460          * isn't an inline extent, since it doesn't save disk space at all.
461          */
462         if (total_compressed <= blocksize &&
463            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
464                 goto cleanup_and_bail_uncompressed;
465
466         /* we want to make sure the amount of RAM required to uncompress
467          * an extent is reasonable, so we limit the total size in RAM
468          * of a compressed extent to 128k.  This is a crucial number
469          * because it also controls how easily we can spread reads across
470          * cpus for decompression.
471          *
472          * We also want to make sure the amount of IO required to do
473          * a random read is reasonably small, so we limit the size of
474          * a compressed extent to 128k.
475          */
476         total_compressed = min(total_compressed, max_uncompressed);
477         num_bytes = ALIGN(end - start + 1, blocksize);
478         num_bytes = max(blocksize,  num_bytes);
479         total_in = 0;
480         ret = 0;
481
482         /*
483          * we do compression for mount -o compress and when the
484          * inode has not been flagged as nocompress.  This flag can
485          * change at any time if we discover bad compression ratios.
486          */
487         if (inode_need_compress(inode)) {
488                 WARN_ON(pages);
489                 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
490                 if (!pages) {
491                         /* just bail out to the uncompressed code */
492                         nr_pages = 0;
493                         goto cont;
494                 }
495
496                 if (BTRFS_I(inode)->force_compress)
497                         compress_type = BTRFS_I(inode)->force_compress;
498
499                 /*
500                  * we need to call clear_page_dirty_for_io on each
501                  * page in the range.  Otherwise applications with the file
502                  * mmap'd can wander in and change the page contents while
503                  * we are compressing them.
504                  *
505                  * If the compression fails for any reason, we set the pages
506                  * dirty again later on.
507                  */
508                 extent_range_clear_dirty_for_io(inode, start, end);
509                 redirty = 1;
510                 ret = btrfs_compress_pages(compress_type,
511                                            inode->i_mapping, start,
512                                            total_compressed, pages,
513                                            nr_pages, &nr_pages_ret,
514                                            &total_in,
515                                            &total_compressed,
516                                            max_compressed);
517
518                 if (!ret) {
519                         unsigned long offset = total_compressed &
520                                 (PAGE_SIZE - 1);
521                         struct page *page = pages[nr_pages_ret - 1];
522                         char *kaddr;
523
524                         /* zero the tail end of the last page; we might be
525                          * sending it down to disk
526                          */
527                         if (offset) {
528                                 kaddr = kmap_atomic(page);
529                                 memset(kaddr + offset, 0,
530                                        PAGE_SIZE - offset);
531                                 kunmap_atomic(kaddr);
532                         }
533                         will_compress = 1;
534                 }
535         }
536 cont:
537         if (start == 0) {
538                 /* let's try to make an inline extent */
539                 if (ret || total_in < (actual_end - start)) {
540                         /* we didn't compress the entire range, try
541                          * to make an uncompressed inline extent.
542                          */
543                         ret = cow_file_range_inline(root, inode, start, end,
544                                                     0, 0, NULL);
545                 } else {
546                         /* try making a compressed inline extent */
547                         ret = cow_file_range_inline(root, inode, start, end,
548                                                     total_compressed,
549                                                     compress_type, pages);
550                 }
551                 if (ret <= 0) {
552                         unsigned long clear_flags = EXTENT_DELALLOC |
553                                 EXTENT_DEFRAG;
554                         unsigned long page_error_op;
555
556                         clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
557                         page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
558
559                         /*
560                          * inline extent creation worked or returned an error;
561                          * we don't need to create any more async work items.
562                          * Unlock and free up our temp pages.
563                          */
564                         extent_clear_unlock_delalloc(inode, start, end, end,
565                                                      NULL, clear_flags,
566                                                      PAGE_UNLOCK |
567                                                      PAGE_CLEAR_DIRTY |
568                                                      PAGE_SET_WRITEBACK |
569                                                      page_error_op |
570                                                      PAGE_END_WRITEBACK);
571                         if (ret == 0)
572                                 btrfs_free_reserved_data_space_noquota(inode,
573                                                                start,
574                                                                end - start + 1);
575                         goto free_pages_out;
576                 }
577         }
578
579         if (will_compress) {
580                 /*
581                  * we aren't doing an inline extent, so round the compressed
582                  * size up to a block size boundary so that the allocator
583                  * does sane things
584                  */
585                 total_compressed = ALIGN(total_compressed, blocksize);
586
587                 /*
588                  * one last check to make sure the compression is really a
589                  * win: compare the page count read with the blocks on disk
590                  */
591                 total_in = ALIGN(total_in, PAGE_SIZE);
592                 if (total_compressed >= total_in) {
593                         will_compress = 0;
594                 } else {
595                         num_bytes = total_in;
596                         *num_added += 1;
597
598                         /*
599                          * The async work queues will take care of doing actual
600                          * allocation on disk for these compressed pages, and
601                          * will submit them to the elevator.
602                          */
603                         add_async_extent(async_cow, start, num_bytes,
604                                         total_compressed, pages, nr_pages_ret,
605                                         compress_type);
606
607                         if (start + num_bytes < end) {
608                                 start += num_bytes;
609                                 pages = NULL;
610                                 cond_resched();
611                                 goto again;
612                         }
613                         return;
614                 }
615         }
616         if (pages) {
617                 /*
618                  * the compression code ran but failed to make things smaller,
619                  * so free any pages it allocated and our page pointer array
620                  */
621                 for (i = 0; i < nr_pages_ret; i++) {
622                         WARN_ON(pages[i]->mapping);
623                         put_page(pages[i]);
624                 }
625                 kfree(pages);
626                 pages = NULL;
627                 total_compressed = 0;
628                 nr_pages_ret = 0;
629
630                 /* flag the file so we don't compress in the future */
631                 if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) &&
632                     !(BTRFS_I(inode)->force_compress)) {
633                         BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
634                 }
635         }
636 cleanup_and_bail_uncompressed:
637         /*
638          * No compression, but we still need to write the pages in the file
639          * we've been given so far.  Redirty the locked page if it corresponds
640          * to our extent and set things up for the async work queue to run
641          * cow_file_range to do the normal delalloc dance.
642          */
643         if (page_offset(locked_page) >= start &&
644             page_offset(locked_page) <= end)
645                 __set_page_dirty_nobuffers(locked_page);
646                 /* unlocked later on in the async handlers */
647
648         if (redirty)
649                 extent_range_redirty_for_io(inode, start, end);
650         add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
651                          BTRFS_COMPRESS_NONE);
652         *num_added += 1;
653
654         return;
655
656 free_pages_out:
657         for (i = 0; i < nr_pages_ret; i++) {
658                 WARN_ON(pages[i]->mapping);
659                 put_page(pages[i]);
660         }
661         kfree(pages);
662 }
663
664 static void free_async_extent_pages(struct async_extent *async_extent)
665 {
666         int i;
667
668         if (!async_extent->pages)
669                 return;
670
671         for (i = 0; i < async_extent->nr_pages; i++) {
672                 WARN_ON(async_extent->pages[i]->mapping);
673                 put_page(async_extent->pages[i]);
674         }
675         kfree(async_extent->pages);
676         async_extent->nr_pages = 0;
677         async_extent->pages = NULL;
678 }
679
680 /*
681  * phase two of compressed writeback.  This is the ordered portion
682  * of the code, which only gets called in the order the work was
683  * queued.  We walk all the async extents created by compress_file_range
684  * and send them down to the disk.
685  */
686 static noinline void submit_compressed_extents(struct inode *inode,
687                                               struct async_cow *async_cow)
688 {
689         struct async_extent *async_extent;
690         u64 alloc_hint = 0;
691         struct btrfs_key ins;
692         struct extent_map *em;
693         struct btrfs_root *root = BTRFS_I(inode)->root;
694         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
695         struct extent_io_tree *io_tree;
696         int ret = 0;
697
698 again:
699         while (!list_empty(&async_cow->extents)) {
700                 async_extent = list_entry(async_cow->extents.next,
701                                           struct async_extent, list);
702                 list_del(&async_extent->list);
703
704                 io_tree = &BTRFS_I(inode)->io_tree;
705
706 retry:
707                 /* did the compression code fall back to uncompressed IO? */
708                 if (!async_extent->pages) {
709                         int page_started = 0;
710                         unsigned long nr_written = 0;
711
712                         lock_extent(io_tree, async_extent->start,
713                                          async_extent->start +
714                                          async_extent->ram_size - 1);
715
716                         /* allocate blocks */
717                         ret = cow_file_range(inode, async_cow->locked_page,
718                                              async_extent->start,
719                                              async_extent->start +
720                                              async_extent->ram_size - 1,
721                                              async_extent->start +
722                                              async_extent->ram_size - 1,
723                                              &page_started, &nr_written, 0,
724                                              NULL);
725
726                         /* JDM XXX */
727
728                         /*
729                          * if page_started, cow_file_range inserted an
730                          * inline extent and took care of all the unlocking
731                          * and IO for us.  Otherwise, we need to submit
732                          * all those pages down to the drive.
733                          */
734                         if (!page_started && !ret)
735                                 extent_write_locked_range(io_tree,
736                                                   inode, async_extent->start,
737                                                   async_extent->start +
738                                                   async_extent->ram_size - 1,
739                                                   btrfs_get_extent,
740                                                   WB_SYNC_ALL);
741                         else if (ret)
742                                 unlock_page(async_cow->locked_page);
743                         kfree(async_extent);
744                         cond_resched();
745                         continue;
746                 }
747
748                 lock_extent(io_tree, async_extent->start,
749                             async_extent->start + async_extent->ram_size - 1);
750
751                 ret = btrfs_reserve_extent(root, async_extent->ram_size,
752                                            async_extent->compressed_size,
753                                            async_extent->compressed_size,
754                                            0, alloc_hint, &ins, 1, 1);
755                 if (ret) {
756                         free_async_extent_pages(async_extent);
757
758                         if (ret == -ENOSPC) {
759                                 unlock_extent(io_tree, async_extent->start,
760                                               async_extent->start +
761                                               async_extent->ram_size - 1);
762
763                                 /*
764                                  * we need to redirty the pages if we decide to
765                                  * fall back to uncompressed IO; otherwise we
766                                  * will not submit these pages down to lower
767                                  * layers.
768                                  */
769                                 extent_range_redirty_for_io(inode,
770                                                 async_extent->start,
771                                                 async_extent->start +
772                                                 async_extent->ram_size - 1);
773
774                                 goto retry;
775                         }
776                         goto out_free;
777                 }
778                 /*
779                  * here we're doing allocation and writeback of the
780                  * compressed pages
781                  */
782                 btrfs_drop_extent_cache(inode, async_extent->start,
783                                         async_extent->start +
784                                         async_extent->ram_size - 1, 0);
785
786                 em = alloc_extent_map();
787                 if (!em) {
788                         ret = -ENOMEM;
789                         goto out_free_reserve;
790                 }
791                 em->start = async_extent->start;
792                 em->len = async_extent->ram_size;
793                 em->orig_start = em->start;
794                 em->mod_start = em->start;
795                 em->mod_len = em->len;
796
797                 em->block_start = ins.objectid;
798                 em->block_len = ins.offset;
799                 em->orig_block_len = ins.offset;
800                 em->ram_bytes = async_extent->ram_size;
801                 em->bdev = root->fs_info->fs_devices->latest_bdev;
802                 em->compress_type = async_extent->compress_type;
803                 set_bit(EXTENT_FLAG_PINNED, &em->flags);
804                 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
805                 em->generation = -1;
806
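                /*
                 * Insert the new mapping, retrying on -EEXIST: an
                 * overlapping cached extent blocks the insert, so drop
                 * the stale cached range and try again until it fits.
                 */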
807                 while (1) {
808                         write_lock(&em_tree->lock);
809                         ret = add_extent_mapping(em_tree, em, 1);
810                         write_unlock(&em_tree->lock);
811                         if (ret != -EEXIST) {
812                                 free_extent_map(em);
813                                 break;
814                         }
815                         btrfs_drop_extent_cache(inode, async_extent->start,
816                                                 async_extent->start +
817                                                 async_extent->ram_size - 1, 0);
818                 }
819
820                 if (ret)
821                         goto out_free_reserve;
822
823                 ret = btrfs_add_ordered_extent_compress(inode,
824                                                 async_extent->start,
825                                                 ins.objectid,
826                                                 async_extent->ram_size,
827                                                 ins.offset,
828                                                 BTRFS_ORDERED_COMPRESSED,
829                                                 async_extent->compress_type);
830                 if (ret) {
831                         btrfs_drop_extent_cache(inode, async_extent->start,
832                                                 async_extent->start +
833                                                 async_extent->ram_size - 1, 0);
834                         goto out_free_reserve;
835                 }
836                 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
837
838                 /*
839                  * clear dirty, set writeback and unlock the pages.
840                  */
841                 extent_clear_unlock_delalloc(inode, async_extent->start,
842                                 async_extent->start +
843                                 async_extent->ram_size - 1,
844                                 async_extent->start +
845                                 async_extent->ram_size - 1,
846                                 NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
847                                 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
848                                 PAGE_SET_WRITEBACK);
849                 ret = btrfs_submit_compressed_write(inode,
850                                     async_extent->start,
851                                     async_extent->ram_size,
852                                     ins.objectid,
853                                     ins.offset, async_extent->pages,
854                                     async_extent->nr_pages);
855                 if (ret) {
856                         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
857                         struct page *p = async_extent->pages[0];
858                         const u64 start = async_extent->start;
859                         const u64 end = start + async_extent->ram_size - 1;
860
861                         p->mapping = inode->i_mapping;
862                         tree->ops->writepage_end_io_hook(p, start, end,
863                                                          NULL, 0);
864                         p->mapping = NULL;
865                         extent_clear_unlock_delalloc(inode, start, end, end,
866                                                      NULL, 0,
867                                                      PAGE_END_WRITEBACK |
868                                                      PAGE_SET_ERROR);
869                         free_async_extent_pages(async_extent);
870                 }
871                 alloc_hint = ins.objectid + ins.offset;
872                 kfree(async_extent);
873                 cond_resched();
874         }
875         return;
876 out_free_reserve:
877         btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
878         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
879 out_free:
880         extent_clear_unlock_delalloc(inode, async_extent->start,
881                                      async_extent->start +
882                                      async_extent->ram_size - 1,
883                                      async_extent->start +
884                                      async_extent->ram_size - 1,
885                                      NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
886                                      EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
887                                      PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
888                                      PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
889                                      PAGE_SET_ERROR);
890         free_async_extent_pages(async_extent);
891         kfree(async_extent);
892         goto again;
893 }
894
895 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
896                                       u64 num_bytes)
897 {
898         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
899         struct extent_map *em;
900         u64 alloc_hint = 0;
901
902         read_lock(&em_tree->lock);
903         em = search_extent_mapping(em_tree, start, num_bytes);
904         if (em) {
905                 /*
906                  * if block start isn't an actual block number then find the
907                  * first block in this inode and use that as a hint.  If that
908                  * block is also bogus then just don't worry about it.
909                  */
910                 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
911                         free_extent_map(em);
912                         em = search_extent_mapping(em_tree, 0, 0);
913                         if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
914                                 alloc_hint = em->block_start;
915                         if (em)
916                                 free_extent_map(em);
917                 } else {
918                         alloc_hint = em->block_start;
919                         free_extent_map(em);
920                 }
921         }
922         read_unlock(&em_tree->lock);
923
924         return alloc_hint;
925 }
926
927 /*
928  * when extent_io.c finds a delayed allocation range in the file,
929  * the callbacks end up in this code.  The basic idea is to
930  * allocate extents on disk for the range, and create ordered data structs
931  * in RAM to track those extents.
932  *
933  * locked_page is the page that writepage had locked already.  We use
934  * it to make sure we don't do extra locks or unlocks.
935  *
936  * *page_started is set to one if we unlock locked_page and do everything
937  * required to start IO on it.  It may be clean and already done with
938  * IO when we return.
939  */
940 static noinline int cow_file_range(struct inode *inode,
941                                    struct page *locked_page,
942                                    u64 start, u64 end, u64 delalloc_end,
943                                    int *page_started, unsigned long *nr_written,
944                                    int unlock, struct btrfs_dedupe_hash *hash)
945 {
946         struct btrfs_root *root = BTRFS_I(inode)->root;
947         u64 alloc_hint = 0;
948         u64 num_bytes;
949         unsigned long ram_size;
950         u64 min_alloc_size;
951         u64 cur_alloc_size;
952         u64 blocksize = root->sectorsize;
953         struct btrfs_key ins;
954         struct extent_map *em;
955         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
956         int ret = 0;
957
958         if (btrfs_is_free_space_inode(inode)) {
959                 WARN_ON_ONCE(1);
960                 ret = -EINVAL;
961                 goto out_unlock;
962         }
963
964         num_bytes = ALIGN(end - start + 1, blocksize);
965         num_bytes = max(blocksize,  num_bytes);
966
967         /* if this is a small write inside eof, kick off defrag */
968         if (num_bytes < SZ_64K &&
969             (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
970                 btrfs_add_inode_defrag(NULL, inode);
971
972         if (start == 0) {
973                 /* let's try to make an inline extent */
974                 ret = cow_file_range_inline(root, inode, start, end, 0, 0,
975                                             NULL);
976                 if (ret == 0) {
977                         extent_clear_unlock_delalloc(inode, start, end,
978                                      delalloc_end, NULL,
979                                      EXTENT_LOCKED | EXTENT_DELALLOC |
980                                      EXTENT_DEFRAG, PAGE_UNLOCK |
981                                      PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
982                                      PAGE_END_WRITEBACK);
983                         btrfs_free_reserved_data_space_noquota(inode, start,
984                                                 end - start + 1);
985                         *nr_written = *nr_written +
986                              (end - start + PAGE_SIZE) / PAGE_SIZE;
987                         *page_started = 1;
988                         goto out;
989                 } else if (ret < 0) {
990                         goto out_unlock;
991                 }
992         }
993
994         BUG_ON(num_bytes > btrfs_super_total_bytes(root->fs_info->super_copy));
995
996         alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
997         btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
998
999         /*
1000          * Relocation relies on the relocated extents to have exactly the same
1001          * size as the original extents. Normally writeback for relocation data
1002          * extents follows a NOCOW path because relocation preallocates the
1003          * extents. However, due to an operation such as scrub turning a block
1004          * group to RO mode, it may fallback to COW mode, so we must make sure
1005          * an extent allocated during COW has exactly the requested size and can
1006          * not be split into smaller extents, otherwise relocation breaks and
1007          * fails during the stage where it updates the bytenr of file extent
1008          * items.
1009          */
1010         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1011                 min_alloc_size = num_bytes;
1012         else
1013                 min_alloc_size = root->sectorsize;
1014
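        /*
         * Main allocation loop: carve the delalloc range into one or more
         * on-disk extents.  Each pass reserves an extent, inserts an extent
         * map and an ordered extent for it, then unlocks the pages it
         * finished (keeping locked_page for the caller) and advances start.
         */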
1015         while (num_bytes > 0) {
1016                 unsigned long op;
1017
1018                 cur_alloc_size = num_bytes;
1019                 ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1020                                            min_alloc_size, 0, alloc_hint,
1021                                            &ins, 1, 1);
1022                 if (ret < 0)
1023                         goto out_unlock;
1024
1025                 em = alloc_extent_map();
1026                 if (!em) {
1027                         ret = -ENOMEM;
1028                         goto out_reserve;
1029                 }
1030                 em->start = start;
1031                 em->orig_start = em->start;
1032                 ram_size = ins.offset;
1033                 em->len = ins.offset;
1034                 em->mod_start = em->start;
1035                 em->mod_len = em->len;
1036
1037                 em->block_start = ins.objectid;
1038                 em->block_len = ins.offset;
1039                 em->orig_block_len = ins.offset;
1040                 em->ram_bytes = ram_size;
1041                 em->bdev = root->fs_info->fs_devices->latest_bdev;
1042                 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1043                 em->generation = -1;
1044
1045                 while (1) {
1046                         write_lock(&em_tree->lock);
1047                         ret = add_extent_mapping(em_tree, em, 1);
1048                         write_unlock(&em_tree->lock);
1049                         if (ret != -EEXIST) {
1050                                 free_extent_map(em);
1051                                 break;
1052                         }
1053                         btrfs_drop_extent_cache(inode, start,
1054                                                 start + ram_size - 1, 0);
1055                 }
1056                 if (ret)
1057                         goto out_reserve;
1058
1059                 cur_alloc_size = ins.offset;
1060                 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
1061                                                ram_size, cur_alloc_size, 0);
1062                 if (ret)
1063                         goto out_drop_extent_cache;
1064
1065                 if (root->root_key.objectid ==
1066                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
1067                         ret = btrfs_reloc_clone_csums(inode, start,
1068                                                       cur_alloc_size);
1069                         if (ret)
1070                                 goto out_drop_extent_cache;
1071                 }
1072
1073                 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
1074
1075                 if (num_bytes < cur_alloc_size)
1076                         break;
1077
1078                 /* we're not doing compressed IO, so don't unlock the first
1079                  * page (which the caller expects to stay locked), don't
1080                  * clear any dirty bits and don't set any writeback bits.
1081                  *
1082                  * Do set the Private2 bit so we know this page was properly
1083                  * set up for writepage
1084                  */
1085                 op = unlock ? PAGE_UNLOCK : 0;
1086                 op |= PAGE_SET_PRIVATE2;
1087
1088                 extent_clear_unlock_delalloc(inode, start,
1089                                              start + ram_size - 1,
1090                                              delalloc_end, locked_page,
1091                                              EXTENT_LOCKED | EXTENT_DELALLOC,
1092                                              op);
1093                 if (num_bytes < cur_alloc_size)
1094                         num_bytes = 0;
1095                 else
1096                         num_bytes -= cur_alloc_size;
1097                 alloc_hint = ins.objectid + ins.offset;
1098                 start += cur_alloc_size;
1099         }
1100 out:
1101         return ret;
1102
1103 out_drop_extent_cache:
1104         btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
1105 out_reserve:
1106         btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
1107         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
1108 out_unlock:
1109         extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
1110                                      locked_page,
1111                                      EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
1112                                      EXTENT_DELALLOC | EXTENT_DEFRAG,
1113                                      PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
1114                                      PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
1115         goto out;
1116 }
1117
1118 /*
1119  * work queue callback to start compression on a file and pages
1120  */
1121 static noinline void async_cow_start(struct btrfs_work *work)
1122 {
1123         struct async_cow *async_cow;
1124         int num_added = 0;
1125         async_cow = container_of(work, struct async_cow, work);
1126
1127         compress_file_range(async_cow->inode, async_cow->locked_page,
1128                             async_cow->start, async_cow->end, async_cow,
1129                             &num_added);
1130         if (num_added == 0) {
1131                 btrfs_add_delayed_iput(async_cow->inode);
1132                 async_cow->inode = NULL;
1133         }
1134 }
1135
1136 /*
1137  * work queue callback to submit previously compressed pages
1138  */
1139 static noinline void async_cow_submit(struct btrfs_work *work)
1140 {
1141         struct async_cow *async_cow;
1142         struct btrfs_root *root;
1143         unsigned long nr_pages;
1144
1145         async_cow = container_of(work, struct async_cow, work);
1146
1147         root = async_cow->root;
1148         nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
1149                 PAGE_SHIFT;
1150
1151         /*
1152          * atomic_sub_return implies a barrier for waitqueue_active
1153          */
1154         if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
1155             5 * SZ_1M &&
1156             waitqueue_active(&root->fs_info->async_submit_wait))
1157                 wake_up(&root->fs_info->async_submit_wait);
1158
1159         if (async_cow->inode)
1160                 submit_compressed_extents(async_cow->inode, async_cow);
1161 }
1162
1163 static noinline void async_cow_free(struct btrfs_work *work)
1164 {
1165         struct async_cow *async_cow;
1166         async_cow = container_of(work, struct async_cow, work);
1167         if (async_cow->inode)
1168                 btrfs_add_delayed_iput(async_cow->inode);
1169         kfree(async_cow);
1170 }
1171
1172 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1173                                 u64 start, u64 end, int *page_started,
1174                                 unsigned long *nr_written)
1175 {
1176         struct async_cow *async_cow;
1177         struct btrfs_root *root = BTRFS_I(inode)->root;
1178         unsigned long nr_pages;
1179         u64 cur_end;
1180         int limit = 10 * SZ_1M;
1181
1182         clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1183                          1, 0, NULL, GFP_NOFS);
1184         while (start < end) {
1185                 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1186                 BUG_ON(!async_cow); /* -ENOMEM */
1187                 async_cow->inode = igrab(inode);
1188                 async_cow->root = root;
1189                 async_cow->locked_page = locked_page;
1190                 async_cow->start = start;
1191
1192                 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1193                     !btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
1194                         cur_end = end;
1195                 else
1196                         cur_end = min(end, start + SZ_512K - 1);
1197
1198                 async_cow->end = cur_end;
1199                 INIT_LIST_HEAD(&async_cow->extents);
1200
1201                 btrfs_init_work(&async_cow->work,
1202                                 btrfs_delalloc_helper,
1203                                 async_cow_start, async_cow_submit,
1204                                 async_cow_free);
1205
1206                 nr_pages = (cur_end - start + PAGE_SIZE) >>
1207                         PAGE_SHIFT;
1208                 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1209
1210                 btrfs_queue_work(root->fs_info->delalloc_workers,
1211                                  &async_cow->work);
1212
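                /*
                 * Throttle: if too many async delalloc pages are already
                 * queued, wait for the submit workers to drain below the
                 * limit before queueing more work.
                 */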
1213                 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1214                         wait_event(root->fs_info->async_submit_wait,
1215                            (atomic_read(&root->fs_info->async_delalloc_pages) <
1216                             limit));
1217                 }
1218
1219                 while (atomic_read(&root->fs_info->async_submit_draining) &&
1220                       atomic_read(&root->fs_info->async_delalloc_pages)) {
1221                         wait_event(root->fs_info->async_submit_wait,
1222                           (atomic_read(&root->fs_info->async_delalloc_pages) ==
1223                            0));
1224                 }
1225
1226                 *nr_written += nr_pages;
1227                 start = cur_end + 1;
1228         }
1229         *page_started = 1;
1230         return 0;
1231 }
1232
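/*
 * Returns 1 if any checksums exist for the given byte range, 0 if none
 * do, or a negative errno on lookup failure.  (The nocow path uses this
 * to decide whether an extent can safely be overwritten in place.)
 */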
1233 static noinline int csum_exist_in_range(struct btrfs_root *root,
1234                                         u64 bytenr, u64 num_bytes)
1235 {
1236         int ret;
1237         struct btrfs_ordered_sum *sums;
1238         LIST_HEAD(list);
1239
1240         ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
1241                                        bytenr + num_bytes - 1, &list, 0);
1242         if (ret == 0 && list_empty(&list))
1243                 return 0;
1244
1245         while (!list_empty(&list)) {
1246                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1247                 list_del(&sums->list);
1248                 kfree(sums);
1249         }
1250         if (ret < 0)
1251                 return ret;
1252         return 1;
1253 }
1254
1255 /*
1256  * the nocow writeback callback.  This checks for snapshots or COW copies
1257  * of the extents that exist in the file, and COWs the file as required.
1258  *
1259  * If no cow copies or snapshots exist, we write directly to the existing
1260  * blocks on disk
1261  */
1262 static noinline int run_delalloc_nocow(struct inode *inode,
1263                                        struct page *locked_page,
1264                               u64 start, u64 end, int *page_started, int force,
1265                               unsigned long *nr_written)
1266 {
1267         struct btrfs_root *root = BTRFS_I(inode)->root;
1268         struct btrfs_trans_handle *trans;
1269         struct extent_buffer *leaf;
1270         struct btrfs_path *path;
1271         struct btrfs_file_extent_item *fi;
1272         struct btrfs_key found_key;
1273         u64 cow_start;
1274         u64 cur_offset;
1275         u64 extent_end;
1276         u64 extent_offset;
1277         u64 disk_bytenr;
1278         u64 num_bytes;
1279         u64 disk_num_bytes;
1280         u64 ram_bytes;
1281         int extent_type;
1282         int ret, err;
1283         int type;
1284         int nocow;
1285         int check_prev = 1;
1286         bool nolock;
1287         u64 ino = btrfs_ino(inode);
1288
1289         path = btrfs_alloc_path();
1290         if (!path) {
1291                 extent_clear_unlock_delalloc(inode, start, end, end,
1292                                              locked_page,
1293                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1294                                              EXTENT_DO_ACCOUNTING |
1295                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1296                                              PAGE_CLEAR_DIRTY |
1297                                              PAGE_SET_WRITEBACK |
1298                                              PAGE_END_WRITEBACK);
1299                 return -ENOMEM;
1300         }
1301
1302         nolock = btrfs_is_free_space_inode(inode);
1303
1304         if (nolock)
1305                 trans = btrfs_join_transaction_nolock(root);
1306         else
1307                 trans = btrfs_join_transaction(root);
1308
1309         if (IS_ERR(trans)) {
1310                 extent_clear_unlock_delalloc(inode, start, end, end,
1311                                              locked_page,
1312                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1313                                              EXTENT_DO_ACCOUNTING |
1314                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1315                                              PAGE_CLEAR_DIRTY |
1316                                              PAGE_SET_WRITEBACK |
1317                                              PAGE_END_WRITEBACK);
1318                 btrfs_free_path(path);
1319                 return PTR_ERR(trans);
1320         }
1321
1322         trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1323
1324         cow_start = (u64)-1;
1325         cur_offset = start;
1326         while (1) {
1327                 ret = btrfs_lookup_file_extent(trans, root, path, ino,
1328                                                cur_offset, 0);
1329                 if (ret < 0)
1330                         goto error;
1331                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1332                         leaf = path->nodes[0];
1333                         btrfs_item_key_to_cpu(leaf, &found_key,
1334                                               path->slots[0] - 1);
1335                         if (found_key.objectid == ino &&
1336                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1337                                 path->slots[0]--;
1338                 }
1339                 check_prev = 0;
1340 next_slot:
1341                 leaf = path->nodes[0];
1342                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1343                         ret = btrfs_next_leaf(root, path);
1344                         if (ret < 0) {
1345                                 if (cow_start != (u64)-1)
1346                                         cur_offset = cow_start;
1347                                 goto error;
1348                         }
1349                         if (ret > 0)
1350                                 break;
1351                         leaf = path->nodes[0];
1352                 }
1353
1354                 nocow = 0;
1355                 disk_bytenr = 0;
1356                 num_bytes = 0;
1357                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1358
1359                 if (found_key.objectid > ino)
1360                         break;
1361                 if (WARN_ON_ONCE(found_key.objectid < ino) ||
1362                     found_key.type < BTRFS_EXTENT_DATA_KEY) {
1363                         path->slots[0]++;
1364                         goto next_slot;
1365                 }
1366                 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1367                     found_key.offset > end)
1368                         break;
1369
1370                 if (found_key.offset > cur_offset) {
1371                         extent_end = found_key.offset;
1372                         extent_type = 0;
1373                         goto out_check;
1374                 }
1375
1376                 fi = btrfs_item_ptr(leaf, path->slots[0],
1377                                     struct btrfs_file_extent_item);
1378                 extent_type = btrfs_file_extent_type(leaf, fi);
1379
1380                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1381                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1382                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1383                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1384                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1385                         extent_end = found_key.offset +
1386                                 btrfs_file_extent_num_bytes(leaf, fi);
1387                         disk_num_bytes =
1388                                 btrfs_file_extent_disk_num_bytes(leaf, fi);
1389                         if (extent_end <= start) {
1390                                 path->slots[0]++;
1391                                 goto next_slot;
1392                         }
1393                         if (disk_bytenr == 0)
1394                                 goto out_check;
1395                         if (btrfs_file_extent_compression(leaf, fi) ||
1396                             btrfs_file_extent_encryption(leaf, fi) ||
1397                             btrfs_file_extent_other_encoding(leaf, fi))
1398                                 goto out_check;
1399                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1400                                 goto out_check;
1401                         if (btrfs_extent_readonly(root, disk_bytenr))
1402                                 goto out_check;
1403                         ret = btrfs_cross_ref_exist(trans, root, ino,
1404                                                   found_key.offset -
1405                                                   extent_offset, disk_bytenr);
1406                         if (ret) {
1407                                 /*
1408                                  * ret could be -EIO if the above fails to read
1409                                  * metadata.
1410                                  */
1411                                 if (ret < 0) {
1412                                         if (cow_start != (u64)-1)
1413                                                 cur_offset = cow_start;
1414                                         goto error;
1415                                 }
1416
1417                                 WARN_ON_ONCE(nolock);
1418                                 goto out_check;
1419                         }
1420                         disk_bytenr += extent_offset;
1421                         disk_bytenr += cur_offset - found_key.offset;
1422                         num_bytes = min(end + 1, extent_end) - cur_offset;
1423                         /*
1424                          * If there are pending snapshots for this root,
1425                          * we fall back to the common COW path.
1426                          */
1427                         if (!nolock) {
1428                                 err = btrfs_start_write_no_snapshoting(root);
1429                                 if (!err)
1430                                         goto out_check;
1431                         }
1432                         /*
1433                          * Force COW if a csum exists in the range.
1434                          * This ensures that the csums for a given extent
1435                          * are either all valid or do not exist.
1436                          */
1437                         ret = csum_exist_in_range(root, disk_bytenr, num_bytes);
1438                         if (ret) {
1439                                 /*
1440                                  * ret could be -EIO if the above fails to read
1441                                  * metadata.
1442                                  */
1443                                 if (ret < 0) {
1444                                         if (cow_start != (u64)-1)
1445                                                 cur_offset = cow_start;
1446                                         goto error;
1447                                 }
1448                                 WARN_ON_ONCE(nolock);
1449                                 goto out_check;
1450                         }
1451                         if (!btrfs_inc_nocow_writers(root->fs_info,
1452                                                      disk_bytenr))
1453                                 goto out_check;
1454                         nocow = 1;
1455                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1456                         extent_end = found_key.offset +
1457                                 btrfs_file_extent_inline_len(leaf,
1458                                                      path->slots[0], fi);
1459                         extent_end = ALIGN(extent_end, root->sectorsize);
1460                 } else {
1461                         BUG_ON(1);
1462                 }
1463 out_check:
1464                 if (extent_end <= start) {
1465                         path->slots[0]++;
1466                         if (!nolock && nocow)
1467                                 btrfs_end_write_no_snapshoting(root);
1468                         if (nocow)
1469                                 btrfs_dec_nocow_writers(root->fs_info,
1470                                                         disk_bytenr);
1471                         goto next_slot;
1472                 }
1473                 if (!nocow) {
1474                         if (cow_start == (u64)-1)
1475                                 cow_start = cur_offset;
1476                         cur_offset = extent_end;
1477                         if (cur_offset > end)
1478                                 break;
1479                         path->slots[0]++;
1480                         goto next_slot;
1481                 }
1482
1483                 btrfs_release_path(path);
1484                 if (cow_start != (u64)-1) {
1485                         ret = cow_file_range(inode, locked_page,
1486                                              cow_start, found_key.offset - 1,
1487                                              end, page_started, nr_written, 1,
1488                                              NULL);
1489                         if (ret) {
1490                                 if (!nolock && nocow)
1491                                         btrfs_end_write_no_snapshoting(root);
1492                                 if (nocow)
1493                                         btrfs_dec_nocow_writers(root->fs_info,
1494                                                                 disk_bytenr);
1495                                 goto error;
1496                         }
1497                         cow_start = (u64)-1;
1498                 }
1499
1500                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1501                         struct extent_map *em;
1502                         struct extent_map_tree *em_tree;
1503                         em_tree = &BTRFS_I(inode)->extent_tree;
1504                         em = alloc_extent_map();
1505                         BUG_ON(!em); /* -ENOMEM */
1506                         em->start = cur_offset;
1507                         em->orig_start = found_key.offset - extent_offset;
1508                         em->len = num_bytes;
1509                         em->block_len = num_bytes;
1510                         em->block_start = disk_bytenr;
1511                         em->orig_block_len = disk_num_bytes;
1512                         em->ram_bytes = ram_bytes;
1513                         em->bdev = root->fs_info->fs_devices->latest_bdev;
1514                         em->mod_start = em->start;
1515                         em->mod_len = em->len;
1516                         set_bit(EXTENT_FLAG_PINNED, &em->flags);
1517                         set_bit(EXTENT_FLAG_FILLING, &em->flags);
1518                         em->generation = -1;
1519                         while (1) {
1520                                 write_lock(&em_tree->lock);
1521                                 ret = add_extent_mapping(em_tree, em, 1);
1522                                 write_unlock(&em_tree->lock);
1523                                 if (ret != -EEXIST) {
1524                                         free_extent_map(em);
1525                                         break;
1526                                 }
1527                                 btrfs_drop_extent_cache(inode, em->start,
1528                                                 em->start + em->len - 1, 0);
1529                         }
1530                         type = BTRFS_ORDERED_PREALLOC;
1531                 } else {
1532                         type = BTRFS_ORDERED_NOCOW;
1533                 }
1534
1535                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1536                                                num_bytes, num_bytes, type);
1537                 if (nocow)
1538                         btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
1539                 BUG_ON(ret); /* -ENOMEM */
1540
1541                 if (root->root_key.objectid ==
1542                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
1543                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1544                                                       num_bytes);
1545                         if (ret) {
1546                                 if (!nolock && nocow)
1547                                         btrfs_end_write_no_snapshoting(root);
1548                                 goto error;
1549                         }
1550                 }
1551
1552                 extent_clear_unlock_delalloc(inode, cur_offset,
1553                                              cur_offset + num_bytes - 1, end,
1554                                              locked_page, EXTENT_LOCKED |
1555                                              EXTENT_DELALLOC |
1556                                              EXTENT_CLEAR_DATA_RESV,
1557                                              PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1558
1559                 if (!nolock && nocow)
1560                         btrfs_end_write_no_snapshoting(root);
1561                 cur_offset = extent_end;
1562                 if (cur_offset > end)
1563                         break;
1564         }
1565         btrfs_release_path(path);
1566
1567         if (cur_offset <= end && cow_start == (u64)-1)
1568                 cow_start = cur_offset;
1569
1570         if (cow_start != (u64)-1) {
1571                 cur_offset = end;
1572                 ret = cow_file_range(inode, locked_page, cow_start, end, end,
1573                                      page_started, nr_written, 1, NULL);
1574                 if (ret)
1575                         goto error;
1576         }
1577
1578 error:
1579         err = btrfs_end_transaction(trans, root);
1580         if (!ret)
1581                 ret = err;
1582
1583         if (ret && cur_offset < end)
1584                 extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1585                                              locked_page, EXTENT_LOCKED |
1586                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
1587                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1588                                              PAGE_CLEAR_DIRTY |
1589                                              PAGE_SET_WRITEBACK |
1590                                              PAGE_END_WRITEBACK);
1591         btrfs_free_path(path);
1592         return ret;
1593 }
1594
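/*
 * Decide whether a NOCOW or PREALLOC inode must still take the COW path
 * for this range: returns 1 when the range carries the EXTENT_DEFRAG bit
 * (the file is being defragged), 0 otherwise.
 */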
1595 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1596 {
1597
1598         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1599             !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1600                 return 0;
1601
1602         /*
1603          * @defrag_bytes is a hint value; no spinlock is held here.
1604          * If it is not zero, the file is being defragged.
1605          * Force COW if the given extent needs to be defragged.
1606          */
1607         if (BTRFS_I(inode)->defrag_bytes &&
1608             test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1609                            EXTENT_DEFRAG, 0, NULL))
1610                 return 1;
1611
1612         return 0;
1613 }
1614
1615 /*
1616  * extent_io.c callback to do delayed allocation processing
1617  */
1618 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1619                               u64 start, u64 end, int *page_started,
1620                               unsigned long *nr_written)
1621 {
1622         int ret;
1623         int force_cow = need_force_cow(inode, start, end);
1624
1625         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1626                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1627                                          page_started, 1, nr_written);
1628         } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1629                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1630                                          page_started, 0, nr_written);
1631         } else if (!inode_need_compress(inode)) {
1632                 ret = cow_file_range(inode, locked_page, start, end, end,
1633                                       page_started, nr_written, 1, NULL);
1634         } else {
1635                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1636                         &BTRFS_I(inode)->runtime_flags);
1637                 ret = cow_file_range_async(inode, locked_page, start, end,
1638                                            page_started, nr_written);
1639         }
1640         return ret;
1641 }
1642
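/*
 * extent_io.c split_extent_hook, called when a tracked extent state is
 * split.  If splitting a delalloc range larger than BTRFS_MAX_EXTENT_SIZE
 * increases the number of max-sized chunks needed to cover it, account
 * one more outstanding extent on the inode.
 */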
1643 static void btrfs_split_extent_hook(struct inode *inode,
1644                                     struct extent_state *orig, u64 split)
1645 {
1646         u64 size;
1647
1648         /* not delalloc, ignore it */
1649         if (!(orig->state & EXTENT_DELALLOC))
1650                 return;
1651
1652         size = orig->end - orig->start + 1;
1653         if (size > BTRFS_MAX_EXTENT_SIZE) {
1654                 u64 num_extents;
1655                 u64 new_size;
1656
1657                 /*
1658                  * See the explanation in btrfs_merge_extent_hook, the same
1659                  * applies here, just in reverse.
1660                  */
1661                 new_size = orig->end - split + 1;
1662                 num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1663                                         BTRFS_MAX_EXTENT_SIZE);
1664                 new_size = split - orig->start;
1665                 num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1666                                         BTRFS_MAX_EXTENT_SIZE);
1667                 if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
1668                               BTRFS_MAX_EXTENT_SIZE) >= num_extents)
1669                         return;
1670         }
1671
1672         spin_lock(&BTRFS_I(inode)->lock);
1673         BTRFS_I(inode)->outstanding_extents++;
1674         spin_unlock(&BTRFS_I(inode)->lock);
1675 }
1676
1677 /*
1678  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1679  * extents so we can keep track of new extents that are just merged onto old
1680  * extents, such as when we are doing sequential writes, so we can properly
1681  * account for the metadata space we'll need.
1682  */
1683 static void btrfs_merge_extent_hook(struct inode *inode,
1684                                     struct extent_state *new,
1685                                     struct extent_state *other)
1686 {
1687         u64 new_size, old_size;
1688         u64 num_extents;
1689
1690         /* not delalloc, ignore it */
1691         if (!(other->state & EXTENT_DELALLOC))
1692                 return;
1693
1694         if (new->start > other->start)
1695                 new_size = new->end - other->start + 1;
1696         else
1697                 new_size = other->end - new->start + 1;
1698
1699         /* we're not bigger than the max, unreserve the space and go */
1700         if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1701                 spin_lock(&BTRFS_I(inode)->lock);
1702                 BTRFS_I(inode)->outstanding_extents--;
1703                 spin_unlock(&BTRFS_I(inode)->lock);
1704                 return;
1705         }
1706
1707         /*
1708          * We have to add up either side to figure out how many extents were
1709          * accounted for before we merged into one big extent.  If the number of
1710          * extents we accounted for is <= the amount we need for the new range
1711          * then we can return, otherwise drop.  Think of it like this
1712          *
1713          * [ 4k][MAX_SIZE]
1714          *
1715          * So we've grown the extent by a MAX_SIZE extent, this would mean we
1716          * need 2 outstanding extents, on one side we have 1 and the other side
1717          * we have 1 so they are == and we can return.  But in this case
1718          *
1719          * [MAX_SIZE+4k][MAX_SIZE+4k]
1720          *
1721          * Each range on their own accounts for 2 extents, but merged together
1722          * they are only 3 extents worth of accounting, so we need to drop in
1723          * this case.
1724          */
1725         old_size = other->end - other->start + 1;
1726         num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
1727                                 BTRFS_MAX_EXTENT_SIZE);
1728         old_size = new->end - new->start + 1;
1729         num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
1730                                  BTRFS_MAX_EXTENT_SIZE);
1731
1732         if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1733                       BTRFS_MAX_EXTENT_SIZE) >= num_extents)
1734                 return;
1735
1736         spin_lock(&BTRFS_I(inode)->lock);
1737         BTRFS_I(inode)->outstanding_extents--;
1738         spin_unlock(&BTRFS_I(inode)->lock);
1739 }
1740
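/*
 * Add the inode to the root's list of inodes with pending delalloc, and
 * add the root to the fs-wide list of delalloc roots when its first
 * delalloc inode shows up.
 */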
1741 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1742                                       struct inode *inode)
1743 {
1744         spin_lock(&root->delalloc_lock);
1745         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1746                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1747                               &root->delalloc_inodes);
1748                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1749                         &BTRFS_I(inode)->runtime_flags);
1750                 root->nr_delalloc_inodes++;
1751                 if (root->nr_delalloc_inodes == 1) {
1752                         spin_lock(&root->fs_info->delalloc_root_lock);
1753                         BUG_ON(!list_empty(&root->delalloc_root));
1754                         list_add_tail(&root->delalloc_root,
1755                                       &root->fs_info->delalloc_roots);
1756                         spin_unlock(&root->fs_info->delalloc_root_lock);
1757                 }
1758         }
1759         spin_unlock(&root->delalloc_lock);
1760 }
1761
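/*
 * The inverse of btrfs_add_delalloc_inodes: remove the inode from the
 * root's delalloc list, and remove the root from the fs-wide list of
 * delalloc roots when its last delalloc inode goes away.
 */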
1762 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1763                                      struct inode *inode)
1764 {
1765         spin_lock(&root->delalloc_lock);
1766         if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1767                 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1768                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1769                           &BTRFS_I(inode)->runtime_flags);
1770                 root->nr_delalloc_inodes--;
1771                 if (!root->nr_delalloc_inodes) {
1772                         spin_lock(&root->fs_info->delalloc_root_lock);
1773                         BUG_ON(list_empty(&root->delalloc_root));
1774                         list_del_init(&root->delalloc_root);
1775                         spin_unlock(&root->fs_info->delalloc_root_lock);
1776                 }
1777         }
1778         spin_unlock(&root->delalloc_lock);
1779 }
1780
1781 /*
1782  * extent_io.c set_bit_hook, used to track delayed allocation
1783  * bytes in this file, and to maintain the list of inodes that
1784  * have pending delalloc work to be done.
1785  */
1786 static void btrfs_set_bit_hook(struct inode *inode,
1787                                struct extent_state *state, unsigned *bits)
1788 {
1789
1790         if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1791                 WARN_ON(1);
1792         /*
1793          * set_bit and clear bit hooks normally require _irqsave/restore
1794          * but in this case, we are only testing for the DELALLOC
1795          * bit, which is only set or cleared with irqs on
1796          */
1797         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1798                 struct btrfs_root *root = BTRFS_I(inode)->root;
1799                 u64 len = state->end + 1 - state->start;
1800                 bool do_list = !btrfs_is_free_space_inode(inode);
1801
1802                 if (*bits & EXTENT_FIRST_DELALLOC) {
1803                         *bits &= ~EXTENT_FIRST_DELALLOC;
1804                 } else {
1805                         spin_lock(&BTRFS_I(inode)->lock);
1806                         BTRFS_I(inode)->outstanding_extents++;
1807                         spin_unlock(&BTRFS_I(inode)->lock);
1808                 }
1809
1810                 /* For sanity tests */
1811                 if (btrfs_is_testing(root->fs_info))
1812                         return;
1813
1814                 __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1815                                      root->fs_info->delalloc_batch);
1816                 spin_lock(&BTRFS_I(inode)->lock);
1817                 BTRFS_I(inode)->delalloc_bytes += len;
1818                 if (*bits & EXTENT_DEFRAG)
1819                         BTRFS_I(inode)->defrag_bytes += len;
1820                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1821                                          &BTRFS_I(inode)->runtime_flags))
1822                         btrfs_add_delalloc_inodes(root, inode);
1823                 spin_unlock(&BTRFS_I(inode)->lock);
1824         }
1825 }
1826
1827 /*
1828  * extent_io.c clear_bit_hook, see set_bit_hook for why
1829  */
1830 static void btrfs_clear_bit_hook(struct inode *inode,
1831                                  struct extent_state *state,
1832                                  unsigned *bits)
1833 {
1834         u64 len = state->end + 1 - state->start;
1835         u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
1836                                     BTRFS_MAX_EXTENT_SIZE);
1837
1838         spin_lock(&BTRFS_I(inode)->lock);
1839         if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
1840                 BTRFS_I(inode)->defrag_bytes -= len;
1841         spin_unlock(&BTRFS_I(inode)->lock);
1842
1843         /*
1844          * set_bit and clear bit hooks normally require _irqsave/restore
1845          * but in this case, we are only testing for the DELALLOC
1846          * bit, which is only set or cleared with irqs on
1847          */
1848         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1849                 struct btrfs_root *root = BTRFS_I(inode)->root;
1850                 bool do_list = !btrfs_is_free_space_inode(inode);
1851
1852                 if (*bits & EXTENT_FIRST_DELALLOC) {
1853                         *bits &= ~EXTENT_FIRST_DELALLOC;
1854                 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1855                         spin_lock(&BTRFS_I(inode)->lock);
1856                         BTRFS_I(inode)->outstanding_extents -= num_extents;
1857                         spin_unlock(&BTRFS_I(inode)->lock);
1858                 }
1859
1860                 /*
1861                  * We don't reserve metadata space for space cache inodes so we
1862                  * don't need to call btrfs_delalloc_release_metadata if there is an
1863                  * error.
1864                  */
1865                 if (*bits & EXTENT_DO_ACCOUNTING &&
1866                     root != root->fs_info->tree_root)
1867                         btrfs_delalloc_release_metadata(inode, len);
1868
1869                 /* For sanity tests. */
1870                 if (btrfs_is_testing(root->fs_info))
1871                         return;
1872
1873                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1874                     && do_list && !(state->state & EXTENT_NORESERVE)
1875                     && (*bits & (EXTENT_DO_ACCOUNTING |
1876                     EXTENT_CLEAR_DATA_RESV)))
1877                         btrfs_free_reserved_data_space_noquota(inode,
1878                                         state->start, len);
1879
1880                 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1881                                      root->fs_info->delalloc_batch);
1882                 spin_lock(&BTRFS_I(inode)->lock);
1883                 BTRFS_I(inode)->delalloc_bytes -= len;
1884                 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1885                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1886                              &BTRFS_I(inode)->runtime_flags))
1887                         btrfs_del_delalloc_inode(root, inode);
1888                 spin_unlock(&BTRFS_I(inode)->lock);
1889         }
1890 }
1891
1892 /*
1893  * extent_io.c merge_bio_hook: this must check the chunk tree to make sure
1894  * we don't create bios that span stripes or chunks.
1895  *
1896  * return 1 if the page cannot be merged into the bio
1897  * return 0 if the page can be merged into the bio
1898  * return an error otherwise
1899  */
1900 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1901                          size_t size, struct bio *bio,
1902                          unsigned long bio_flags)
1903 {
1904         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1905         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1906         u64 length = 0;
1907         u64 map_length;
1908         int ret;
1909
1910         if (bio_flags & EXTENT_BIO_COMPRESSED)
1911                 return 0;
1912
1913         length = bio->bi_iter.bi_size;
1914         map_length = length;
1915         ret = btrfs_map_block(root->fs_info, bio_op(bio), logical,
1916                               &map_length, NULL, 0);
1917         if (ret < 0)
1918                 return ret;
1919         if (map_length < length + size)
1920                 return 1;
1921         return 0;
1922 }
1923
1924 /*
1925  * in order to insert checksums into the metadata in large chunks,
1926  * we wait until bio submission time.  All the pages in the bio are
1927  * checksummed and sums are attached onto the ordered extent record.
1928  *
1929  * At IO completion time the csums attached to the ordered extent record
1930  * are inserted into the btree.
1931  */
1932 static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
1933                                     int mirror_num, unsigned long bio_flags,
1934                                     u64 bio_offset)
1935 {
1936         struct btrfs_root *root = BTRFS_I(inode)->root;
1937         int ret = 0;
1938
1939         ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1940         BUG_ON(ret); /* -ENOMEM */
1941         return 0;
1942 }
1943
1944 /*
1945  * The actual bio submission callback for the async checksumming path
1946  * set up in btrfs_submit_bio_hook below.
1947  *
1948  * By the time this runs, the csums have already been calculated and
1949  * attached to the ordered extent record, so all that is left to do is
1950  * map the bio to the underlying device and submit it.
1951  */
1952 static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
1953                           int mirror_num, unsigned long bio_flags,
1954                           u64 bio_offset)
1955 {
1956         struct btrfs_root *root = BTRFS_I(inode)->root;
1957         int ret;
1958
1959         ret = btrfs_map_bio(root, bio, mirror_num, 1);
1960         if (ret) {
1961                 bio->bi_error = ret;
1962                 bio_endio(bio);
1963         }
1964         return ret;
1965 }
1966
1967 /*
1968  * extent_io.c submission hook. This does the right thing for csum calculation
1969  * on write, or reading the csums from the tree before a read
1970  */
1971 static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
1972                           int mirror_num, unsigned long bio_flags,
1973                           u64 bio_offset)
1974 {
1975         struct btrfs_root *root = BTRFS_I(inode)->root;
1976         enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1977         int ret = 0;
1978         int skip_sum;
1979         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1980
1981         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1982
1983         if (btrfs_is_free_space_inode(inode))
1984                 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
1985
1986         if (bio_op(bio) != REQ_OP_WRITE) {
1987                 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1988                 if (ret)
1989                         goto out;
1990
1991                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1992                         ret = btrfs_submit_compressed_read(inode, bio,
1993                                                            mirror_num,
1994                                                            bio_flags);
1995                         goto out;
1996                 } else if (!skip_sum) {
1997                         ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1998                         if (ret)
1999                                 goto out;
2000                 }
2001                 goto mapit;
2002         } else if (async && !skip_sum) {
2003                 /* csum items have already been cloned */
2004                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2005                         goto mapit;
2006                 /* we're doing a write, do the async checksumming */
2007                 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
2008                                    inode, bio, mirror_num,
2009                                    bio_flags, bio_offset,
2010                                    __btrfs_submit_bio_start,
2011                                    __btrfs_submit_bio_done);
2012                 goto out;
2013         } else if (!skip_sum) {
2014                 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
2015                 if (ret)
2016                         goto out;
2017         }
2018
2019 mapit:
2020         ret = btrfs_map_bio(root, bio, mirror_num, 0);
2021
2022 out:
2023         if (ret < 0) {
2024                 bio->bi_error = ret;
2025                 bio_endio(bio);
2026         }
2027         return ret;
2028 }
2029
2030 /*
2031  * given a list of ordered sums, record them in the inode.  This happens
2032  * at IO completion time based on sums calculated at bio submission time.
2033  */
2034 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2035                              struct inode *inode, u64 file_offset,
2036                              struct list_head *list)
2037 {
2038         struct btrfs_ordered_sum *sum;
2039
2040         list_for_each_entry(sum, list, list) {
2041                 trans->adding_csums = 1;
2042                 btrfs_csum_file_blocks(trans,
2043                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
2044                 trans->adding_csums = 0;
2045         }
2046         return 0;
2047 }
2048
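/*
 * Mark the byte range [start, end] in the inode's io_tree as delalloc so
 * that writeback will pick it up.  @end is the last byte of the range, so
 * the WARN_ON fires if it is page aligned.
 */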
2049 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2050                               struct extent_state **cached_state, int dedupe)
2051 {
2052         WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2053         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2054                                    cached_state);
2055 }
2056
2057 /* see btrfs_writepage_start_hook for details on why this is required */
2058 struct btrfs_writepage_fixup {
2059         struct page *page;
2060         struct btrfs_work work;
2061 };
2062
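/*
 * Worker for the writepage fixup path below: relock the page, wait out
 * any ordered extent that still covers it, then reserve space and mark
 * the range delalloc again so a later writepage can handle it properly.
 */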
2063 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2064 {
2065         struct btrfs_writepage_fixup *fixup;
2066         struct btrfs_ordered_extent *ordered;
2067         struct extent_state *cached_state = NULL;
2068         struct page *page;
2069         struct inode *inode;
2070         u64 page_start;
2071         u64 page_end;
2072         int ret;
2073
2074         fixup = container_of(work, struct btrfs_writepage_fixup, work);
2075         page = fixup->page;
2076 again:
2077         lock_page(page);
2078         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2079                 ClearPageChecked(page);
2080                 goto out_page;
2081         }
2082
2083         inode = page->mapping->host;
2084         page_start = page_offset(page);
2085         page_end = page_offset(page) + PAGE_SIZE - 1;
2086
2087         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2088                          &cached_state);
2089
2090         /* already ordered? We're done */
2091         if (PagePrivate2(page))
2092                 goto out;
2093
2094         ordered = btrfs_lookup_ordered_range(inode, page_start,
2095                                         PAGE_SIZE);
2096         if (ordered) {
2097                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2098                                      page_end, &cached_state, GFP_NOFS);
2099                 unlock_page(page);
2100                 btrfs_start_ordered_extent(inode, ordered, 1);
2101                 btrfs_put_ordered_extent(ordered);
2102                 goto again;
2103         }
2104
2105         ret = btrfs_delalloc_reserve_space(inode, page_start,
2106                                            PAGE_SIZE);
2107         if (ret) {
2108                 mapping_set_error(page->mapping, ret);
2109                 end_extent_writepage(page, ret, page_start, page_end);
2110                 ClearPageChecked(page);
2111                 goto out;
2112         }
2113
2114         ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
2115                                         &cached_state, 0);
2116         if (ret) {
2117                 mapping_set_error(page->mapping, ret);
2118                 end_extent_writepage(page, ret, page_start, page_end);
2119                 ClearPageChecked(page);
2120                 goto out;
2121         }
2122
2123         ClearPageChecked(page);
2124         set_page_dirty(page);
2125 out:
2126         unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2127                              &cached_state, GFP_NOFS);
2128 out_page:
2129         unlock_page(page);
2130         put_page(page);
2131         kfree(fixup);
2132 }
2133
2134 /*
2135  * There are a few paths in the higher layers of the kernel that directly
2136  * set the page dirty bit without asking the filesystem if it is a
2137  * good idea.  This causes problems because we want to make sure COW
2138  * properly happens and the data=ordered rules are followed.
2139  *
2140  * In our case any range that doesn't have the ORDERED bit set
2141  * hasn't been properly set up for IO.  We kick off an async process
2142  * to fix it up.  The async helper will wait for ordered extents, set
2143  * the delalloc bit and make it safe to write the page.
2144  */
2145 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2146 {
2147         struct inode *inode = page->mapping->host;
2148         struct btrfs_writepage_fixup *fixup;
2149         struct btrfs_root *root = BTRFS_I(inode)->root;
2150
2151         /* this page is properly in the ordered list */
2152         if (TestClearPagePrivate2(page))
2153                 return 0;
2154
2155         if (PageChecked(page))
2156                 return -EAGAIN;
2157
2158         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2159         if (!fixup)
2160                 return -EAGAIN;
2161
2162         SetPageChecked(page);
2163         get_page(page);
2164         btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2165                         btrfs_writepage_fixup_worker, NULL, NULL);
2166         fixup->page = page;
2167         btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
2168         return -EBUSY;
2169 }
2170
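/*
 * Insert the file extent item for an extent that was already allocated
 * and written.  Anything previously covering [file_pos, file_pos +
 * num_bytes) is dropped first, then the new extent's backref is recorded
 * and the qgroup reservation for the range is released.
 */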
2171 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2172                                        struct inode *inode, u64 file_pos,
2173                                        u64 disk_bytenr, u64 disk_num_bytes,
2174                                        u64 num_bytes, u64 ram_bytes,
2175                                        u8 compression, u8 encryption,
2176                                        u16 other_encoding, int extent_type)
2177 {
2178         struct btrfs_root *root = BTRFS_I(inode)->root;
2179         struct btrfs_file_extent_item *fi;
2180         struct btrfs_path *path;
2181         struct extent_buffer *leaf;
2182         struct btrfs_key ins;
2183         int extent_inserted = 0;
2184         int ret;
2185
2186         path = btrfs_alloc_path();
2187         if (!path)
2188                 return -ENOMEM;
2189
2190         /*
2191          * we may be replacing one extent in the tree with another.
2192          * The new extent is pinned in the extent map, and we don't want
2193          * to drop it from the cache until it is completely in the btree.
2194          *
2195          * So, tell btrfs_drop_extents to leave this extent in the cache.
2196          * The caller is expected to unpin it and allow it to be merged
2197          * with the others.
2198          */
2199         ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2200                                    file_pos + num_bytes, NULL, 0,
2201                                    1, sizeof(*fi), &extent_inserted);
2202         if (ret)
2203                 goto out;
2204
2205         if (!extent_inserted) {
2206                 ins.objectid = btrfs_ino(inode);
2207                 ins.offset = file_pos;
2208                 ins.type = BTRFS_EXTENT_DATA_KEY;
2209
2210                 path->leave_spinning = 1;
2211                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2212                                               sizeof(*fi));
2213                 if (ret)
2214                         goto out;
2215         }
2216         leaf = path->nodes[0];
2217         fi = btrfs_item_ptr(leaf, path->slots[0],
2218                             struct btrfs_file_extent_item);
2219         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2220         btrfs_set_file_extent_type(leaf, fi, extent_type);
2221         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2222         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2223         btrfs_set_file_extent_offset(leaf, fi, 0);
2224         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2225         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2226         btrfs_set_file_extent_compression(leaf, fi, compression);
2227         btrfs_set_file_extent_encryption(leaf, fi, encryption);
2228         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2229
2230         btrfs_mark_buffer_dirty(leaf);
2231         btrfs_release_path(path);
2232
2233         inode_add_bytes(inode, num_bytes);
2234
2235         ins.objectid = disk_bytenr;
2236         ins.offset = disk_num_bytes;
2237         ins.type = BTRFS_EXTENT_ITEM_KEY;
2238         ret = btrfs_alloc_reserved_file_extent(trans, root,
2239                                         root->root_key.objectid,
2240                                         btrfs_ino(inode), file_pos,
2241                                         ram_bytes, &ins);
2242         /*
2243          * Release the reserved range from inode dirty range map, as it is
2244          * already moved into delayed_ref_head
2245          */
2246         btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2247 out:
2248         btrfs_free_path(path);
2249
2250         return ret;
2251 }
2252
2253 /* snapshot-aware defrag */
2254 struct sa_defrag_extent_backref {
2255         struct rb_node node;
2256         struct old_sa_defrag_extent *old;
2257         u64 root_id;
2258         u64 inum;
2259         u64 file_pos;
2260         u64 extent_offset;
2261         u64 num_bytes;
2262         u64 generation;
2263 };
2264
2265 struct old_sa_defrag_extent {
2266         struct list_head list;
2267         struct new_sa_defrag_extent *new;
2268
2269         u64 extent_offset;
2270         u64 bytenr;
2271         u64 offset;
2272         u64 len;
2273         int count;
2274 };
2275
2276 struct new_sa_defrag_extent {
2277         struct rb_root root;
2278         struct list_head head;
2279         struct btrfs_path *path;
2280         struct inode *inode;
2281         u64 file_pos;
2282         u64 len;
2283         u64 bytenr;
2284         u64 disk_len;
2285         u8 compress_type;
2286 };
2287
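/*
 * Order two backrefs by root id, then inode number, then file position.
 * Returns the usual <0, 0, >0 comparator results; equal keys can
 * legitimately occur, see the diagram in the function body.
 */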
2288 static int backref_comp(struct sa_defrag_extent_backref *b1,
2289                         struct sa_defrag_extent_backref *b2)
2290 {
2291         if (b1->root_id < b2->root_id)
2292                 return -1;
2293         else if (b1->root_id > b2->root_id)
2294                 return 1;
2295
2296         if (b1->inum < b2->inum)
2297                 return -1;
2298         else if (b1->inum > b2->inum)
2299                 return 1;
2300
2301         if (b1->file_pos < b2->file_pos)
2302                 return -1;
2303         else if (b1->file_pos > b2->file_pos)
2304                 return 1;
2305
2306         /*
2307          * [------------------------------] ===> (a range of space)
2308          *     |<--->|   |<---->| =============> (fs/file tree A)
2309          * |<---------------------------->| ===> (fs/file tree B)
2310          *
2311          * A range of space can refer to two file extents in one tree while
2312          * referring to only one file extent in another tree.
2313          *
2314          * So we may process a disk offset more than once (two extents in A)
2315          * and land on the same extent (one extent in B), then insert two
2316          * identical backrefs (both referring to the extent in B).
2317          */
2318         return 0;
2319 }
2320
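/*
 * Insert a backref into the rbtree of the new extent, ordered by
 * backref_comp().  Entries that compare equal are linked to the right,
 * so duplicates are kept rather than replaced.
 */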
2321 static void backref_insert(struct rb_root *root,
2322                            struct sa_defrag_extent_backref *backref)
2323 {
2324         struct rb_node **p = &root->rb_node;
2325         struct rb_node *parent = NULL;
2326         struct sa_defrag_extent_backref *entry;
2327         int ret;
2328
2329         while (*p) {
2330                 parent = *p;
2331                 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2332
2333                 ret = backref_comp(backref, entry);
2334                 if (ret < 0)
2335                         p = &(*p)->rb_left;
2336                 else
2337                         p = &(*p)->rb_right;
2338         }
2339
2340         rb_link_node(&backref->node, parent, p);
2341         rb_insert_color(&backref->node, root);
2342 }
2343
2344 /*
2345  * Note the backref might have changed, and in this case we just return 0.
2346  */
2347 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2348                                        void *ctx)
2349 {
2350         struct btrfs_file_extent_item *extent;
2351         struct btrfs_fs_info *fs_info;
2352         struct old_sa_defrag_extent *old = ctx;
2353         struct new_sa_defrag_extent *new = old->new;
2354         struct btrfs_path *path = new->path;
2355         struct btrfs_key key;
2356         struct btrfs_root *root;
2357         struct sa_defrag_extent_backref *backref;
2358         struct extent_buffer *leaf;
2359         struct inode *inode = new->inode;
2360         int slot;
2361         int ret;
2362         u64 extent_offset;
2363         u64 num_bytes;
2364
2365         if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2366             inum == btrfs_ino(inode))
2367                 return 0;
2368
2369         key.objectid = root_id;
2370         key.type = BTRFS_ROOT_ITEM_KEY;
2371         key.offset = (u64)-1;
2372
2373         fs_info = BTRFS_I(inode)->root->fs_info;
2374         root = btrfs_read_fs_root_no_name(fs_info, &key);
2375         if (IS_ERR(root)) {
2376                 if (PTR_ERR(root) == -ENOENT)
2377                         return 0;
2378                 WARN_ON(1);
2379                 btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2380                          inum, offset, root_id);
2381                 return PTR_ERR(root);
2382         }
2383
2384         key.objectid = inum;
2385         key.type = BTRFS_EXTENT_DATA_KEY;
2386         if (offset > (u64)-1 << 32)
2387                 key.offset = 0;
2388         else
2389                 key.offset = offset;
2390
2391         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2392         if (WARN_ON(ret < 0))
2393                 return ret;
2394         ret = 0;
2395
2396         while (1) {
2397                 cond_resched();
2398
2399                 leaf = path->nodes[0];
2400                 slot = path->slots[0];
2401
2402                 if (slot >= btrfs_header_nritems(leaf)) {
2403                         ret = btrfs_next_leaf(root, path);
2404                         if (ret < 0) {
2405                                 goto out;
2406                         } else if (ret > 0) {
2407                                 ret = 0;
2408                                 goto out;
2409                         }
2410                         continue;
2411                 }
2412
2413                 path->slots[0]++;
2414
2415                 btrfs_item_key_to_cpu(leaf, &key, slot);
2416
2417                 if (key.objectid > inum)
2418                         goto out;
2419
2420                 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2421                         continue;
2422
2423                 extent = btrfs_item_ptr(leaf, slot,
2424                                         struct btrfs_file_extent_item);
2425
2426                 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2427                         continue;
2428
2429                 /*
2430                  * 'offset' refers to the exact key.offset,
2431                  * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2432                  * (key.offset - extent_offset).
2433                  */
2434                 if (key.offset != offset)
2435                         continue;
2436
2437                 extent_offset = btrfs_file_extent_offset(leaf, extent);
2438                 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2439
2440                 if (extent_offset >= old->extent_offset + old->offset +
2441                     old->len || extent_offset + num_bytes <=
2442                     old->extent_offset + old->offset)
2443                         continue;
2444                 break;
2445         }
2446
2447         backref = kmalloc(sizeof(*backref), GFP_NOFS);
2448         if (!backref) {
2449                 ret = -ENOMEM;
2450                 goto out;
2451         }
2452
2453         backref->root_id = root_id;
2454         backref->inum = inum;
2455         backref->file_pos = offset;
2456         backref->num_bytes = num_bytes;
2457         backref->extent_offset = extent_offset;
2458         backref->generation = btrfs_file_extent_generation(leaf, extent);
2459         backref->old = old;
2460         backref_insert(&new->root, backref);
2461         old->count++;
2462 out:
2463         btrfs_release_path(path);
2464         WARN_ON(ret);
2465         return ret;
2466 }
2467
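/*
 * Record the backrefs of every old extent in the defragged range.  Old
 * extents that no longer have any backrefs are dropped from the list.
 * Returns true if there is anything left to relink, false on error or if
 * the list ends up empty.
 */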
2468 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2469                                    struct new_sa_defrag_extent *new)
2470 {
2471         struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2472         struct old_sa_defrag_extent *old, *tmp;
2473         int ret;
2474
2475         new->path = path;
2476
2477         list_for_each_entry_safe(old, tmp, &new->head, list) {
2478                 ret = iterate_inodes_from_logical(old->bytenr +
2479                                                   old->extent_offset, fs_info,
2480                                                   path, record_one_backref,
2481                                                   old);
2482                 if (ret < 0 && ret != -ENOENT)
2483                         return false;
2484
2485                 /* no backref to be processed for this extent */
2486                 if (!old->count) {
2487                         list_del(&old->list);
2488                         kfree(old);
2489                 }
2490         }
2491
2492         if (list_empty(&new->head))
2493                 return false;
2494
2495         return true;
2496 }
2497
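/*
 * Return 1 if this file extent item can be merged with the extent being
 * relinked: same disk bytenr and compression type, a regular (non-inline)
 * extent, and no encryption or other encoding.
 */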
2498 static int relink_is_mergable(struct extent_buffer *leaf,
2499                               struct btrfs_file_extent_item *fi,
2500                               struct new_sa_defrag_extent *new)
2501 {
2502         if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2503                 return 0;
2504
2505         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2506                 return 0;
2507
2508         if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2509                 return 0;
2510
2511         if (btrfs_file_extent_encryption(leaf, fi) ||
2512             btrfs_file_extent_other_encoding(leaf, fi))
2513                 return 0;
2514
2515         return 1;
2516 }
2517
2518 /*
2519  * Note the backref might have changed, and in this case we just return 0.
2520  */
2521 static noinline int relink_extent_backref(struct btrfs_path *path,
2522                                  struct sa_defrag_extent_backref *prev,
2523                                  struct sa_defrag_extent_backref *backref)
2524 {
2525         struct btrfs_file_extent_item *extent;
2526         struct btrfs_file_extent_item *item;
2527         struct btrfs_ordered_extent *ordered;
2528         struct btrfs_trans_handle *trans;
2529         struct btrfs_fs_info *fs_info;
2530         struct btrfs_root *root;
2531         struct btrfs_key key;
2532         struct extent_buffer *leaf;
2533         struct old_sa_defrag_extent *old = backref->old;
2534         struct new_sa_defrag_extent *new = old->new;
2535         struct inode *src_inode = new->inode;
2536         struct inode *inode;
2537         struct extent_state *cached = NULL;
2538         int ret = 0;
2539         u64 start;
2540         u64 len;
2541         u64 lock_start;
2542         u64 lock_end;
2543         bool merge = false;
2544         int index;
2545
2546         if (prev && prev->root_id == backref->root_id &&
2547             prev->inum == backref->inum &&
2548             prev->file_pos + prev->num_bytes == backref->file_pos)
2549                 merge = true;
2550
2551         /* step 1: get root */
2552         key.objectid = backref->root_id;
2553         key.type = BTRFS_ROOT_ITEM_KEY;
2554         key.offset = (u64)-1;
2555
2556         fs_info = BTRFS_I(src_inode)->root->fs_info;
2557         index = srcu_read_lock(&fs_info->subvol_srcu);
2558
2559         root = btrfs_read_fs_root_no_name(fs_info, &key);
2560         if (IS_ERR(root)) {
2561                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2562                 if (PTR_ERR(root) == -ENOENT)
2563                         return 0;
2564                 return PTR_ERR(root);
2565         }
2566
2567         if (btrfs_root_readonly(root)) {
2568                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2569                 return 0;
2570         }
2571
2572         /* step 2: get inode */
2573         key.objectid = backref->inum;
2574         key.type = BTRFS_INODE_ITEM_KEY;
2575         key.offset = 0;
2576
2577         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2578         if (IS_ERR(inode)) {
2579                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2580                 return 0;
2581         }
2582
2583         srcu_read_unlock(&fs_info->subvol_srcu, index);
2584
2585         /* step 3: relink backref */
2586         lock_start = backref->file_pos;
2587         lock_end = backref->file_pos + backref->num_bytes - 1;
2588         lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2589                          &cached);
2590
2591         ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2592         if (ordered) {
2593                 btrfs_put_ordered_extent(ordered);
2594                 goto out_unlock;
2595         }
2596
2597         trans = btrfs_join_transaction(root);
2598         if (IS_ERR(trans)) {
2599                 ret = PTR_ERR(trans);
2600                 goto out_unlock;
2601         }
2602
2603         key.objectid = backref->inum;
2604         key.type = BTRFS_EXTENT_DATA_KEY;
2605         key.offset = backref->file_pos;
2606
2607         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2608         if (ret < 0) {
2609                 goto out_free_path;
2610         } else if (ret > 0) {
2611                 ret = 0;
2612                 goto out_free_path;
2613         }
2614
2615         extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2616                                 struct btrfs_file_extent_item);
2617
2618         if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2619             backref->generation)
2620                 goto out_free_path;
2621
2622         btrfs_release_path(path);
2623
2624         start = backref->file_pos;
2625         if (backref->extent_offset < old->extent_offset + old->offset)
2626                 start += old->extent_offset + old->offset -
2627                          backref->extent_offset;
2628
2629         len = min(backref->extent_offset + backref->num_bytes,
2630                   old->extent_offset + old->offset + old->len);
2631         len -= max(backref->extent_offset, old->extent_offset + old->offset);
2632
2633         ret = btrfs_drop_extents(trans, root, inode, start,
2634                                  start + len, 1);
2635         if (ret)
2636                 goto out_free_path;
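        /*
         * The old extents are dropped; now insert the relinked range.  When
         * this backref is contiguous with the previous one, first try to
         * extend the file extent item that ends exactly at 'start', and only
         * fall back to inserting a fresh BTRFS_FILE_EXTENT_REG item if that
         * merge is not possible.
         */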
2637 again:
2638         key.objectid = btrfs_ino(inode);
2639         key.type = BTRFS_EXTENT_DATA_KEY;
2640         key.offset = start;
2641
2642         path->leave_spinning = 1;
2643         if (merge) {
2644                 struct btrfs_file_extent_item *fi;
2645                 u64 extent_len;
2646                 struct btrfs_key found_key;
2647
2648                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2649                 if (ret < 0)
2650                         goto out_free_path;
2651
2652                 path->slots[0]--;
2653                 leaf = path->nodes[0];
2654                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2655
2656                 fi = btrfs_item_ptr(leaf, path->slots[0],
2657                                     struct btrfs_file_extent_item);
2658                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2659
2660                 if (extent_len + found_key.offset == start &&
2661                     relink_is_mergable(leaf, fi, new)) {
2662                         btrfs_set_file_extent_num_bytes(leaf, fi,
2663                                                         extent_len + len);
2664                         btrfs_mark_buffer_dirty(leaf);
2665                         inode_add_bytes(inode, len);
2666
2667                         ret = 1;
2668                         goto out_free_path;
2669                 } else {
2670                         merge = false;
2671                         btrfs_release_path(path);
2672                         goto again;
2673                 }
2674         }
2675
2676         ret = btrfs_insert_empty_item(trans, root, path, &key,
2677                                         sizeof(*extent));
2678         if (ret) {
2679                 btrfs_abort_transaction(trans, ret);
2680                 goto out_free_path;
2681         }
2682
2683         leaf = path->nodes[0];
2684         item = btrfs_item_ptr(leaf, path->slots[0],
2685                                 struct btrfs_file_extent_item);
2686         btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2687         btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2688         btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2689         btrfs_set_file_extent_num_bytes(leaf, item, len);
2690         btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2691         btrfs_set_file_extent_generation(leaf, item, trans->transid);
2692         btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2693         btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2694         btrfs_set_file_extent_encryption(leaf, item, 0);
2695         btrfs_set_file_extent_other_encoding(leaf, item, 0);
2696
2697         btrfs_mark_buffer_dirty(leaf);
2698         inode_add_bytes(inode, len);
2699         btrfs_release_path(path);
2700
2701         ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2702                         new->disk_len, 0,
2703                         backref->root_id, backref->inum,
2704                         new->file_pos); /* start - extent_offset */
2705         if (ret) {
2706                 btrfs_abort_transaction(trans, ret);
2707                 goto out_free_path;
2708         }
2709
2710         ret = 1;
2711 out_free_path:
2712         btrfs_release_path(path);
2713         path->leave_spinning = 0;
2714         btrfs_end_transaction(trans, root);
2715 out_unlock:
2716         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2717                              &cached, GFP_NOFS);
2718         iput(inode);
2719         return ret;
2720 }
2721
2722 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2723 {
2724         struct old_sa_defrag_extent *old, *tmp;
2725
2726         if (!new)
2727                 return;
2728
2729         list_for_each_entry_safe(old, tmp, &new->head, list) {
2730                 kfree(old);
2731         }
2732         kfree(new);
2733 }
2734
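/*
 * Relink every recorded backref to the new (defragged) extent.  Backrefs
 * are pulled from the rbtree in sorted order, and 'prev' remembers the
 * last successfully relinked backref so relink_extent_backref() can merge
 * contiguous ranges belonging to the same inode.
 */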
2735 static void relink_file_extents(struct new_sa_defrag_extent *new)
2736 {
2737         struct btrfs_path *path;
2738         struct sa_defrag_extent_backref *backref;
2739         struct sa_defrag_extent_backref *prev = NULL;
2740         struct inode *inode;
2741         struct btrfs_root *root;
2742         struct rb_node *node;
2743         int ret;
2744
2745         inode = new->inode;
2746         root = BTRFS_I(inode)->root;
2747
2748         path = btrfs_alloc_path();
2749         if (!path)
2750                 return;
2751
2752         if (!record_extent_backrefs(path, new)) {
2753                 btrfs_free_path(path);
2754                 goto out;
2755         }
2756         btrfs_release_path(path);
2757
2758         while (1) {
2759                 node = rb_first(&new->root);
2760                 if (!node)
2761                         break;
2762                 rb_erase(node, &new->root);
2763
2764                 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2765
2766                 ret = relink_extent_backref(path, prev, backref);
2767                 WARN_ON(ret < 0);
2768
2769                 kfree(prev);
2770
2771                 if (ret == 1)
2772                         prev = backref;
2773                 else
2774                         prev = NULL;
2775                 cond_resched();
2776         }
2777         kfree(prev);
2778
2779         btrfs_free_path(path);
2780 out:
2781         free_sa_defrag_extent(new);
2782
2783         atomic_dec(&root->fs_info->defrag_running);
2784         wake_up(&root->fs_info->transaction_wait);
2785 }
2786
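/*
 * Snapshot-aware defrag: walk the file extent items overlapping the range
 * covered by 'ordered' and remember each old extent, so that once the new
 * extent is on disk the old references can be pointed at it again by
 * relink_file_extents().
 */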
2787 static struct new_sa_defrag_extent *
2788 record_old_file_extents(struct inode *inode,
2789                         struct btrfs_ordered_extent *ordered)
2790 {
2791         struct btrfs_root *root = BTRFS_I(inode)->root;
2792         struct btrfs_path *path;
2793         struct btrfs_key key;
2794         struct old_sa_defrag_extent *old;
2795         struct new_sa_defrag_extent *new;
2796         int ret;
2797
2798         new = kmalloc(sizeof(*new), GFP_NOFS);
2799         if (!new)
2800                 return NULL;
2801
2802         new->inode = inode;
2803         new->file_pos = ordered->file_offset;
2804         new->len = ordered->len;
2805         new->bytenr = ordered->start;
2806         new->disk_len = ordered->disk_len;
2807         new->compress_type = ordered->compress_type;
2808         new->root = RB_ROOT;
2809         INIT_LIST_HEAD(&new->head);
2810
2811         path = btrfs_alloc_path();
2812         if (!path)
2813                 goto out_kfree;
2814
2815         key.objectid = btrfs_ino(inode);
2816         key.type = BTRFS_EXTENT_DATA_KEY;
2817         key.offset = new->file_pos;
2818
2819         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2820         if (ret < 0)
2821                 goto out_free_path;
2822         if (ret > 0 && path->slots[0] > 0)
2823                 path->slots[0]--;
2824
2825         /* find out all the old extents for the file range */
2826         while (1) {
2827                 struct btrfs_file_extent_item *extent;
2828                 struct extent_buffer *l;
2829                 int slot;
2830                 u64 num_bytes;
2831                 u64 offset;
2832                 u64 end;
2833                 u64 disk_bytenr;
2834                 u64 extent_offset;
2835
2836                 l = path->nodes[0];
2837                 slot = path->slots[0];
2838
2839                 if (slot >= btrfs_header_nritems(l)) {
2840                         ret = btrfs_next_leaf(root, path);
2841                         if (ret < 0)
2842                                 goto out_free_path;
2843                         else if (ret > 0)
2844                                 break;
2845                         continue;
2846                 }
2847
2848                 btrfs_item_key_to_cpu(l, &key, slot);
2849
2850                 if (key.objectid != btrfs_ino(inode))
2851                         break;
2852                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2853                         break;
2854                 if (key.offset >= new->file_pos + new->len)
2855                         break;
2856
2857                 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2858
2859                 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2860                 if (key.offset + num_bytes < new->file_pos)
2861                         goto next;
2862
2863                 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2864                 if (!disk_bytenr)
2865                         goto next;
2866
2867                 extent_offset = btrfs_file_extent_offset(l, extent);
2868
2869                 old = kmalloc(sizeof(*old), GFP_NOFS);
2870                 if (!old)
2871                         goto out_free_path;
2872
2873                 offset = max(new->file_pos, key.offset);
2874                 end = min(new->file_pos + new->len, key.offset + num_bytes);
2875
2876                 old->bytenr = disk_bytenr;
2877                 old->extent_offset = extent_offset;
2878                 old->offset = offset - key.offset;
2879                 old->len = end - offset;
2880                 old->new = new;
2881                 old->count = 0;
2882                 list_add_tail(&old->list, &new->head);
2883 next:
2884                 path->slots[0]++;
2885                 cond_resched();
2886         }
2887
2888         btrfs_free_path(path);
2889         atomic_inc(&root->fs_info->defrag_running);
2890
2891         return new;
2892
2893 out_free_path:
2894         btrfs_free_path(path);
2895 out_kfree:
2896         free_sa_defrag_extent(new);
2897         return NULL;
2898 }
2899
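/*
 * The block group's delalloc_bytes counter was bumped when this extent was
 * allocated for a delalloc write; drop it again once the file extent item
 * for the range has been inserted (see btrfs_finish_ordered_io()).
 */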
2900 static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
2901                                          u64 start, u64 len)
2902 {
2903         struct btrfs_block_group_cache *cache;
2904
2905         cache = btrfs_lookup_block_group(root->fs_info, start);
2906         ASSERT(cache);
2907
2908         spin_lock(&cache->lock);
2909         cache->delalloc_bytes -= len;
2910         spin_unlock(&cache->lock);
2911
2912         btrfs_put_block_group(cache);
2913 }
2914
2915 /* As ordered data IO finishes, this gets called so we can finish
2916  * an ordered extent once the range of bytes in the file it covers is
2917  * fully written.
2918  */
2919 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2920 {
2921         struct inode *inode = ordered_extent->inode;
2922         struct btrfs_root *root = BTRFS_I(inode)->root;
2923         struct btrfs_trans_handle *trans = NULL;
2924         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2925         struct extent_state *cached_state = NULL;
2926         struct new_sa_defrag_extent *new = NULL;
2927         int compress_type = 0;
2928         int ret = 0;
2929         u64 logical_len = ordered_extent->len;
2930         bool nolock;
2931         bool truncated = false;
2932
2933         nolock = btrfs_is_free_space_inode(inode);
2934
2935         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2936                 ret = -EIO;
2937                 goto out;
2938         }
2939
2940         btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
2941                                      ordered_extent->file_offset +
2942                                      ordered_extent->len - 1);
2943
2944         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2945                 truncated = true;
2946                 logical_len = ordered_extent->truncated_len;
2947                 /* The entire extent was truncated, don't bother adding it */
2948                 if (!logical_len)
2949                         goto out;
2950         }
2951
2952         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2953                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2954
2955                 /*
2956                  * For the mwrite (mmap + memset to write) case, we still
2957                  * reserve space for the NOCOW range.  As NOCOW won't cause
2958                  * a new delayed ref, just free the reserved space.
2959                  */
2960                 btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
2961                                        ordered_extent->len);
2962                 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2963                 if (nolock)
2964                         trans = btrfs_join_transaction_nolock(root);
2965                 else
2966                         trans = btrfs_join_transaction(root);
2967                 if (IS_ERR(trans)) {
2968                         ret = PTR_ERR(trans);
2969                         trans = NULL;
2970                         goto out;
2971                 }
2972                 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2973                 ret = btrfs_update_inode_fallback(trans, root, inode);
2974                 if (ret) /* -ENOMEM or corruption */
2975                         btrfs_abort_transaction(trans, ret);
2976                 goto out;
2977         }
2978
2979         lock_extent_bits(io_tree, ordered_extent->file_offset,
2980                          ordered_extent->file_offset + ordered_extent->len - 1,
2981                          &cached_state);
2982
2983         ret = test_range_bit(io_tree, ordered_extent->file_offset,
2984                         ordered_extent->file_offset + ordered_extent->len - 1,
2985                         EXTENT_DEFRAG, 0, cached_state);
2986         if (ret) {
2987                 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
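                /*
                 * Snapshot-aware defrag is currently disabled: the "0 &&"
                 * below short-circuits the generation check, so
                 * record_old_file_extents() is never called here and the
                 * relink machinery above stays dormant.
                 */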
2988                 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
2989                         /* the inode is shared */
2990                         new = record_old_file_extents(inode, ordered_extent);
2991
2992                 clear_extent_bit(io_tree, ordered_extent->file_offset,
2993                         ordered_extent->file_offset + ordered_extent->len - 1,
2994                         EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2995         }
2996
2997         if (nolock)
2998                 trans = btrfs_join_transaction_nolock(root);
2999         else
3000                 trans = btrfs_join_transaction(root);
3001         if (IS_ERR(trans)) {
3002                 ret = PTR_ERR(trans);
3003                 trans = NULL;
3004                 goto out_unlock;
3005         }
3006
3007         trans->block_rsv = &root->fs_info->delalloc_block_rsv;
3008
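        /*
         * A write into a preallocated extent only needs the existing extent
         * flipped from prealloc to written; any other (COW, possibly
         * compressed) write needs a new file extent item plus a matching
         * release of the block group's delalloc counter.
         */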
3009         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3010                 compress_type = ordered_extent->compress_type;
3011         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3012                 BUG_ON(compress_type);
3013                 ret = btrfs_mark_extent_written(trans, inode,
3014                                                 ordered_extent->file_offset,
3015                                                 ordered_extent->file_offset +
3016                                                 logical_len);
3017         } else {
3018                 BUG_ON(root == root->fs_info->tree_root);
3019                 ret = insert_reserved_file_extent(trans, inode,
3020                                                 ordered_extent->file_offset,
3021                                                 ordered_extent->start,
3022                                                 ordered_extent->disk_len,
3023                                                 logical_len, logical_len,
3024                                                 compress_type, 0, 0,
3025                                                 BTRFS_FILE_EXTENT_REG);
3026                 if (!ret)
3027                         btrfs_release_delalloc_bytes(root,
3028                                                      ordered_extent->start,
3029                                                      ordered_extent->disk_len);
3030         }
3031         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3032                            ordered_extent->file_offset, ordered_extent->len,
3033                            trans->transid);
3034         if (ret < 0) {
3035                 btrfs_abort_transaction(trans, ret);
3036                 goto out_unlock;
3037         }
3038
3039         add_pending_csums(trans, inode, ordered_extent->file_offset,
3040                           &ordered_extent->list);
3041
3042         btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3043         ret = btrfs_update_inode_fallback(trans, root, inode);
3044         if (ret) { /* -ENOMEM or corruption */
3045                 btrfs_abort_transaction(trans, ret);
3046                 goto out_unlock;
3047         }
3048         ret = 0;
3049 out_unlock:
3050         unlock_extent_cached(io_tree, ordered_extent->file_offset,
3051                              ordered_extent->file_offset +
3052                              ordered_extent->len - 1, &cached_state, GFP_NOFS);
3053 out:
3054         if (root != root->fs_info->tree_root)
3055                 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
3056         if (trans)
3057                 btrfs_end_transaction(trans, root);
3058
3059         if (ret || truncated) {
3060                 u64 start, end;
3061
3062                 if (truncated)
3063                         start = ordered_extent->file_offset + logical_len;
3064                 else
3065                         start = ordered_extent->file_offset;
3066                 end = ordered_extent->file_offset + ordered_extent->len - 1;
3067                 clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
3068
3069                 /* Drop the cache for the part of the extent we didn't write. */
3070                 btrfs_drop_extent_cache(inode, start, end, 0);
3071
3072                 /*
3073                  * If the ordered extent had an IOERR or something else went
3074                  * wrong we need to return the space for this ordered extent
3075                  * back to the allocator.  We only free the extent in the
3076                  * truncated case if we didn't write out the extent at all.
3077                  */
3078                 if ((ret || !logical_len) &&
3079                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3080                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
3081                         btrfs_free_reserved_extent(root, ordered_extent->start,
3082                                                    ordered_extent->disk_len, 1);
3083         }
3084
3086         /*
3087          * This needs to be done to make sure anybody waiting knows we are done
3088          * updating everything for this ordered extent.
3089          */
3090         btrfs_remove_ordered_extent(inode, ordered_extent);
3091
3092         /* for snapshot-aware defrag */
3093         if (new) {
3094                 if (ret) {
3095                         free_sa_defrag_extent(new);
3096                         atomic_dec(&root->fs_info->defrag_running);
3097                 } else {
3098                         relink_file_extents(new);
3099                 }
3100         }
3101
3102         /* once for us */
3103         btrfs_put_ordered_extent(ordered_extent);
3104         /* once for the tree */
3105         btrfs_put_ordered_extent(ordered_extent);
3106
3107         return ret;
3108 }
3109
3110 static void finish_ordered_fn(struct btrfs_work *work)
3111 {
3112         struct btrfs_ordered_extent *ordered_extent;
3113         ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3114         btrfs_finish_ordered_io(ordered_extent);
3115 }
3116
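/*
 * Writeback completion hook: once the last byte of an ordered extent has
 * been written, queue it on the appropriate endio workqueue so that
 * btrfs_finish_ordered_io() runs in process context instead of the bio
 * completion path.
 */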
3117 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
3118                                 struct extent_state *state, int uptodate)
3119 {
3120         struct inode *inode = page->mapping->host;
3121         struct btrfs_root *root = BTRFS_I(inode)->root;
3122         struct btrfs_ordered_extent *ordered_extent = NULL;
3123         struct btrfs_workqueue *wq;
3124         btrfs_work_func_t func;
3125
3126         trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
3127
3128         ClearPagePrivate2(page);
3129         if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
3130                                             end - start + 1, uptodate))
3131                 return 0;
3132
3133         if (btrfs_is_free_space_inode(inode)) {
3134                 wq = root->fs_info->endio_freespace_worker;
3135                 func = btrfs_freespace_write_helper;
3136         } else {
3137                 wq = root->fs_info->endio_write_workers;
3138                 func = btrfs_endio_write_helper;
3139         }
3140
3141         btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3142                         NULL);
3143         btrfs_queue_work(wq, &ordered_extent->work);
3144
3145         return 0;
3146 }
3147
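/*
 * Verify one sector's data checksum against the csum array carried in the
 * btrfs_io_bio.  On mismatch the range is overwritten with a non-zero
 * pattern so the bad data is never exposed, and -EIO is returned unless
 * the expected csum is 0, in which case the failure is ignored.
 */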
3148 static int __readpage_endio_check(struct inode *inode,
3149                                   struct btrfs_io_bio *io_bio,
3150                                   int icsum, struct page *page,
3151                                   int pgoff, u64 start, size_t len)
3152 {
3153         char *kaddr;
3154         u32 csum_expected;
3155         u32 csum = ~(u32)0;
3156
3157         csum_expected = *(((u32 *)io_bio->csum) + icsum);
3158
3159         kaddr = kmap_atomic(page);
3160         csum = btrfs_csum_data(kaddr + pgoff, csum, len);
3161         btrfs_csum_final(csum, (char *)&csum);
3162         if (csum != csum_expected)
3163                 goto zeroit;
3164
3165         kunmap_atomic(kaddr);
3166         return 0;
3167 zeroit:
3168         btrfs_warn_rl(BTRFS_I(inode)->root->fs_info,
3169                 "csum failed ino %llu off %llu csum %u expected csum %u",
3170                            btrfs_ino(inode), start, csum, csum_expected);
3171         memset(kaddr + pgoff, 1, len);
3172         flush_dcache_page(page);
3173         kunmap_atomic(kaddr);
3174         if (csum_expected == 0)
3175                 return 0;
3176         return -EIO;
3177 }
3178
3179 /*
3180  * when reads are done, we need to check csums to verify the data is correct.
3181  * if there's a match, we allow the bio to finish.  If not, the code in
3182  * extent_io.c will try to find good copies for us.
3183  */
3184 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
3185                                       u64 phy_offset, struct page *page,
3186                                       u64 start, u64 end, int mirror)
3187 {
3188         size_t offset = start - page_offset(page);
3189         struct inode *inode = page->mapping->host;
3190         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3191         struct btrfs_root *root = BTRFS_I(inode)->root;
3192
3193         if (PageChecked(page)) {
3194                 ClearPageChecked(page);
3195                 return 0;
3196         }
3197
3198         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3199                 return 0;
3200
3201         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
3202             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
3203                 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
3204                 return 0;
3205         }
3206
3207         phy_offset >>= inode->i_sb->s_blocksize_bits;
3208         return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
3209                                       start, (size_t)(end - start + 1));
3210 }
3211
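/*
 * Drop an inode reference without risking inode eviction in the caller's
 * context.  If this is not the last reference the count is simply
 * decremented; otherwise the inode is queued so that the final iput()
 * happens later from btrfs_run_delayed_iputs().  Callers on IO completion
 * paths use btrfs_add_delayed_iput(inode) in place of a plain iput(inode).
 */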
3212 void btrfs_add_delayed_iput(struct inode *inode)
3213 {
3214         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
3215         struct btrfs_inode *binode = BTRFS_I(inode);
3216
3217         if (atomic_add_unless(&inode->i_count, -1, 1))
3218                 return;
3219
3220         spin_lock(&fs_info->delayed_iput_lock);
3221         if (binode->delayed_iput_count == 0) {
3222                 ASSERT(list_empty(&binode->delayed_iput));
3223                 list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
3224         } else {
3225                 binode->delayed_iput_count++;
3226         }
3227         spin_unlock(&fs_info->delayed_iput_lock);
3228 }
3229
3230 void btrfs_run_delayed_iputs(struct btrfs_root *root)
3231 {
3232         struct btrfs_fs_info *fs_info = root->fs_info;
3233
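        /*
         * The lock is dropped around each iput() since the final iput can
         * evict the inode and sleep; delayed_iput_count keeps an inode that
         * was queued again in the meantime on the list until all of its
         * pending iputs have been issued.
         */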
3234         spin_lock(&fs_info->delayed_iput_lock);
3235         while (!list_empty(&fs_info->delayed_iputs)) {
3236                 struct btrfs_inode *inode;
3237
3238                 inode = list_first_entry(&fs_info->delayed_iputs,
3239                                 struct btrfs_inode, delayed_iput);
3240                 if (inode->delayed_iput_count) {
3241                         inode->delayed_iput_count--;
3242                         list_move_tail(&inode->delayed_iput,
3243                                         &fs_info->delayed_iputs);
3244                 } else {
3245                         list_del_init(&inode->delayed_iput);
3246                 }
3247                 spin_unlock(&fs_info->delayed_iput_lock);
3248                 iput(&inode->vfs_inode);
3249                 spin_lock(&fs_info->delayed_iput_lock);
3250         }
3251         spin_unlock(&fs_info->delayed_iput_lock);
3252 }
3253
3254 /*
3255  * This is called at transaction commit time. If there are no orphan
3256  * files left in the subvolume, it removes the orphan item and frees
3257  * the block_rsv structure.
3258  */
3259 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
3260                               struct btrfs_root *root)
3261 {
3262         struct btrfs_block_rsv *block_rsv;
3263         int ret;
3264
3265         if (atomic_read(&root->orphan_inodes) ||
3266             root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
3267                 return;
3268
3269         spin_lock(&root->orphan_lock);
3270         if (atomic_read(&root->orphan_inodes)) {
3271                 spin_unlock(&root->orphan_lock);
3272                 return;
3273         }
3274
3275         if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
3276                 spin_unlock(&root->orphan_lock);
3277                 return;
3278         }
3279
3280         block_rsv = root->orphan_block_rsv;
3281         root->orphan_block_rsv = NULL;
3282         spin_unlock(&root->orphan_lock);
3283
3284         if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
3285             btrfs_root_refs(&root->root_item) > 0) {
3286                 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
3287                                             root->root_key.objectid);
3288                 if (ret)
3289                         btrfs_abort_transaction(trans, ret);
3290                 else
3291                         clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
3292                                   &root->state);
3293         }
3294
3295         if (block_rsv) {
3296                 WARN_ON(block_rsv->size > 0);
3297                 btrfs_free_block_rsv(root, block_rsv);
3298         }
3299 }
3300
3301 /*
3302  * This creates an orphan entry for the given inode in case something goes
3303  * wrong in the middle of an unlink/truncate.
3304  *
3305  * NOTE: caller of this function should reserve 5 units of metadata for
3306  *       this function.
3307  */
3308 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
3309 {
3310         struct btrfs_root *root = BTRFS_I(inode)->root;
3311         struct btrfs_block_rsv *block_rsv = NULL;
3312         int reserve = 0;
3313         int insert = 0;
3314         int ret;
3315
3316         if (!root->orphan_block_rsv) {
3317                 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3318                 if (!block_rsv)
3319                         return -ENOMEM;
3320         }
3321
3322         spin_lock(&root->orphan_lock);
3323         if (!root->orphan_block_rsv) {
3324                 root->orphan_block_rsv = block_rsv;
3325         } else if (block_rsv) {
3326                 btrfs_free_block_rsv(root, block_rsv);
3327                 block_rsv = NULL;
3328         }
3329
3330         if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3331                               &BTRFS_I(inode)->runtime_flags)) {
3332 #if 0
3333                 /*
3334                  * For proper ENOSPC handling, we should do orphan
3335                  * cleanup when mounting. But this introduces backward
3336                  * compatibility issue.
3337                  */
3338                 if (!xchg(&root->orphan_item_inserted, 1))
3339                         insert = 2;
3340                 else
3341                         insert = 1;
3342 #endif
3343                 insert = 1;
3344                 atomic_inc(&root->orphan_inodes);
3345         }
3346
3347         if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3348                               &BTRFS_I(inode)->runtime_flags))
3349                 reserve = 1;
3350         spin_unlock(&root->orphan_lock);
3351
3352         /* grab metadata reservation from transaction handle */
3353         if (reserve) {
3354                 ret = btrfs_orphan_reserve_metadata(trans, inode);
3355                 ASSERT(!ret);
3356                 if (ret) {
3357                         atomic_dec(&root->orphan_inodes);
3358                         clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3359                                   &BTRFS_I(inode)->runtime_flags);
3360                         if (insert)
3361                                 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3362                                           &BTRFS_I(inode)->runtime_flags);
3363                         return ret;
3364                 }
3365         }
3366
3367         /* insert an orphan item to track this unlinked/truncated file */
3368         if (insert >= 1) {
3369                 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
3370                 if (ret) {
3371                         atomic_dec(&root->orphan_inodes);
3372                         if (reserve) {
3373                                 clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3374                                           &BTRFS_I(inode)->runtime_flags);
3375                                 btrfs_orphan_release_metadata(inode);
3376                         }
3377                         if (ret != -EEXIST) {
3378                                 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3379                                           &BTRFS_I(inode)->runtime_flags);
3380                                 btrfs_abort_transaction(trans, ret);
3381                                 return ret;
3382                         }
3383                 }
3384                 ret = 0;
3385         }
3386
3387         /* insert an orphan item to record that the subvolume contains orphan files */
3388         if (insert >= 2) {
3389                 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
3390                                                root->root_key.objectid);
3391                 if (ret && ret != -EEXIST) {
3392                         btrfs_abort_transaction(trans, ret);
3393                         return ret;
3394                 }
3395         }
3396         return 0;
3397 }
3398
3399 /*
3400  * We have done the truncate/delete so we can go ahead and remove the orphan
3401  * item for this particular inode.
3402  */
3403 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3404                             struct inode *inode)
3405 {
3406         struct btrfs_root *root = BTRFS_I(inode)->root;
3407         int delete_item = 0;
3408         int release_rsv = 0;
3409         int ret = 0;
3410
3411         spin_lock(&root->orphan_lock);
3412         if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3413                                &BTRFS_I(inode)->runtime_flags))
3414                 delete_item = 1;
3415
3416         if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3417                                &BTRFS_I(inode)->runtime_flags))
3418                 release_rsv = 1;
3419         spin_unlock(&root->orphan_lock);
3420
3421         if (delete_item) {
3422                 atomic_dec(&root->orphan_inodes);
3423                 if (trans)
3424                         ret = btrfs_del_orphan_item(trans, root,
3425                                                     btrfs_ino(inode));
3426         }
3427
3428         if (release_rsv)
3429                 btrfs_orphan_release_metadata(inode);
3430
3431         return ret;
3432 }
3433
3434 /*
3435  * this cleans up any orphans that may be left on the list from the last use
3436  * of this root.
3437  */
3438 int btrfs_orphan_cleanup(struct btrfs_root *root)
3439 {
3440         struct btrfs_path *path;
3441         struct extent_buffer *leaf;
3442         struct btrfs_key key, found_key;
3443         struct btrfs_trans_handle *trans;
3444         struct inode *inode;
3445         u64 last_objectid = 0;
3446         int ret = 0, nr_unlink = 0, nr_truncate = 0;
3447
3448         if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3449                 return 0;
3450
3451         path = btrfs_alloc_path();
3452         if (!path) {
3453                 ret = -ENOMEM;
3454                 goto out;
3455         }
3456         path->reada = READA_BACK;
3457
3458         key.objectid = BTRFS_ORPHAN_OBJECTID;
3459         key.type = BTRFS_ORPHAN_ITEM_KEY;
3460         key.offset = (u64)-1;
3461
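        /*
         * Orphan items are keyed as (BTRFS_ORPHAN_OBJECTID,
         * BTRFS_ORPHAN_ITEM_KEY, inode number); a lookup for a single
         * inode would use, roughly:
         *
         *      key.objectid = BTRFS_ORPHAN_OBJECTID;
         *      key.type = BTRFS_ORPHAN_ITEM_KEY;
         *      key.offset = ino;
         *
         * Here we search from the highest possible offset instead and walk
         * backwards, visiting every orphaned inode of this root.
         */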
3462         while (1) {
3463                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3464                 if (ret < 0)
3465                         goto out;
3466
3467                 /*
3468                  * ret == 0 means we found exactly what we were searching for,
3469                  * which is weird but possible; so only adjust the path if we
3470                  * didn't find the key, and check whether what's there matches.
3471                  */
3472                 if (ret > 0) {
3473                         ret = 0;
3474                         if (path->slots[0] == 0)
3475                                 break;
3476                         path->slots[0]--;
3477                 }
3478
3479                 /* pull out the item */
3480                 leaf = path->nodes[0];
3481                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3482
3483                 /* make sure the item matches what we want */
3484                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3485                         break;
3486                 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3487                         break;
3488
3489                 /* release the path since we're done with it */
3490                 btrfs_release_path(path);
3491
3492                 /*
3493                  * this is basically btrfs_lookup, without the crossing-root
3494                  * part.  we store the inode number in the offset of the
3495                  * orphan item.
3496                  */
3497
3498                 if (found_key.offset == last_objectid) {
3499                         btrfs_err(root->fs_info,
3500                                 "Error removing orphan entry, stopping orphan cleanup");
3501                         ret = -EINVAL;
3502                         goto out;
3503                 }
3504
3505                 last_objectid = found_key.offset;
3506
3507                 found_key.objectid = found_key.offset;
3508                 found_key.type = BTRFS_INODE_ITEM_KEY;
3509                 found_key.offset = 0;
3510                 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
3511                 ret = PTR_ERR_OR_ZERO(inode);
3512                 if (ret && ret != -ENOENT)
3513                         goto out;
3514
3515                 if (ret == -ENOENT && root == root->fs_info->tree_root) {
3516                         struct btrfs_root *dead_root;
3517                         struct btrfs_fs_info *fs_info = root->fs_info;
3518                         int is_dead_root = 0;
3519
3520                         /*
3521                          * this is an orphan in the tree root. Currently these
3522                          * could come from 2 sources:
3523                          *  a) a snapshot deletion in progress
3524                          *  b) a free space cache inode
3525                          * We need to distinguish those two, as the snapshot
3526                          * orphan must not get deleted.
3527                          * find_dead_roots already ran before us, so if this
3528                          * is a snapshot deletion, we should find the root
3529                          * in the dead_roots list
3530                          */
3531                         spin_lock(&fs_info->trans_lock);
3532                         list_for_each_entry(dead_root, &fs_info->dead_roots,
3533                                             root_list) {
3534                                 if (dead_root->root_key.objectid ==
3535                                     found_key.objectid) {
3536                                         is_dead_root = 1;
3537                                         break;
3538                                 }
3539                         }
3540                         spin_unlock(&fs_info->trans_lock);
3541                         if (is_dead_root) {
3542                                 /* prevent this orphan from being found again */
3543                                 key.offset = found_key.objectid - 1;
3544                                 continue;
3545                         }
3546                 }
3547                 /*
3548                  * Inode is already gone but the orphan item is still there,
3549                  * kill the orphan item.
3550                  */
3551                 if (ret == -ENOENT) {
3552                         trans = btrfs_start_transaction(root, 1);
3553                         if (IS_ERR(trans)) {
3554                                 ret = PTR_ERR(trans);
3555                                 goto out;
3556                         }
3557                         btrfs_debug(root->fs_info, "auto deleting %Lu",
3558                                 found_key.objectid);
3559                         ret = btrfs_del_orphan_item(trans, root,
3560                                                     found_key.objectid);
3561                         btrfs_end_transaction(trans, root);
3562                         if (ret)
3563                                 goto out;
3564                         continue;
3565                 }
3566
3567                 /*
3568                  * add this inode to the orphan list so btrfs_orphan_del does
3569                  * the proper thing when we hit it
3570                  */
3571                 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3572                         &BTRFS_I(inode)->runtime_flags);
3573                 atomic_inc(&root->orphan_inodes);
3574
3575                 /* if we still have links, this was a truncate, let's do that */
3576                 if (inode->i_nlink) {
3577                         if (WARN_ON(!S_ISREG(inode->i_mode))) {
3578                                 iput(inode);
3579                                 continue;
3580                         }
3581                         nr_truncate++;
3582
3583                         /* 1 for the orphan item deletion. */
3584                         trans = btrfs_start_transaction(root, 1);
3585                         if (IS_ERR(trans)) {
3586                                 iput(inode);
3587                                 ret = PTR_ERR(trans);
3588                                 goto out;
3589                         }
3590                         ret = btrfs_orphan_add(trans, inode);
3591                         btrfs_end_transaction(trans, root);
3592                         if (ret) {
3593                                 iput(inode);
3594                                 goto out;
3595                         }
3596
3597                         ret = btrfs_truncate(inode);
3598                         if (ret)
3599                                 btrfs_orphan_del(NULL, inode);
3600                 } else {
3601                         nr_unlink++;
3602                 }
3603
3604                 /* this will do delete_inode and everything for us */
3605                 iput(inode);
3606                 if (ret)
3607                         goto out;
3608         }
3609         /* release the path since we're done with it */
3610         btrfs_release_path(path);
3611
3612         root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3613
3614         if (root->orphan_block_rsv)
3615                 btrfs_block_rsv_release(root, root->orphan_block_rsv,
3616                                         (u64)-1);
3617
3618         if (root->orphan_block_rsv ||
3619             test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3620                 trans = btrfs_join_transaction(root);
3621                 if (!IS_ERR(trans))
3622                         btrfs_end_transaction(trans, root);
3623         }
3624
3625         if (nr_unlink)
3626                 btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
3627         if (nr_truncate)
3628                 btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
3629
3630 out:
3631         if (ret)
3632                 btrfs_err(root->fs_info,
3633                         "could not do orphan cleanup %d", ret);
3634         btrfs_free_path(path);
3635         return ret;
3636 }
3637
3638 /*
3639  * very simple check to peek ahead in the leaf looking for xattrs.  If we
3640  * don't find any xattrs, we know there can't be any acls.
3641  *
3642  * slot is the slot the inode is in, objectid is the objectid of the inode
3643  */
3644 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3645                                           int slot, u64 objectid,
3646                                           int *first_xattr_slot)
3647 {
3648         u32 nritems = btrfs_header_nritems(leaf);
3649         struct btrfs_key found_key;
3650         static u64 xattr_access = 0;
3651         static u64 xattr_default = 0;
3652         int scanned = 0;
3653
3654         if (!xattr_access) {
3655                 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3656                                         strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3657                 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3658                                         strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3659         }
3660
3661         slot++;
3662         *first_xattr_slot = -1;
3663         while (slot < nritems) {
3664                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3665
3666                 /* we found a different objectid, there must not be acls */
3667                 if (found_key.objectid != objectid)
3668                         return 0;
3669
3670                 /* we found an xattr, assume we've got an acl */
3671                 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3672                         if (*first_xattr_slot == -1)
3673                                 *first_xattr_slot = slot;
3674                         if (found_key.offset == xattr_access ||
3675                             found_key.offset == xattr_default)
3676                                 return 1;
3677                 }
3678
3679                 /*
3680                  * we found a key greater than an xattr key, there can't
3681                  * be any acls later on
3682                  */
3683                 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3684                         return 0;
3685
3686                 slot++;
3687                 scanned++;
3688
3689                 /*
3690                  * it goes inode, inode backrefs, xattrs, extents,
3691                  * so if there are a ton of hard links to an inode there can
3692                  * be a lot of backrefs.  Don't waste time searching too hard,
3693                  * this is just an optimization
3694                  */
3695                 if (scanned >= 8)
3696                         break;
3697         }
3698         /* we hit the end of the leaf before we found an xattr or
3699          * something larger than an xattr.  We have to assume the inode
3700          * has acls
3701          */
3702         if (*first_xattr_slot == -1)
3703                 *first_xattr_slot = slot;
3704         return 1;
3705 }
3706
3707 /*
3708  * read an inode from the btree into the in-memory inode
3709  */
3710 static int btrfs_read_locked_inode(struct inode *inode)
3711 {
3712         struct btrfs_path *path;
3713         struct extent_buffer *leaf;
3714         struct btrfs_inode_item *inode_item;
3715         struct btrfs_root *root = BTRFS_I(inode)->root;
3716         struct btrfs_key location;
3717         unsigned long ptr;
3718         int maybe_acls;
3719         u32 rdev;
3720         int ret;
3721         bool filled = false;
3722         int first_xattr_slot;
3723
3724         ret = btrfs_fill_inode(inode, &rdev);
3725         if (!ret)
3726                 filled = true;
3727
3728         path = btrfs_alloc_path();
3729         if (!path) {
3730                 ret = -ENOMEM;
3731                 goto make_bad;
3732         }
3733
3734         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3735
3736         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3737         if (ret) {
3738                 if (ret > 0)
3739                         ret = -ENOENT;
3740                 goto make_bad;
3741         }
3742
3743         leaf = path->nodes[0];
3744
3745         if (filled)
3746                 goto cache_index;
3747
3748         inode_item = btrfs_item_ptr(leaf, path->slots[0],
3749                                     struct btrfs_inode_item);
3750         inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3751         set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3752         i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3753         i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3754         btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3755
3756         inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3757         inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3758
3759         inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3760         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3761
3762         inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
3763         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3764
3765         BTRFS_I(inode)->i_otime.tv_sec =
3766                 btrfs_timespec_sec(leaf, &inode_item->otime);
3767         BTRFS_I(inode)->i_otime.tv_nsec =
3768                 btrfs_timespec_nsec(leaf, &inode_item->otime);
3769
3770         inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3771         BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3772         BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3773
3774         inode->i_version = btrfs_inode_sequence(leaf, inode_item);
3775         inode->i_generation = BTRFS_I(inode)->generation;
3776         inode->i_rdev = 0;
3777         rdev = btrfs_inode_rdev(leaf, inode_item);
3778
3779         BTRFS_I(inode)->index_cnt = (u64)-1;
3780         BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3781
3782 cache_index:
3783         /*
3784          * If we were modified in the current generation and evicted from memory
3785          * and then re-read we need to do a full sync since we don't have any
3786          * idea about which extents were modified before we were evicted from
3787          * cache.
3788          *
3789          * This is required for both inode re-read from disk and delayed inode
3790          * in delayed_nodes_tree.
3791          */
3792         if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
3793                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3794                         &BTRFS_I(inode)->runtime_flags);
3795
3796         /*
3797          * We don't persist the id of the transaction where an unlink operation
3798          * against the inode was last made. So here we assume the inode might
3799          * have been evicted, and therefore the exact value of last_unlink_trans
3800          * lost, and set it to last_trans to avoid metadata inconsistencies
3801          * between the inode and its parent if the inode is fsync'ed and the log
3802          * replayed. For example, in the scenario:
3803          *
3804          * touch mydir/foo
3805          * ln mydir/foo mydir/bar
3806          * sync
3807          * unlink mydir/bar
3808          * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3809          * xfs_io -c fsync mydir/foo
3810          * <power failure>
3811          * mount fs, triggers fsync log replay
3812          *
3813          * We must make sure that when we fsync our inode foo we also log its
3814          * parent inode, otherwise after log replay the parent still has the
3815          * dentry with the "bar" name but our inode foo has a link count of 1
3816          * and doesn't have an inode ref with the name "bar" anymore.
3817          *
3818          * Setting last_unlink_trans to last_trans is a pessimistic approach,
3819          * but it guarantees correctness at the expense of occasional full
3820          * transaction commits on fsync if our inode is a directory, or if our
3821          * inode is not a directory, logging its parent unnecessarily.
3822          */
3823         BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3824
3825         path->slots[0]++;
3826         if (inode->i_nlink != 1 ||
3827             path->slots[0] >= btrfs_header_nritems(leaf))
3828                 goto cache_acl;
3829
3830         btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3831         if (location.objectid != btrfs_ino(inode))
3832                 goto cache_acl;
3833
3834         ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3835         if (location.type == BTRFS_INODE_REF_KEY) {
3836                 struct btrfs_inode_ref *ref;
3837
3838                 ref = (struct btrfs_inode_ref *)ptr;
3839                 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3840         } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3841                 struct btrfs_inode_extref *extref;
3842
3843                 extref = (struct btrfs_inode_extref *)ptr;
3844                 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3845                                                                      extref);
3846         }
3847 cache_acl:
3848         /*
3849          * try to precache a NULL acl entry for files that don't have
3850          * any xattrs or acls
3851          */
3852         maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3853                                            btrfs_ino(inode), &first_xattr_slot);
3854         if (first_xattr_slot != -1) {
3855                 path->slots[0] = first_xattr_slot;
3856                 ret = btrfs_load_inode_props(inode, path);
3857                 if (ret)
3858                         btrfs_err(root->fs_info,
3859                                   "error loading props for ino %llu (root %llu): %d",
3860                                   btrfs_ino(inode),
3861                                   root->root_key.objectid, ret);
3862         }
3863         btrfs_free_path(path);
3864
3865         if (!maybe_acls)
3866                 cache_no_acl(inode);
3867
3868         switch (inode->i_mode & S_IFMT) {
3869         case S_IFREG:
3870                 inode->i_mapping->a_ops = &btrfs_aops;
3871                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3872                 inode->i_fop = &btrfs_file_operations;
3873                 inode->i_op = &btrfs_file_inode_operations;
3874                 break;
3875         case S_IFDIR:
3876                 inode->i_fop = &btrfs_dir_file_operations;
3877                 inode->i_op = &btrfs_dir_inode_operations;
3878                 break;
3879         case S_IFLNK:
3880                 inode->i_op = &btrfs_symlink_inode_operations;
3881                 inode_nohighmem(inode);
3882                 inode->i_mapping->a_ops = &btrfs_symlink_aops;
3883                 break;
3884         default:
3885                 inode->i_op = &btrfs_special_inode_operations;
3886                 init_special_inode(inode, inode->i_mode, rdev);
3887                 break;
3888         }
3889
3890         btrfs_update_iflags(inode);
3891         return 0;
3892
3893 make_bad:
3894         btrfs_free_path(path);
3895         make_bad_inode(inode);
3896         return ret;
3897 }
3898
3899 /*
3900  * given a leaf and an inode, copy the inode fields into the leaf
3901  */
3902 static void fill_inode_item(struct btrfs_trans_handle *trans,
3903                             struct extent_buffer *leaf,
3904                             struct btrfs_inode_item *item,
3905                             struct inode *inode)
3906 {
3907         struct btrfs_map_token token;
3908
3909         btrfs_init_map_token(&token);
3910
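        /*
         * The map token caches the extent buffer page mapping, so the long
         * run of set_token helpers below avoids remapping the metadata page
         * for every individual field.
         */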
3911         btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3912         btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3913         btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3914                                    &token);
3915         btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3916         btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3917
3918         btrfs_set_token_timespec_sec(leaf, &item->atime,
3919                                      inode->i_atime.tv_sec, &token);
3920         btrfs_set_token_timespec_nsec(leaf, &item->atime,
3921                                       inode->i_atime.tv_nsec, &token);
3922
3923         btrfs_set_token_timespec_sec(leaf, &item->mtime,
3924                                      inode->i_mtime.tv_sec, &token);
3925         btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3926                                       inode->i_mtime.tv_nsec, &token);
3927
3928         btrfs_set_token_timespec_sec(leaf, &item->ctime,
3929                                      inode->i_ctime.tv_sec, &token);
3930         btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3931                                       inode->i_ctime.tv_nsec, &token);
3932
3933         btrfs_set_token_timespec_sec(leaf, &item->otime,
3934                                      BTRFS_I(inode)->i_otime.tv_sec, &token);
3935         btrfs_set_token_timespec_nsec(leaf, &item->otime,
3936                                       BTRFS_I(inode)->i_otime.tv_nsec, &token);
3937
3938         btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3939                                      &token);
3940         btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3941                                          &token);
3942         btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3943         btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3944         btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3945         btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3946         btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3947 }
3948
3949 /*
3950  * copy everything in the in-memory inode into the btree.
3951  */
3952 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3953                                 struct btrfs_root *root, struct inode *inode)
3954 {
3955         struct btrfs_inode_item *inode_item;
3956         struct btrfs_path *path;
3957         struct extent_buffer *leaf;
3958         int ret;
3959
3960         path = btrfs_alloc_path();
3961         if (!path)
3962                 return -ENOMEM;
3963
3964         path->leave_spinning = 1;
3965         ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3966                                  1);
3967         if (ret) {
3968                 if (ret > 0)
3969                         ret = -ENOENT;
3970                 goto failed;
3971         }
3972
3973         leaf = path->nodes[0];
3974         inode_item = btrfs_item_ptr(leaf, path->slots[0],
3975                                     struct btrfs_inode_item);
3976
3977         fill_inode_item(trans, leaf, inode_item, inode);
3978         btrfs_mark_buffer_dirty(leaf);
3979         btrfs_set_inode_last_trans(trans, inode);
3980         ret = 0;
3981 failed:
3982         btrfs_free_path(path);
3983         return ret;
3984 }
3985
3986 /*
3987  * copy everything in the in-memory inode into the btree.
3988  */
3989 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3990                                 struct btrfs_root *root, struct inode *inode)
3991 {
3992         int ret;
3993
3994         /*
3995          * If the inode is a free space inode, we can deadlock during commit
3996          * if we put it into the delayed code.
3997          *
3998          * The data relocation inode should also be updated directly,
3999          * without delay.
4000          */
4001         if (!btrfs_is_free_space_inode(inode) &&
4002             root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
4003             !test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
4004                 btrfs_update_root_times(trans, root);
4005
4006                 ret = btrfs_delayed_update_inode(trans, root, inode);
4007                 if (!ret)
4008                         btrfs_set_inode_last_trans(trans, inode);
4009                 return ret;
4010         }
4011
4012         return btrfs_update_inode_item(trans, root, inode);
4013 }
4014
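/*
 * Like btrfs_update_inode(), but if the delayed update path fails with
 * -ENOSPC, fall back to updating the inode item in the tree directly.
 */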
4015 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4016                                          struct btrfs_root *root,
4017                                          struct inode *inode)
4018 {
4019         int ret;
4020
4021         ret = btrfs_update_inode(trans, root, inode);
4022         if (ret == -ENOSPC)
4023                 return btrfs_update_inode_item(trans, root, inode);
4024         return ret;
4025 }
4026
4027 /*
4028  * unlink helper that gets used here in inode.c and in the tree logging
4029  * recovery code.  It removes a link in a directory with a given name, and
4030  * also drops the back refs from the inode to the directory
4031  */
4032 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4033                                 struct btrfs_root *root,
4034                                 struct inode *dir, struct inode *inode,
4035                                 const char *name, int name_len)
4036 {
4037         struct btrfs_path *path;
4038         int ret = 0;
4039         struct extent_buffer *leaf;
4040         struct btrfs_dir_item *di;
4041         struct btrfs_key key;
4042         u64 index;
4043         u64 ino = btrfs_ino(inode);
4044         u64 dir_ino = btrfs_ino(dir);
4045
4046         path = btrfs_alloc_path();
4047         if (!path) {
4048                 ret = -ENOMEM;
4049                 goto out;
4050         }
4051
4052         path->leave_spinning = 1;
4053         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4054                                     name, name_len, -1);
4055         if (IS_ERR(di)) {
4056                 ret = PTR_ERR(di);
4057                 goto err;
4058         }
4059         if (!di) {
4060                 ret = -ENOENT;
4061                 goto err;
4062         }
4063         leaf = path->nodes[0];
4064         btrfs_dir_item_key_to_cpu(leaf, di, &key);
4065         ret = btrfs_delete_one_dir_name(trans, root, path, di);
4066         if (ret)
4067                 goto err;
4068         btrfs_release_path(path);
4069
4070         /*
4071          * If we don't have a cached dir index, we have to find it by
4072          * looking up the inode ref; since that lookup hands us the inode
4073          * ref anyway, we delete it on the spot rather than scheduling a
4074          * delayed deletion.
4075          *
4076          * But if the dir index is cached, there is no need to search for
4077          * the inode ref.  Since the inode ref sits close to the inode item,
4078          * it is better to delete it lazily, when we next update the inode item.
4079          */
4080         if (BTRFS_I(inode)->dir_index) {
4081                 ret = btrfs_delayed_delete_inode_ref(inode);
4082                 if (!ret) {
4083                         index = BTRFS_I(inode)->dir_index;
4084                         goto skip_backref;
4085                 }
4086         }
4087
4088         ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
4089                                   dir_ino, &index);
4090         if (ret) {
4091                 btrfs_info(root->fs_info,
4092                         "failed to delete reference to %.*s, inode %llu parent %llu",
4093                         name_len, name, ino, dir_ino);
4094                 btrfs_abort_transaction(trans, ret);
4095                 goto err;
4096         }
4097 skip_backref:
4098         ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
4099         if (ret) {
4100                 btrfs_abort_transaction(trans, ret);
4101                 goto err;
4102         }
4103
4104         ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
4105                                          inode, dir_ino);
4106         if (ret != 0 && ret != -ENOENT) {
4107                 btrfs_abort_transaction(trans, ret);
4108                 goto err;
4109         }
4110
4111         ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
4112                                            dir, index);
4113         if (ret == -ENOENT)
4114                 ret = 0;
4115         else if (ret)
4116                 btrfs_abort_transaction(trans, ret);
4117 err:
4118         btrfs_free_path(path);
4119         if (ret)
4120                 goto out;
4121
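        /*
         * A directory's i_size counts each entry's name twice: once for
         * the dir item and once for the dir index item, hence name_len * 2.
         */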
4122         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
4123         inode_inc_iversion(inode);
4124         inode_inc_iversion(dir);
4125         inode->i_ctime = dir->i_mtime =
4126                 dir->i_ctime = current_time(inode);
4127         ret = btrfs_update_inode(trans, root, dir);
4128 out:
4129         return ret;
4130 }
4131
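/*
 * Wrapper around __btrfs_unlink_inode() that also drops the victim inode's
 * link count and writes the updated inode back to the tree.
 */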
4132 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4133                        struct btrfs_root *root,
4134                        struct inode *dir, struct inode *inode,
4135                        const char *name, int name_len)
4136 {
4137         int ret;
4138         ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
4139         if (!ret) {
4140                 drop_nlink(inode);
4141                 ret = btrfs_update_inode(trans, root, inode);
4142         }
4143         return ret;
4144 }
4145
4146 /*
4147  * helper to start transaction for unlink and rmdir.
4148  *
4149  * unlink and rmdir are special in btrfs: they do not always free space, so
4150  * if we cannot make our reservations the normal way, try to see whether
4151  * there is plenty of slack room in the global reserve to migrate from;
4152  * otherwise we cannot allow the unlink to occur.
4153  */
4154 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
4155 {
4156         struct btrfs_root *root = BTRFS_I(dir)->root;
4157
4158         /*
4159          * 1 for the possible orphan item
4160          * 1 for the dir item
4161          * 1 for the dir index
4162          * 1 for the inode ref
4163          * 1 for the inode
4164          */
4165         return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
4166 }
4167
4168 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4169 {
4170         struct btrfs_root *root = BTRFS_I(dir)->root;
4171         struct btrfs_trans_handle *trans;
4172         struct inode *inode = d_inode(dentry);
4173         int ret;
4174
4175         trans = __unlink_start_trans(dir);
4176         if (IS_ERR(trans))
4177                 return PTR_ERR(trans);
4178
4179         btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0);
4180
4181         ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
4182                                  dentry->d_name.name, dentry->d_name.len);
4183         if (ret)
4184                 goto out;
4185
4186         if (inode->i_nlink == 0) {
4187                 ret = btrfs_orphan_add(trans, inode);
4188                 if (ret)
4189                         goto out;
4190         }
4191
4192 out:
4193         btrfs_end_transaction(trans, root);
4194         btrfs_btree_balance_dirty(root);
4195         return ret;
4196 }
4197
4198 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4199                         struct btrfs_root *root,
4200                         struct inode *dir, u64 objectid,
4201                         const char *name, int name_len)
4202 {
4203         struct btrfs_path *path;
4204         struct extent_buffer *leaf;
4205         struct btrfs_dir_item *di;
4206         struct btrfs_key key;
4207         u64 index;
4208         int ret;
4209         u64 dir_ino = btrfs_ino(dir);
4210
4211         path = btrfs_alloc_path();
4212         if (!path)
4213                 return -ENOMEM;
4214
4215         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4216                                    name, name_len, -1);
4217         if (IS_ERR_OR_NULL(di)) {
4218                 if (!di)
4219                         ret = -ENOENT;
4220                 else
4221                         ret = PTR_ERR(di);
4222                 goto out;
4223         }
4224
4225         leaf = path->nodes[0];
4226         btrfs_dir_item_key_to_cpu(leaf, di, &key);
4227         WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4228         ret = btrfs_delete_one_dir_name(trans, root, path, di);
4229         if (ret) {
4230                 btrfs_abort_transaction(trans, ret);
4231                 goto out;
4232         }
4233         btrfs_release_path(path);
4234
4235         ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
4236                                  objectid, root->root_key.objectid,
4237                                  dir_ino, &index, name, name_len);
4238         if (ret < 0) {
4239                 if (ret != -ENOENT) {
4240                         btrfs_abort_transaction(trans, ret);
4241                         goto out;
4242                 }
4243                 di = btrfs_search_dir_index_item(root, path, dir_ino,
4244                                                  name, name_len);
4245                 if (IS_ERR_OR_NULL(di)) {
4246                         if (!di)
4247                                 ret = -ENOENT;
4248                         else
4249                                 ret = PTR_ERR(di);
4250                         btrfs_abort_transaction(trans, ret);
4251                         goto out;
4252                 }
4253
4254                 leaf = path->nodes[0];
4255                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4256                 btrfs_release_path(path);
4257                 index = key.offset;
4258         }
4259         btrfs_release_path(path);
4260
4261         ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
4262         if (ret) {
4263                 btrfs_abort_transaction(trans, ret);
4264                 goto out;
4265         }
4266
4267         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
4268         inode_inc_iversion(dir);
4269         dir->i_mtime = dir->i_ctime = current_time(dir);
4270         ret = btrfs_update_inode_fallback(trans, root, dir);
4271         if (ret)
4272                 btrfs_abort_transaction(trans, ret);
4273 out:
4274         btrfs_free_path(path);
4275         return ret;
4276 }
4277
4278 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4279 {
4280         struct inode *inode = d_inode(dentry);
4281         int err = 0;
4282         struct btrfs_root *root = BTRFS_I(dir)->root;
4283         struct btrfs_trans_handle *trans;
4284         u64 last_unlink_trans;
4285
4286         if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4287                 return -ENOTEMPTY;
4288         if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
4289                 return -EPERM;
4290
4291         trans = __unlink_start_trans(dir);
4292         if (IS_ERR(trans))
4293                 return PTR_ERR(trans);
4294
4295         if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4296                 err = btrfs_unlink_subvol(trans, root, dir,
4297                                           BTRFS_I(inode)->location.objectid,
4298                                           dentry->d_name.name,
4299                                           dentry->d_name.len);
4300                 goto out;
4301         }
4302
4303         err = btrfs_orphan_add(trans, inode);
4304         if (err)
4305                 goto out;
4306
4307         last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4308
4309         /* now the directory is empty */
4310         err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
4311                                  dentry->d_name.name, dentry->d_name.len);
4312         if (!err) {
4313                 btrfs_i_size_write(inode, 0);
4314                 /*
4315                  * Propagate the last_unlink_trans value of the deleted dir to
4316                  * its parent directory. This is to prevent an unrecoverable
4317                  * log tree in the case we do something like this:
4318                  * 1) create dir foo
4319                  * 2) create snapshot under dir foo
4320                  * 3) delete the snapshot
4321                  * 4) rmdir foo
4322                  * 5) mkdir foo
4323                  * 6) fsync foo or some file inside foo
4324                  */
4325                 if (last_unlink_trans >= trans->transid)
4326                         BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4327         }
4328 out:
4329         btrfs_end_transaction(trans, root);
4330         btrfs_btree_balance_dirty(root);
4331
4332         return err;
4333 }
4334
4335 static int truncate_space_check(struct btrfs_trans_handle *trans,
4336                                 struct btrfs_root *root,
4337                                 u64 bytes_deleted)
4338 {
4339         int ret;
4340
4341         /*
4342          * This is only used to apply pressure to the enospc system; we don't
4343          * intend to use this reservation at all.
4344          */
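        /*
         * Translate the deleted byte count into a worst-case number of csum
         * leaves, and from there into bytes of metadata to reserve.
         */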
4345         bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
4346         bytes_deleted *= root->nodesize;
4347         ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
4348                                   bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
4349         if (!ret) {
4350                 trace_btrfs_space_reservation(root->fs_info, "transaction",
4351                                               trans->transid,
4352                                               bytes_deleted, 1);
4353                 trans->bytes_reserved += bytes_deleted;
4354         }
4355         return ret;
4357 }
4358
4359 static int truncate_inline_extent(struct inode *inode,
4360                                   struct btrfs_path *path,
4361                                   struct btrfs_key *found_key,
4362                                   const u64 item_end,
4363                                   const u64 new_size)
4364 {
4365         struct extent_buffer *leaf = path->nodes[0];
4366         int slot = path->slots[0];
4367         struct btrfs_file_extent_item *fi;
4368         u32 size = (u32)(new_size - found_key->offset);
4369         struct btrfs_root *root = BTRFS_I(inode)->root;
4370
4371         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4372
4373         if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
4374                 loff_t offset = new_size;
4375                 loff_t page_end = ALIGN(offset, PAGE_SIZE);
4376
4377                 /*
4378          * Zero out the remainder of the last page of our inline extent,
4379                  * instead of directly truncating our inline extent here - that
4380                  * would be much more complex (decompressing all the data, then
4381                  * compressing the truncated data, which might be bigger than
4382                  * the size of the inline extent, resize the extent, etc).
4383                  * We release the path because to get the page we might need to
4384                  * read the extent item from disk (data not in the page cache).
4385                  */
4386                 btrfs_release_path(path);
4387                 return btrfs_truncate_block(inode, offset, page_end - offset,
4388                                         0);
4389         }
4390
4391         btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4392         size = btrfs_file_extent_calc_inline_size(size);
4393         btrfs_truncate_item(root, path, size, 1);
4394
4395         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4396                 inode_sub_bytes(inode, item_end + 1 - new_size);
4397
4398         return 0;
4399 }
4400
4401 /*
4402  * this can truncate away extent items, csum items and directory items.
4403  * It starts at a high offset and removes keys until it can't find
4404  * any higher than new_size
4405  *
4406  * csum items that cross the new i_size are truncated to the new size
4407  * as well.
4408  *
4409  * min_type is the minimum key type to truncate down to.  If set to 0, this
4410  * will kill all the items on this inode, including the INODE_ITEM_KEY.
4411  */
4412 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4413                                struct btrfs_root *root,
4414                                struct inode *inode,
4415                                u64 new_size, u32 min_type)
4416 {
4417         struct btrfs_path *path;
4418         struct extent_buffer *leaf;
4419         struct btrfs_file_extent_item *fi;
4420         struct btrfs_key key;
4421         struct btrfs_key found_key;
4422         u64 extent_start = 0;
4423         u64 extent_num_bytes = 0;
4424         u64 extent_offset = 0;
4425         u64 item_end = 0;
4426         u64 last_size = new_size;
4427         u32 found_type = (u8)-1;
4428         int found_extent;
4429         int del_item;
4430         int pending_del_nr = 0;
4431         int pending_del_slot = 0;
4432         int extent_type = -1;
4433         int ret;
4434         int err = 0;
4435         u64 ino = btrfs_ino(inode);
4436         u64 bytes_deleted = 0;
4437         bool be_nice = false;
4438         bool should_throttle = false;
4439         bool should_end = false;
4440
4441         BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4442
4443         /*
4444          * for non-free space inodes and ref cows, we want to back off from
4445          * time to time
4446          */
4447         if (!btrfs_is_free_space_inode(inode) &&
4448             test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4449                 be_nice = true;
4450
4451         path = btrfs_alloc_path();
4452         if (!path)
4453                 return -ENOMEM;
4454         path->reada = READA_BACK;
4455
4456         /*
4457          * We want to drop from the next block forward in case this new size is
4458          * not block aligned since we will be keeping the last block of the
4459          * extent just the way it is.
4460          */
4461         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4462             root == root->fs_info->tree_root)
4463                 btrfs_drop_extent_cache(inode, ALIGN(new_size,
4464                                         root->sectorsize), (u64)-1, 0);
4465
4466         /*
4467          * This function is also used to drop the items in the log tree before
4468          * we relog the inode, so if root != BTRFS_I(inode)->root, it means
4469          * it is used to drop the logged items. So we shouldn't kill the delayed
4470          * items.
4471          */
4472         if (min_type == 0 && root == BTRFS_I(inode)->root)
4473                 btrfs_kill_delayed_inode_items(inode);
4474
4475         key.objectid = ino;
4476         key.offset = (u64)-1;
4477         key.type = (u8)-1;
4478
4479 search_again:
4480         /*
4481          * with a 16K leaf size and 128MB extents, you can actually queue
4482          * up a huge file in a single leaf.  Most of the time, when
4483          * bytes_deleted is > 0, it will be huge by the time we get here.
4484          */
4485         if (be_nice && bytes_deleted > SZ_32M) {
4486                 if (btrfs_should_end_transaction(trans, root)) {
4487                         err = -EAGAIN;
4488                         goto error;
4489                 }
4490         }
4491
4492
4493         path->leave_spinning = 1;
4494         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4495         if (ret < 0) {
4496                 err = ret;
4497                 goto out;
4498         }
4499
4500         if (ret > 0) {
4501                 /* there are no items in the tree for us to truncate; we're
4502                  * done
4503                  */
4504                 if (path->slots[0] == 0)
4505                         goto out;
4506                 path->slots[0]--;
4507         }
4508
4509         while (1) {
4510                 fi = NULL;
4511                 leaf = path->nodes[0];
4512                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4513                 found_type = found_key.type;
4514
4515                 if (found_key.objectid != ino)
4516                         break;
4517
4518                 if (found_type < min_type)
4519                         break;
4520
4521                 item_end = found_key.offset;
4522                 if (found_type == BTRFS_EXTENT_DATA_KEY) {
4523                         fi = btrfs_item_ptr(leaf, path->slots[0],
4524                                             struct btrfs_file_extent_item);
4525                         extent_type = btrfs_file_extent_type(leaf, fi);
4526                         if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4527                                 item_end +=
4528                                     btrfs_file_extent_num_bytes(leaf, fi);
4529                         } else {
4530                                 item_end += btrfs_file_extent_inline_len(leaf,
4531                                                          path->slots[0], fi);
4532                         }
4533                         item_end--;
4534                 }
4535                 if (found_type > min_type) {
4536                         del_item = 1;
4537                 } else {
4538                         if (item_end < new_size) {
4539                                 /*
4540                                  * With NO_HOLES mode, for the following mapping
4541                                  *
4542                                  * [0-4k][hole][8k-12k]
4543                                  *
4544                                  * if truncating isize down to 6k, the isize
4545                                  * would otherwise end up being 8k.
4546                                  */
4547                                 if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
4548                                         last_size = new_size;
4549                                 break;
4550                         }
4551                         if (found_key.offset >= new_size)
4552                                 del_item = 1;
4553                         else
4554                                 del_item = 0;
4555                 }
4556                 found_extent = 0;
4557                 /* FIXME, shrink the extent if the ref count is only 1 */
4558                 if (found_type != BTRFS_EXTENT_DATA_KEY)
4559                         goto delete;
4560
4561                 if (del_item)
4562                         last_size = found_key.offset;
4563                 else
4564                         last_size = new_size;
4565
4566                 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4567                         u64 num_dec;
4568                         extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
4569                         if (!del_item) {
4570                                 u64 orig_num_bytes =
4571                                         btrfs_file_extent_num_bytes(leaf, fi);
4572                                 extent_num_bytes = ALIGN(new_size -
4573                                                 found_key.offset,
4574                                                 root->sectorsize);
4575                                 btrfs_set_file_extent_num_bytes(leaf, fi,
4576                                                          extent_num_bytes);
4577                                 num_dec = (orig_num_bytes -
4578                                            extent_num_bytes);
4579                                 if (test_bit(BTRFS_ROOT_REF_COWS,
4580                                              &root->state) &&
4581                                     extent_start != 0)
4582                                         inode_sub_bytes(inode, num_dec);
4583                                 btrfs_mark_buffer_dirty(leaf);
4584                         } else {
4585                                 extent_num_bytes =
4586                                         btrfs_file_extent_disk_num_bytes(leaf,
4587                                                                          fi);
4588                                 extent_offset = found_key.offset -
4589                                         btrfs_file_extent_offset(leaf, fi);
4590
4591                                 /* FIXME blocksize != 4096 */
4592                                 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4593                                 if (extent_start != 0) {
4594                                         found_extent = 1;
4595                                         if (test_bit(BTRFS_ROOT_REF_COWS,
4596                                                      &root->state))
4597                                                 inode_sub_bytes(inode, num_dec);
4598                                 }
4599                         }
4600                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4601                         /*
4602                          * we can't truncate inline items that have had
4603                          * special encodings
4604                          */
4605                         if (!del_item &&
4606                             btrfs_file_extent_encryption(leaf, fi) == 0 &&
4607                             btrfs_file_extent_other_encoding(leaf, fi) == 0) {
4608
4609                                 /*
4610                                  * Need to release path in order to truncate a
4611                                  * compressed extent. So delete any extent
4612                                  * items accumulated so far.
4613                                  */
4614                                 if (btrfs_file_extent_compression(leaf, fi) !=
4615                                     BTRFS_COMPRESS_NONE && pending_del_nr) {
4616                                         err = btrfs_del_items(trans, root, path,
4617                                                               pending_del_slot,
4618                                                               pending_del_nr);
4619                                         if (err) {
4620                                                 btrfs_abort_transaction(trans,
4621                                                                         err);
4622                                                 goto error;
4623                                         }
4624                                         pending_del_nr = 0;
4625                                 }
4626
4627                                 err = truncate_inline_extent(inode, path,
4628                                                              &found_key,
4629                                                              item_end,
4630                                                              new_size);
4631                                 if (err) {
4632                                         btrfs_abort_transaction(trans, err);
4633                                         goto error;
4634                                 }
4635                         } else if (test_bit(BTRFS_ROOT_REF_COWS,
4636                                             &root->state)) {
4637                                 inode_sub_bytes(inode, item_end + 1 - new_size);
4638                         }
4639                 }
4640 delete:
4641                 if (del_item) {
4642                         if (!pending_del_nr) {
4643                                 /* no pending yet, add ourselves */
4644                                 pending_del_slot = path->slots[0];
4645                                 pending_del_nr = 1;
4646                         } else if (pending_del_nr &&
4647                                    path->slots[0] + 1 == pending_del_slot) {
4648                                 /* hop on the pending chunk */
4649                                 pending_del_nr++;
4650                                 pending_del_slot = path->slots[0];
4651                         } else {
4652                                 BUG();
4653                         }
4654                 } else {
4655                         break;
4656                 }
4657                 should_throttle = false;
4658
4659                 if (found_extent &&
4660                     (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4661                      root == root->fs_info->tree_root)) {
4662                         btrfs_set_path_blocking(path);
4663                         bytes_deleted += extent_num_bytes;
4664                         ret = btrfs_free_extent(trans, root, extent_start,
4665                                                 extent_num_bytes, 0,
4666                                                 btrfs_header_owner(leaf),
4667                                                 ino, extent_offset);
4668                         BUG_ON(ret);
4669                         if (btrfs_should_throttle_delayed_refs(trans, root))
4670                                 btrfs_async_run_delayed_refs(root,
4671                                         trans->delayed_ref_updates * 2,
4672                                         trans->transid, 0);
4673                         if (be_nice) {
4674                                 if (truncate_space_check(trans, root,
4675                                                          extent_num_bytes)) {
4676                                         should_end = true;
4677                                 }
4678                                 if (btrfs_should_throttle_delayed_refs(trans,
4679                                                                        root)) {
4680                                         should_throttle = true;
4681                                 }
4682                         }
4683                 }
4684
4685                 if (found_type == BTRFS_INODE_ITEM_KEY)
4686                         break;
4687
4688                 if (path->slots[0] == 0 ||
4689                     path->slots[0] != pending_del_slot ||
4690                     should_throttle || should_end) {
4691                         if (pending_del_nr) {
4692                                 ret = btrfs_del_items(trans, root, path,
4693                                                 pending_del_slot,
4694                                                 pending_del_nr);
4695                                 if (ret) {
4696                                         btrfs_abort_transaction(trans, ret);
4697                                         goto error;
4698                                 }
4699                                 pending_del_nr = 0;
4700                         }
4701                         btrfs_release_path(path);
4702                         if (should_throttle) {
4703                                 unsigned long updates = trans->delayed_ref_updates;
4704                                 if (updates) {
4705                                         trans->delayed_ref_updates = 0;
4706                                         ret = btrfs_run_delayed_refs(trans, root, updates * 2);
4707                                         if (ret && !err)
4708                                                 err = ret;
4709                                 }
4710                         }
4711                         /*
4712                          * if we failed to refill our space rsv, bail out
4713                          * and let the transaction restart
4714                          */
4715                         if (should_end) {
4716                                 err = -EAGAIN;
4717                                 goto error;
4718                         }
4719                         goto search_again;
4720                 } else {
4721                         path->slots[0]--;
4722                 }
4723         }
4724 out:
4725         if (pending_del_nr) {
4726                 ret = btrfs_del_items(trans, root, path, pending_del_slot,
4727                                       pending_del_nr);
4728                 if (ret)
4729                         btrfs_abort_transaction(trans, ret);
4730         }
4731 error:
4732         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4733                 btrfs_ordered_update_i_size(inode, last_size, NULL);
4734
4735         btrfs_free_path(path);
4736
4737         if (be_nice && bytes_deleted > SZ_32M) {
4738                 unsigned long updates = trans->delayed_ref_updates;
4739                 if (updates) {
4740                         trans->delayed_ref_updates = 0;
4741                         ret = btrfs_run_delayed_refs(trans, root, updates * 2);
4742                         if (ret && !err)
4743                                 err = ret;
4744                 }
4745         }
4746         return err;
4747 }
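/*
 * The deletion loop above batches removals: items are visited in decreasing
 * slot order and runs of adjacent slots are accumulated in
 * pending_del_slot/pending_del_nr so they can be removed with a single
 * btrfs_del_items() call.  Below is a minimal, self-contained userspace
 * sketch of that accumulation pattern - not kernel code; delete_range() is a
 * hypothetical stand-in for btrfs_del_items(), and unlike the loop above the
 * sketch simply flushes on a discontiguity instead of restarting a search.
 */
#if 0	/* illustration only, never compiled as part of this file */
static void delete_batched(const int *slots, int nr,
			   void (*delete_range)(int first, int count))
{
	int pending_first = 0, pending_nr = 0;
	int i;

	/* slots[] must be strictly decreasing, like path->slots[0] above */
	for (i = 0; i < nr; i++) {
		if (!pending_nr) {
			/* no pending run yet, start one */
			pending_first = slots[i];
			pending_nr = 1;
		} else if (slots[i] + 1 == pending_first) {
			/* adjacent to the pending run, extend it downward */
			pending_first = slots[i];
			pending_nr++;
		} else {
			/* discontiguous: flush the run, then start a new one */
			delete_range(pending_first, pending_nr);
			pending_first = slots[i];
			pending_nr = 1;
		}
	}
	if (pending_nr)
		delete_range(pending_first, pending_nr);
}
#endif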
4748
4749 /*
4750  * btrfs_truncate_block - read, zero a chunk and write a block
4751  * @inode - inode that we're zeroing
4752  * @from - the offset to start zeroing
4753  * @len - the length to zero, or 0 to zero everything from the offset to the
4754  *      end of the block
4755  * @front - zero everything up to the offset instead of from the offset on
4756  *
4757  * This will find the block for the "from" offset, cow it and zero the part
4758  * we want zeroed.  This is used with truncate and hole punching.
4759  */
4760 int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4761                         int front)
4762 {
4763         struct address_space *mapping = inode->i_mapping;
4764         struct btrfs_root *root = BTRFS_I(inode)->root;
4765         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4766         struct btrfs_ordered_extent *ordered;
4767         struct extent_state *cached_state = NULL;
4768         char *kaddr;
4769         u32 blocksize = root->sectorsize;
4770         pgoff_t index = from >> PAGE_SHIFT;
4771         unsigned offset = from & (blocksize - 1);
4772         struct page *page;
4773         gfp_t mask = btrfs_alloc_write_mask(mapping);
4774         int ret = 0;
4775         u64 block_start;
4776         u64 block_end;
4777
4778         if ((offset & (blocksize - 1)) == 0 &&
4779             (!len || ((len & (blocksize - 1)) == 0)))
4780                 goto out;
4781
4782         ret = btrfs_delalloc_reserve_space(inode,
4783                         round_down(from, blocksize), blocksize);
4784         if (ret)
4785                 goto out;
4786
4787 again:
4788         page = find_or_create_page(mapping, index, mask);
4789         if (!page) {
4790                 btrfs_delalloc_release_space(inode,
4791                                 round_down(from, blocksize),
4792                                 blocksize);
4793                 ret = -ENOMEM;
4794                 goto out;
4795         }
4796
4797         block_start = round_down(from, blocksize);
4798         block_end = block_start + blocksize - 1;
4799
4800         if (!PageUptodate(page)) {
4801                 ret = btrfs_readpage(NULL, page);
4802                 lock_page(page);
4803                 if (page->mapping != mapping) {
4804                         unlock_page(page);
4805                         put_page(page);
4806                         goto again;
4807                 }
4808                 if (!PageUptodate(page)) {
4809                         ret = -EIO;
4810                         goto out_unlock;
4811                 }
4812         }
4813         wait_on_page_writeback(page);
4814
4815         lock_extent_bits(io_tree, block_start, block_end, &cached_state);
4816         set_page_extent_mapped(page);
4817
4818         ordered = btrfs_lookup_ordered_extent(inode, block_start);
4819         if (ordered) {
4820                 unlock_extent_cached(io_tree, block_start, block_end,
4821                                      &cached_state, GFP_NOFS);
4822                 unlock_page(page);
4823                 put_page(page);
4824                 btrfs_start_ordered_extent(inode, ordered, 1);
4825                 btrfs_put_ordered_extent(ordered);
4826                 goto again;
4827         }
4828
4829         clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
4830                           EXTENT_DIRTY | EXTENT_DELALLOC |
4831                           EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4832                           0, 0, &cached_state, GFP_NOFS);
4833
4834         ret = btrfs_set_extent_delalloc(inode, block_start, block_end,
4835                                         &cached_state, 0);
4836         if (ret) {
4837                 unlock_extent_cached(io_tree, block_start, block_end,
4838                                      &cached_state, GFP_NOFS);
4839                 goto out_unlock;
4840         }
4841
4842         if (offset != blocksize) {
4843                 if (!len)
4844                         len = blocksize - offset;
4845                 kaddr = kmap(page);
4846                 if (front)
4847                         memset(kaddr + (block_start - page_offset(page)),
4848                                 0, offset);
4849                 else
4850                         memset(kaddr + (block_start - page_offset(page)) + offset,
4851                                 0, len);
4852                 flush_dcache_page(page);
4853                 kunmap(page);
4854         }
4855         ClearPageChecked(page);
4856         set_page_dirty(page);
4857         unlock_extent_cached(io_tree, block_start, block_end, &cached_state,
4858                              GFP_NOFS);
4859
4860 out_unlock:
4861         if (ret)
4862                 btrfs_delalloc_release_space(inode, block_start,
4863                                              blocksize);
4864         unlock_page(page);
4865         put_page(page);
4866 out:
4867         return ret;
4868 }
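/*
 * A minimal, self-contained userspace sketch of the partial-block zeroing
 * arithmetic used by btrfs_truncate_block() above, assuming a 4096-byte
 * block size.  Not kernel code: zero_partial_block() and SKETCH_BLOCKSIZE
 * are hypothetical names, and @block stands in for the mapped page.
 */
#if 0	/* illustration only, never compiled as part of this file */
#include <stdint.h>
#include <string.h>

#define SKETCH_BLOCKSIZE 4096u

/*
 * When @front is set, zero from the start of the block up to @from;
 * otherwise zero @len bytes starting at @from (or through the end of the
 * block when @len is 0), mirroring the offset/len/front handling above.
 */
static void zero_partial_block(uint8_t *block, uint64_t from,
			       uint64_t len, int front)
{
	unsigned int offset = from & (SKETCH_BLOCKSIZE - 1);

	if (offset == 0 && (!len || (len & (SKETCH_BLOCKSIZE - 1)) == 0))
		return;		/* fully block aligned, nothing to zero */
	if (!len)
		len = SKETCH_BLOCKSIZE - offset;
	if (front)
		memset(block, 0, offset);
	else
		memset(block + offset, 0, len);
}
#endif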
4869
4870 static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
4871                              u64 offset, u64 len)
4872 {
4873         struct btrfs_trans_handle *trans;
4874         int ret;
4875
4876         /*
4877          * Still need to make sure the inode looks like it's been updated so
4878          * that any holes get logged if we fsync.
4879          */
4880         if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
4881                 BTRFS_I(inode)->last_trans = root->fs_info->generation;
4882                 BTRFS_I(inode)->last_sub_trans = root->log_transid;
4883                 BTRFS_I(inode)->last_log_commit = root->last_log_commit;
4884                 return 0;
4885         }
4886
4887         /*
4888          * 1 - for the one we're dropping
4889          * 1 - for the one we're adding
4890          * 1 - for updating the inode.
4891          */
4892         trans = btrfs_start_transaction(root, 3);
4893         if (IS_ERR(trans))
4894                 return PTR_ERR(trans);
4895
4896         ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
4897         if (ret) {
4898                 btrfs_abort_transaction(trans, ret);
4899                 btrfs_end_transaction(trans, root);
4900                 return ret;
4901         }
4902
4903         ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
4904                                        0, 0, len, 0, len, 0, 0, 0);
4905         if (ret)
4906                 btrfs_abort_transaction(trans, ret);
4907         else
4908                 btrfs_update_inode(trans, root, inode);
4909         btrfs_end_transaction(trans, root);
4910         return ret;
4911 }
4912
4913 /*
4914  * This function puts in dummy file extents for the area we're creating a hole
4915  * for.  So if we are truncating this file to a larger size we need to insert
4916  * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
4917  * for the range between oldsize and size.
4918  */
4919 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4920 {
4921         struct btrfs_root *root = BTRFS_I(inode)->root;
4922         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4923         struct extent_map *em = NULL;
4924         struct extent_state *cached_state = NULL;
4925         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4926         u64 hole_start = ALIGN(oldsize, root->sectorsize);
4927         u64 block_end = ALIGN(size, root->sectorsize);
4928         u64 last_byte;
4929         u64 cur_offset;
4930         u64 hole_size;
4931         int err = 0;
4932
4933         /*
4934          * If our size started in the middle of a block we need to zero out the
4935          * rest of the block before we expand the i_size; otherwise we could
4936          * expose stale data.
4937          */
4938         err = btrfs_truncate_block(inode, oldsize, 0, 0);
4939         if (err)
4940                 return err;
4941
4942         if (size <= hole_start)
4943                 return 0;
4944
4945         while (1) {
4946                 struct btrfs_ordered_extent *ordered;
4947
4948                 lock_extent_bits(io_tree, hole_start, block_end - 1,
4949                                  &cached_state);
4950                 ordered = btrfs_lookup_ordered_range(inode, hole_start,
4951                                                      block_end - hole_start);
4952                 if (!ordered)
4953                         break;
4954                 unlock_extent_cached(io_tree, hole_start, block_end - 1,
4955                                      &cached_state, GFP_NOFS);
4956                 btrfs_start_ordered_extent(inode, ordered, 1);
4957                 btrfs_put_ordered_extent(ordered);
4958         }
4959
4960         cur_offset = hole_start;
4961         while (1) {
4962                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4963                                 block_end - cur_offset, 0);
4964                 if (IS_ERR(em)) {
4965                         err = PTR_ERR(em);
4966                         em = NULL;
4967                         break;
4968                 }
4969                 last_byte = min(extent_map_end(em), block_end);
4970                 last_byte = ALIGN(last_byte, root->sectorsize);
4971                 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4972                         struct extent_map *hole_em;
4973                         hole_size = last_byte - cur_offset;
4974
4975                         err = maybe_insert_hole(root, inode, cur_offset,
4976                                                 hole_size);
4977                         if (err)
4978                                 break;
4979                         btrfs_drop_extent_cache(inode, cur_offset,
4980                                                 cur_offset + hole_size - 1, 0);
4981                         hole_em = alloc_extent_map();
4982                         if (!hole_em) {
4983                                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4984                                         &BTRFS_I(inode)->runtime_flags);
4985                                 goto next;
4986                         }
4987                         hole_em->start = cur_offset;
4988                         hole_em->len = hole_size;
4989                         hole_em->orig_start = cur_offset;
4990
4991                         hole_em->block_start = EXTENT_MAP_HOLE;
4992                         hole_em->block_len = 0;
4993                         hole_em->orig_block_len = 0;
4994                         hole_em->ram_bytes = hole_size;
4995                         hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
4996                         hole_em->compress_type = BTRFS_COMPRESS_NONE;
4997                         hole_em->generation = root->fs_info->generation;
4998
4999                         while (1) {
5000                                 write_lock(&em_tree->lock);
5001                                 err = add_extent_mapping(em_tree, hole_em, 1);
5002                                 write_unlock(&em_tree->lock);
5003                                 if (err != -EEXIST)
5004                                         break;
5005                                 btrfs_drop_extent_cache(inode, cur_offset,
5006                                                         cur_offset +
5007                                                         hole_size - 1, 0);
5008                         }
5009                         free_extent_map(hole_em);
5010                 }
5011 next:
5012                 free_extent_map(em);
5013                 em = NULL;
5014                 cur_offset = last_byte;
5015                 if (cur_offset >= block_end)
5016                         break;
5017         }
5018         free_extent_map(em);
5019         unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
5020                              GFP_NOFS);
5021         return err;
5022 }
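/*
 * A worked example of the hole-range arithmetic above, assuming a 4096-byte
 * sector size: expanding a file from oldsize=5000 to size=20000 gives
 * hole_start = ALIGN(5000, 4096) = 8192 and block_end = ALIGN(20000, 4096) =
 * 20480.  Bytes 5000..8191 are handled by btrfs_truncate_block() zeroing the
 * tail of the old last block, and hole extents then cover 8192..20479.
 */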
5023
5024 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
5025 {
5026         struct btrfs_root *root = BTRFS_I(inode)->root;
5027         struct btrfs_trans_handle *trans;
5028         loff_t oldsize = i_size_read(inode);
5029         loff_t newsize = attr->ia_size;
5030         int mask = attr->ia_valid;
5031         int ret;
5032
5033         /*
5034          * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
5035          * special case where we need to update the times despite not having
5036          * these flags set.  For all other operations the VFS sets these flags
5037          * explicitly if it wants a timestamp update.
5038          */
5039         if (newsize != oldsize) {
5040                 inode_inc_iversion(inode);
5041                 if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
5042                         inode->i_ctime = inode->i_mtime =
5043                                 current_time(inode);
5044         }
5045
5046         if (newsize > oldsize) {
5047                 /*
5048                  * Don't do an expanding truncate while snapshotting is ongoing.
5049                  * This is to ensure the snapshot captures a fully consistent
5050                  * state of this file - if the snapshot captures this expanding
5051                  * truncation, it must capture all writes that happened before
5052                  * this truncation.
5053                  */
5054                 btrfs_wait_for_snapshot_creation(root);
5055                 ret = btrfs_cont_expand(inode, oldsize, newsize);
5056                 if (ret) {
5057                         btrfs_end_write_no_snapshoting(root);
5058                         return ret;
5059                 }
5060
5061                 trans = btrfs_start_transaction(root, 1);
5062                 if (IS_ERR(trans)) {
5063                         btrfs_end_write_no_snapshoting(root);
5064                         return PTR_ERR(trans);
5065                 }
5066
5067                 i_size_write(inode, newsize);
5068                 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
5069                 pagecache_isize_extended(inode, oldsize, newsize);
5070                 ret = btrfs_update_inode(trans, root, inode);
5071                 btrfs_end_write_no_snapshoting(root);
5072                 btrfs_end_transaction(trans, root);
5073         } else {
5074
5075                 /*
5076                  * We're truncating a file that used to have good data down to
5077                  * zero. Make sure it gets into the ordered flush list so that
5078                  * any new writes get down to disk quickly.
5079                  */
5080                 if (newsize == 0)
5081                         set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
5082                                 &BTRFS_I(inode)->runtime_flags);
5083
5084                 /*
5085                  * 1 for the orphan item we're going to add
5086                  * 1 for the orphan item deletion.
5087                  */
5088                 trans = btrfs_start_transaction(root, 2);
5089                 if (IS_ERR(trans))
5090                         return PTR_ERR(trans);
5091
5092                 /*
5093                  * We need to do this in case we fail at _any_ point during the
5094                  * actual truncate.  Once we do the truncate_setsize we could
5095                  * invalidate pages, which forces any outstanding ordered io to
5096                  * be instantly completed, which will give us extents that need
5097                  * to be truncated.  If we fail to add the orphan item we
5098                  * could have left over extents that were never meant to live,
5099                  * so we need to guarantee from this point on that everything
5100                  * will be consistent.
5101                  */
5102                 ret = btrfs_orphan_add(trans, inode);
5103                 btrfs_end_transaction(trans, root);
5104                 if (ret)
5105                         return ret;
5106
5107                 /* we don't support swapfiles, so vmtruncate shouldn't fail */
5108                 truncate_setsize(inode, newsize);
5109
5110                 /* Disable non-locked read DIO to avoid an endless truncate */
5111                 btrfs_inode_block_unlocked_dio(inode);
5112                 inode_dio_wait(inode);
5113                 btrfs_inode_resume_unlocked_dio(inode);
5114
5115                 ret = btrfs_truncate(inode);
5116                 if (ret && inode->i_nlink) {
5117                         int err;
5118
5119                         /*
5120                          * failed to truncate; disk_i_size is only adjusted down
5121                          * as we remove extents, so it should represent the true
5122                          * size of the inode.  Reset the in-memory size to it and
5123                          * delete our orphan entry.
5124                          */
5125                         trans = btrfs_join_transaction(root);
5126                         if (IS_ERR(trans)) {
5127                                 btrfs_orphan_del(NULL, inode);
5128                                 return ret;
5129                         }
5130                         i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5131                         err = btrfs_orphan_del(trans, inode);
5132                         if (err)
5133                                 btrfs_abort_transaction(trans, err);
5134                         btrfs_end_transaction(trans, root);
5135                 }
5136         }
5137
5138         return ret;
5139 }
5140
5141 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
5142 {
5143         struct inode *inode = d_inode(dentry);
5144         struct btrfs_root *root = BTRFS_I(inode)->root;
5145         int err;
5146
5147         if (btrfs_root_readonly(root))
5148                 return -EROFS;
5149
5150         err = setattr_prepare(dentry, attr);
5151         if (err)
5152                 return err;
5153
5154         if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5155                 err = btrfs_setsize(inode, attr);
5156                 if (err)
5157                         return err;
5158         }
5159
5160         if (attr->ia_valid) {
5161                 setattr_copy(inode, attr);
5162                 inode_inc_iversion(inode);
5163                 err = btrfs_dirty_inode(inode);
5164
5165                 if (!err && attr->ia_valid & ATTR_MODE)
5166                         err = posix_acl_chmod(inode, inode->i_mode);
5167         }
5168
5169         return err;
5170 }
5171
5172 /*
5173  * While truncating the inode pages during eviction, we get the VFS calling
5174  * btrfs_invalidatepage() against each page of the inode. This is slow because
5175  * the calls to btrfs_invalidatepage() result in a huge amount of calls to
5176  * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
5177  * extent_state structures over and over, wasting lots of time.
5178  *
5179  * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
5180  * those expensive operations on a per-page basis and do only the ordered io
5181  * finishing, while we release here the extent_map and extent_state structures,
5182  * without the excessive merging and splitting.
5183  */
5184 static void evict_inode_truncate_pages(struct inode *inode)
5185 {
5186         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5187         struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
5188         struct rb_node *node;
5189
5190         ASSERT(inode->i_state & I_FREEING);
5191         truncate_inode_pages_final(&inode->i_data);
5192
5193         write_lock(&map_tree->lock);
5194         while (!RB_EMPTY_ROOT(&map_tree->map)) {
5195                 struct extent_map *em;
5196
5197                 node = rb_first(&map_tree->map);
5198                 em = rb_entry(node, struct extent_map, rb_node);
5199                 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
5200                 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
5201                 remove_extent_mapping(map_tree, em);
5202                 free_extent_map(em);
5203                 if (need_resched()) {
5204                         write_unlock(&map_tree->lock);
5205                         cond_resched();
5206                         write_lock(&map_tree->lock);
5207                 }
5208         }
5209         write_unlock(&map_tree->lock);
5210
5211         /*
5212          * Keep looping until we have no more ranges in the io tree.
5213          * We can have ongoing bios started by readpages (called from readahead)
5214          * whose endio callback (extent_io.c:end_bio_extent_readpage) is
5215          * still in progress (it unlocked the pages in the bio but has not
5216          * yet unlocked the ranges in the io tree), so some ranges can still
5217          * be locked while eviction has started: those bios are submitted
5218          * without taking an inode reference (inode->i_count), and they are
5219          * executed by a separate task (a work queue kthread) that would
5220          * drop such a reference in the endio callback of each bio.
5221          * Therefore here we effectively end up waiting for those bios and
5222          * anyone else holding locked ranges without having bumped the inode's
5223          * reference count - if we don't do it, when they access the inode's
5224          * io_tree to unlock a range it may be too late, leading to a
5225          * use-after-free issue.
5226          */
5227         spin_lock(&io_tree->lock);
5228         while (!RB_EMPTY_ROOT(&io_tree->state)) {
5229                 struct extent_state *state;
5230                 struct extent_state *cached_state = NULL;
5231                 u64 start;
5232                 u64 end;
5233
5234                 node = rb_first(&io_tree->state);
5235                 state = rb_entry(node, struct extent_state, rb_node);
5236                 start = state->start;
5237                 end = state->end;
5238                 spin_unlock(&io_tree->lock);
5239
5240                 lock_extent_bits(io_tree, start, end, &cached_state);
5241
5242                 /*
5243                  * If the range still has the DELALLOC flag, the extent never
5244                  * reached disk and its reserved space won't be freed by a
5245                  * delayed ref, so we need to free that reserved space here.
5246                  * (Refer to the comment in btrfs_invalidatepage, case 2)
5247                  *
5248                  * Note: end is the offset of the last byte, so we need + 1 here.
5249                  */
5250                 if (state->state & EXTENT_DELALLOC)
5251                         btrfs_qgroup_free_data(inode, start, end - start + 1);
5252
5253                 clear_extent_bit(io_tree, start, end,
5254                                  EXTENT_LOCKED | EXTENT_DIRTY |
5255                                  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
5256                                  EXTENT_DEFRAG, 1, 1,
5257                                  &cached_state, GFP_NOFS);
5258
5259                 cond_resched();
5260                 spin_lock(&io_tree->lock);
5261         }
5262         spin_unlock(&io_tree->lock);
5263 }
5264
5265 void btrfs_evict_inode(struct inode *inode)
5266 {
5267         struct btrfs_trans_handle *trans;
5268         struct btrfs_root *root = BTRFS_I(inode)->root;
5269         struct btrfs_block_rsv *rsv, *global_rsv;
5270         int steal_from_global = 0;
5271         u64 min_size;
5272         int ret;
5273
5274         trace_btrfs_inode_evict(inode);
5275
5276         if (!root) {
5277                 clear_inode(inode);
5278                 return;
5279         }
5280
5281         min_size = btrfs_calc_trunc_metadata_size(root, 1);
5282
5283         evict_inode_truncate_pages(inode);
5284
5285         if (inode->i_nlink &&
5286             ((btrfs_root_refs(&root->root_item) != 0 &&
5287               root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5288              btrfs_is_free_space_inode(inode)))
5289                 goto no_delete;
5290
5291         if (is_bad_inode(inode)) {
5292                 btrfs_orphan_del(NULL, inode);
5293                 goto no_delete;
5294         }
5295         /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
5296         if (!special_file(inode->i_mode))
5297                 btrfs_wait_ordered_range(inode, 0, (u64)-1);
5298
5299         btrfs_free_io_failure_record(inode, 0, (u64)-1);
5300
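        /*
         * While log replay is still running we must not delete this inode:
         * no orphan item has been registered for it yet (asserted below),
         * and orphan cleanup after replay handles anything left over.
         */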
5301         if (test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
5302                 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
5303                                  &BTRFS_I(inode)->runtime_flags));
5304                 goto no_delete;
5305         }
5306
5307         if (inode->i_nlink > 0) {
5308                 BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5309                        root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5310                 goto no_delete;
5311         }
5312
5313         ret = btrfs_commit_inode_delayed_inode(inode);
5314         if (ret) {
5315                 btrfs_orphan_del(NULL, inode);
5316                 goto no_delete;
5317         }
5318
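        /*
         * Set up a temporary reservation, sized for a single inode item
         * update, for the truncation loop below. With failfast set, running
         * out of this reservation returns an error right away instead of
         * falling back to other space, which lets the loop refill it or
         * steal from the global reserve.
         */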
5319         rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
5320         if (!rsv) {
5321                 btrfs_orphan_del(NULL, inode);
5322                 goto no_delete;
5323         }
5324         rsv->size = min_size;
5325         rsv->failfast = 1;
5326         global_rsv = &root->fs_info->global_block_rsv;
5327
5328         btrfs_i_size_write(inode, 0);
5329
5330         /*
5331          * This is a bit simpler than btrfs_truncate since we've already
5332          * reserved our space for our orphan item in the unlink, so we just
5333          * need to reserve some slack space in case we add bytes and update
5334          * inode item when doing the truncate.
5335          */
5336         while (1) {
5337                 ret = btrfs_block_rsv_refill(root, rsv, min_size,
5338                                              BTRFS_RESERVE_FLUSH_LIMIT);
5339
5340                 /*
5341                  * Try and steal from the global reserve since we will
5342                  * likely not use this space anyway, we want to try as
5343                  * hard as possible to get this to work.
5344                  */
5345                 if (ret)
5346                         steal_from_global++;
5347                 else
5348                         steal_from_global = 0;
5349                 ret = 0;
5350
5351                 /*
5352                  * steal_from_global == 0: we reserved stuff, hooray!
5353                  * steal_from_global == 1: we didn't reserve stuff, boo!
5354                  * steal_from_global == 2: we've committed, still not a lot of
5355                  * room but maybe we'll have room in the global reserve this
5356                  * time.
5357                  * steal_from_global == 3: abandon all hope!
5358                  */
5359                 if (steal_from_global > 2) {
5360                         btrfs_warn(root->fs_info,
5361                                 "Could not get space for a delete, will truncate on mount %d",
5362                                 ret);
5363                         btrfs_orphan_del(NULL, inode);
5364                         btrfs_free_block_rsv(root, rsv);
5365                         goto no_delete;
5366                 }
5367
5368                 trans = btrfs_join_transaction(root);
5369                 if (IS_ERR(trans)) {
5370                         btrfs_orphan_del(NULL, inode);
5371                         btrfs_free_block_rsv(root, rsv);
5372                         goto no_delete;
5373                 }
5374
5375                 /*
5376                  * We can't just steal from the global reserve, we need to make
5377                  * sure there is room to do it, if not we need to commit and try
5378                  * again.
5379                  */
5380                 if (steal_from_global) {
5381                         if (!btrfs_check_space_for_delayed_refs(trans, root))
5382                                 ret = btrfs_block_rsv_migrate(global_rsv, rsv,
5383                                                               min_size, 0);
5384                         else
5385                                 ret = -ENOSPC;
5386                 }
5387
5388                 /*
5389                  * Couldn't steal from the global reserve, we have too much
5390                  * pending stuff built up, commit the transaction and try it
5391                  * again.
5392                  */
5393                 if (ret) {
5394                         ret = btrfs_commit_transaction(trans, root);
5395                         if (ret) {
5396                                 btrfs_orphan_del(NULL, inode);
5397                                 btrfs_free_block_rsv(root, rsv);
5398                                 goto no_delete;
5399                         }
5400                         continue;
5401                 } else {
5402                         steal_from_global = 0;
5403                 }
5404
5405                 trans->block_rsv = rsv;
5406
5407                 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
5408                 if (ret != -ENOSPC && ret != -EAGAIN)
5409                         break;
5410
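                /*
                 * -ENOSPC or -EAGAIN means the reservation ran dry part way
                 * through the truncation. End this transaction, flush dirty
                 * btree pages, and loop around to refill the reservation and
                 * continue where btrfs_truncate_inode_items() left off.
                 */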
5411                 trans->block_rsv = &root->fs_info->trans_block_rsv;
5412                 btrfs_end_transaction(trans, root);
5413                 trans = NULL;
5414                 btrfs_btree_balance_dirty(root);
5415         }
5416
5417         btrfs_free_block_rsv(root, rsv);
5418
5419         /*
5420          * Errors here aren't a big deal, it just means we leave orphan items
5421          * in the tree.  They will be cleaned up on the next mount.
5422          */
5423         if (ret == 0) {
5424                 trans->block_rsv = root->orphan_block_rsv;
5425                 btrfs_orphan_del(trans, inode);
5426         } else {
5427                 btrfs_orphan_del(NULL, inode);
5428         }
5429
5430         trans->block_rsv = &root->fs_info->trans_block_rsv;
5431         if (!(root == root->fs_info->tree_root ||
5432               root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
5433                 btrfs_return_ino(root, btrfs_ino(inode));
5434
5435         btrfs_end_transaction(trans, root);
5436         btrfs_btree_balance_dirty(root);
5437 no_delete:
5438         btrfs_remove_delayed_node(inode);
5439         clear_inode(inode);
5440 }
5441
5442 /*
5443  * Return the key found in the dir entry in the location pointer, fill @type
5444  * with BTRFS_FT_*, and return 0.
5445  *
5446  * If no dir entries were found, location->objectid is 0.
5447  */
5448 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
5449                                struct btrfs_key *location, u8 *type)
5450 {
5451         const char *name = dentry->d_name.name;
5452         int namelen = dentry->d_name.len;
5453         struct btrfs_dir_item *di;
5454         struct btrfs_path *path;
5455         struct btrfs_root *root = BTRFS_I(dir)->root;
5456         int ret = 0;
5457
5458         path = btrfs_alloc_path();
5459         if (!path)
5460                 return -ENOMEM;
5461
5462         di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
5463                                     namelen, 0);
5464         if (IS_ERR(di))
5465                 ret = PTR_ERR(di);
5466
5467         if (IS_ERR_OR_NULL(di))
5468                 goto out_err;
5469
5470         btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5471         if (!ret)
5472                 *type = btrfs_dir_type(path->nodes[0], di);
5473 out:
5474         btrfs_free_path(path);
5475         return ret;
5476 out_err:
5477         location->objectid = 0;
5478         goto out;
5479 }
5480
5481 /*
5482  * when we hit a tree root in a directory, the btrfs part of the inode
5483  * needs to be changed to reflect the root directory of the tree root.  This
5484  * is kind of like crossing a mount point.
5485  */
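/*
 * An illustrative sketch of the lookup done below (names here are
 * descriptive, not identifiers from this file): a subvolume reference is
 * recorded in the tree of tree roots as a root ref item keyed
 *
 *	(parent_subvol_id, BTRFS_ROOT_REF_KEY, child_subvol_id)
 *
 * so resolving @dentry means searching for that key and then comparing
 * the name stored right after the btrfs_root_ref item.
 */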
5486 static int fixup_tree_root_location(struct btrfs_root *root,
5487                                     struct inode *dir,
5488                                     struct dentry *dentry,
5489                                     struct btrfs_key *location,
5490                                     struct btrfs_root **sub_root)
5491 {
5492         struct btrfs_path *path;
5493         struct btrfs_root *new_root;
5494         struct btrfs_root_ref *ref;
5495         struct extent_buffer *leaf;
5496         struct btrfs_key key;
5497         int ret;
5498         int err = 0;
5499
5500         path = btrfs_alloc_path();
5501         if (!path) {
5502                 err = -ENOMEM;
5503                 goto out;
5504         }
5505
5506         err = -ENOENT;
5507         key.objectid = BTRFS_I(dir)->root->root_key.objectid;
5508         key.type = BTRFS_ROOT_REF_KEY;
5509         key.offset = location->objectid;
5510
5511         ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
5512                                 0, 0);
5513         if (ret) {
5514                 if (ret < 0)
5515                         err = ret;
5516                 goto out;
5517         }
5518
5519         leaf = path->nodes[0];
5520         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5521         if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
5522             btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
5523                 goto out;
5524
5525         ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
5526                                    (unsigned long)(ref + 1),
5527                                    dentry->d_name.len);
5528         if (ret)
5529                 goto out;
5530
5531         btrfs_release_path(path);
5532
5533         new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
5534         if (IS_ERR(new_root)) {
5535                 err = PTR_ERR(new_root);
5536                 goto out;
5537         }
5538
5539         *sub_root = new_root;
5540         location->objectid = btrfs_root_dirid(&new_root->root_item);
5541         location->type = BTRFS_INODE_ITEM_KEY;
5542         location->offset = 0;
5543         err = 0;
5544 out:
5545         btrfs_free_path(path);
5546         return err;
5547 }
5548
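/*
 * Link the inode into the per-root red-black tree of in-memory inodes,
 * ordered by inode number; btrfs_invalidate_inodes() walks this tree to
 * find every inode that belongs to a root.
 */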
5549 static void inode_tree_add(struct inode *inode)
5550 {
5551         struct btrfs_root *root = BTRFS_I(inode)->root;
5552         struct btrfs_inode *entry;
5553         struct rb_node **p;
5554         struct rb_node *parent;
5555         struct rb_node *new = &BTRFS_I(inode)->rb_node;
5556         u64 ino = btrfs_ino(inode);
5557
5558         if (inode_unhashed(inode))
5559                 return;
5560         parent = NULL;
5561         spin_lock(&root->inode_lock);
5562         p = &root->inode_tree.rb_node;
5563         while (*p) {
5564                 parent = *p;
5565                 entry = rb_entry(parent, struct btrfs_inode, rb_node);
5566
5567                 if (ino < btrfs_ino(&entry->vfs_inode))
5568                         p = &parent->rb_left;
5569                 else if (ino > btrfs_ino(&entry->vfs_inode))
5570                         p = &parent->rb_right;
5571                 else {
5572                         WARN_ON(!(entry->vfs_inode.i_state &
5573                                   (I_WILL_FREE | I_FREEING)));
5574                         rb_replace_node(parent, new, &root->inode_tree);
5575                         RB_CLEAR_NODE(parent);
5576                         spin_unlock(&root->inode_lock);
5577                         return;
5578                 }
5579         }
5580         rb_link_node(new, parent, p);
5581         rb_insert_color(new, &root->inode_tree);
5582         spin_unlock(&root->inode_lock);
5583 }
5584
5585 static void inode_tree_del(struct inode *inode)
5586 {
5587         struct btrfs_root *root = BTRFS_I(inode)->root;
5588         int empty = 0;
5589
5590         spin_lock(&root->inode_lock);
5591         if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
5592                 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
5593                 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
5594                 empty = RB_EMPTY_ROOT(&root->inode_tree);
5595         }
5596         spin_unlock(&root->inode_lock);
5597
5598         if (empty && btrfs_root_refs(&root->root_item) == 0) {
5599                 spin_lock(&root->inode_lock);
5600                 empty = RB_EMPTY_ROOT(&root->inode_tree);
5601                 spin_unlock(&root->inode_lock);
5602                 if (empty)
5603                         btrfs_add_dead_root(root);
5604         }
5605 }
5606
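/*
 * Drop all in-memory inodes of the given root by walking its rb-tree in
 * inode number order. The lock is dropped whenever we may sleep, so the
 * walk restarts from the next inode number (objectid) each time.
 */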
5607 void btrfs_invalidate_inodes(struct btrfs_root *root)
5608 {
5609         struct rb_node *node;
5610         struct rb_node *prev;
5611         struct btrfs_inode *entry;
5612         struct inode *inode;
5613         u64 objectid = 0;
5614
5615         if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
5616                 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
5617
5618         spin_lock(&root->inode_lock);
5619 again:
5620         node = root->inode_tree.rb_node;
5621         prev = NULL;
5622         while (node) {
5623                 prev = node;
5624                 entry = rb_entry(node, struct btrfs_inode, rb_node);
5625
5626                 if (objectid < btrfs_ino(&entry->vfs_inode))
5627                         node = node->rb_left;
5628                 else if (objectid > btrfs_ino(&entry->vfs_inode))
5629                         node = node->rb_right;
5630                 else
5631                         break;
5632         }
5633         if (!node) {
5634                 while (prev) {
5635                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
5636                         if (objectid <= btrfs_ino(&entry->vfs_inode)) {
5637                                 node = prev;
5638                                 break;
5639                         }
5640                         prev = rb_next(prev);
5641                 }
5642         }
5643         while (node) {
5644                 entry = rb_entry(node, struct btrfs_inode, rb_node);
5645                 objectid = btrfs_ino(&entry->vfs_inode) + 1;
5646                 inode = igrab(&entry->vfs_inode);
5647                 if (inode) {
5648                         spin_unlock(&root->inode_lock);
5649                         if (atomic_read(&inode->i_count) > 1)
5650                                 d_prune_aliases(inode);
5651                         /*
5652                          * btrfs_drop_inode will have it removed from
5653                          * the inode cache when its usage count
5654                          * hits zero.
5655                          */
5656                         iput(inode);
5657                         cond_resched();
5658                         spin_lock(&root->inode_lock);
5659                         goto again;
5660                 }
5661
5662                 if (cond_resched_lock(&root->inode_lock))
5663                         goto again;
5664
5665                 node = rb_next(node);
5666         }
5667         spin_unlock(&root->inode_lock);
5668 }
5669
5670 static int btrfs_init_locked_inode(struct inode *inode, void *p)
5671 {
5672         struct btrfs_iget_args *args = p;
5673         inode->i_ino = args->location->objectid;
5674         memcpy(&BTRFS_I(inode)->location, args->location,
5675                sizeof(*args->location));
5676         BTRFS_I(inode)->root = args->root;
5677         return 0;
5678 }
5679
5680 static int btrfs_find_actor(struct inode *inode, void *opaque)
5681 {
5682         struct btrfs_iget_args *args = opaque;
5683         return args->location->objectid == BTRFS_I(inode)->location.objectid &&
5684                 args->root == BTRFS_I(inode)->root;
5685 }
5686
5687 static struct inode *btrfs_iget_locked(struct super_block *s,
5688                                        struct btrfs_key *location,
5689                                        struct btrfs_root *root)
5690 {
5691         struct inode *inode;
5692         struct btrfs_iget_args args;
5693         unsigned long hashval = btrfs_inode_hash(location->objectid, root);
5694
5695         args.location = location;
5696         args.root = root;
5697
5698         inode = iget5_locked(s, hashval, btrfs_find_actor,
5699                              btrfs_init_locked_inode,
5700                              (void *)&args);
5701         return inode;
5702 }
5703
5704 /* Get an inode object given its location and corresponding root.
5705  * Returns in *new whether the inode was read from disk.
5706  */
5707 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
5708                          struct btrfs_root *root, int *new)
5709 {
5710         struct inode *inode;
5711
5712         inode = btrfs_iget_locked(s, location, root);
5713         if (!inode)
5714                 return ERR_PTR(-ENOMEM);
5715
5716         if (inode->i_state & I_NEW) {
5717                 int ret;
5718
5719                 ret = btrfs_read_locked_inode(inode);
5720                 if (!is_bad_inode(inode)) {
5721                         inode_tree_add(inode);
5722                         unlock_new_inode(inode);
5723                         if (new)
5724                                 *new = 1;
5725                 } else {
5726                         unlock_new_inode(inode);
5727                         iput(inode);
5728                         ASSERT(ret < 0);
5729                         inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
5730                 }
5731         }
5732
5733         return inode;
5734 }
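/*
 * A minimal usage sketch of btrfs_iget() (the variable names are
 * illustrative; see btrfs_lookup_dentry() below for a real caller):
 *
 *	struct btrfs_key loc = { .objectid = ino,
 *				 .type = BTRFS_INODE_ITEM_KEY,
 *				 .offset = 0 };
 *	struct inode *inode = btrfs_iget(sb, &loc, root, NULL);
 *
 *	if (IS_ERR(inode))
 *		return PTR_ERR(inode);
 */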
5735
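/*
 * Build a dummy in-memory directory inode. Used by btrfs_lookup_dentry()
 * when a directory entry references a subvolume whose root ref item
 * cannot be found (fixup_tree_root_location() returned -ENOENT).
 */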
5736 static struct inode *new_simple_dir(struct super_block *s,
5737                                     struct btrfs_key *key,
5738                                     struct btrfs_root *root)
5739 {
5740         struct inode *inode = new_inode(s);
5741
5742         if (!inode)
5743                 return ERR_PTR(-ENOMEM);
5744
5745         BTRFS_I(inode)->root = root;
5746         memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5747         set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5748
5749         inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5750         inode->i_op = &btrfs_dir_ro_inode_operations;
5751         inode->i_opflags &= ~IOP_XATTR;
5752         inode->i_fop = &simple_dir_operations;
5753         inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5754         inode->i_mtime = current_time(inode);
5755         inode->i_atime = inode->i_mtime;
5756         inode->i_ctime = inode->i_mtime;
5757         BTRFS_I(inode)->i_otime = inode->i_mtime;
5758
5759         return inode;
5760 }
5761
5762 static inline u8 btrfs_inode_type(struct inode *inode)
5763 {
5764         return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
5765 }
5766
5767 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5768 {
5769         struct inode *inode;
5770         struct btrfs_root *root = BTRFS_I(dir)->root;
5771         struct btrfs_root *sub_root = root;
5772         struct btrfs_key location;
5773         u8 di_type = 0;
5774         int index;
5775         int ret = 0;
5776
5777         if (dentry->d_name.len > BTRFS_NAME_LEN)
5778                 return ERR_PTR(-ENAMETOOLONG);
5779
5780         ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
5781         if (ret < 0)
5782                 return ERR_PTR(ret);
5783
5784         if (location.objectid == 0)
5785                 return ERR_PTR(-ENOENT);
5786
5787         if (location.type == BTRFS_INODE_ITEM_KEY) {
5788                 inode = btrfs_iget(dir->i_sb, &location, root, NULL);
5789                 if (IS_ERR(inode))
5790                         return inode;
5791
5792                 /* Do extra check against inode mode with di_type */
5793                 if (btrfs_inode_type(inode) != di_type) {
5794                         btrfs_crit(root->fs_info,
5795 "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5796                                   inode->i_mode, btrfs_inode_type(inode),
5797                                   di_type);
5798                         iput(inode);
5799                         return ERR_PTR(-EUCLEAN);
5800                 }
5801                 return inode;
5802         }
5803
5804         BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
5805
5806         index = srcu_read_lock(&root->fs_info->subvol_srcu);
5807         ret = fixup_tree_root_location(root, dir, dentry,
5808                                        &location, &sub_root);
5809         if (ret < 0) {
5810                 if (ret != -ENOENT)
5811                         inode = ERR_PTR(ret);
5812                 else
5813                         inode = new_simple_dir(dir->i_sb, &location, sub_root);
5814         } else {
5815                 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
5816         }
5817         srcu_read_unlock(&root->fs_info->subvol_srcu, index);
5818
5819         if (!IS_ERR(inode) && root != sub_root) {
5820                 down_read(&root->fs_info->cleanup_work_sem);
5821                 if (!(inode->i_sb->s_flags & MS_RDONLY))
5822                         ret = btrfs_orphan_cleanup(sub_root);
5823                 up_read(&root->fs_info->cleanup_work_sem);
5824                 if (ret) {
5825                         iput(inode);
5826                         inode = ERR_PTR(ret);
5827                 }
5828         }
5829
5830         return inode;
5831 }
5832
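/*
 * Tell the dcache to discard a dentry when it belongs to a deleted
 * subvolume (a root with zero root refs) or to the dummy directory
 * created by new_simple_dir().
 */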
5833 static int btrfs_dentry_delete(const struct dentry *dentry)
5834 {
5835         struct btrfs_root *root;
5836         struct inode *inode = d_inode(dentry);
5837
5838         if (!inode && !IS_ROOT(dentry))
5839                 inode = d_inode(dentry->d_parent);
5840
5841         if (inode) {
5842                 root = BTRFS_I(inode)->root;
5843                 if (btrfs_root_refs(&root->root_item) == 0)
5844                         return 1;
5845
5846                 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5847                         return 1;
5848         }
5849         return 0;
5850 }
5851
5852 static void btrfs_dentry_release(struct dentry *dentry)
5853 {
5854         kfree(dentry->d_fsdata);
5855 }
5856
5857 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5858                                    unsigned int flags)
5859 {
5860         struct inode *inode;
5861
5862         inode = btrfs_lookup_dentry(dir, dentry);
5863         if (IS_ERR(inode)) {
5864                 if (PTR_ERR(inode) == -ENOENT)
5865                         inode = NULL;
5866                 else
5867                         return ERR_CAST(inode);
5868         }
5869
5870         return d_splice_alias(inode, dentry);
5871 }
5872
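/* Map the on-disk BTRFS_FT_* dir entry types to the VFS DT_* values. */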
5873 unsigned char btrfs_filetype_table[] = {
5874         DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
5875 };
5876
5877 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5878 {
5879         struct inode *inode = file_inode(file);
5880         struct btrfs_root *root = BTRFS_I(inode)->root;
5881         struct btrfs_item *item;
5882         struct btrfs_dir_item *di;
5883         struct btrfs_key key;
5884         struct btrfs_key found_key;
5885         struct btrfs_path *path;
5886         struct list_head ins_list;
5887         struct list_head del_list;
5888         int ret;
5889         struct extent_buffer *leaf;
5890         int slot;
5891         unsigned char d_type;
5892         int over = 0;
5893         u32 di_cur;
5894         u32 di_total;
5895         u32 di_len;
5896         int key_type = BTRFS_DIR_INDEX_KEY;
5897         char tmp_name[32];
5898         char *name_ptr;
5899         int name_len;
5900         int is_curr = 0;        /* ctx->pos points to the current index? */
5901         bool emitted;
5902         bool put = false;
5903
5904         /* FIXME, use a real flag for deciding about the key type */
5905         if (root->fs_info->tree_root == root)
5906                 key_type = BTRFS_DIR_ITEM_KEY;
5907
5908         if (!dir_emit_dots(file, ctx))
5909                 return 0;
5910
5911         path = btrfs_alloc_path();
5912         if (!path)
5913                 return -ENOMEM;
5914
5915         path->reada = READA_FORWARD;
5916
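        /*
         * For normal directories, the results must merge what is on disk
         * with the delayed items: ins_list carries dir index items that
         * have not been written back to the btree yet, and del_list
         * carries indexes whose deletion is still pending.
         */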
5917         if (key_type == BTRFS_DIR_INDEX_KEY) {
5918                 INIT_LIST_HEAD(&ins_list);
5919                 INIT_LIST_HEAD(&del_list);
5920                 put = btrfs_readdir_get_delayed_items(inode, &ins_list,
5921                                                       &del_list);
5922         }
5923
5924         key.type = key_type;
5925         key.offset = ctx->pos;
5926         key.objectid = btrfs_ino(inode);
5927
5928         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5929         if (ret < 0)
5930                 goto err;
5931
5932         emitted = false;
5933         while (1) {
5934                 leaf = path->nodes[0];
5935                 slot = path->slots[0];
5936                 if (slot >= btrfs_header_nritems(leaf)) {
5937                         ret = btrfs_next_leaf(root, path);
5938                         if (ret < 0)
5939                                 goto err;
5940                         else if (ret > 0)
5941                                 break;
5942                         continue;
5943                 }
5944
5945                 item = btrfs_item_nr(slot);
5946                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5947
5948                 if (found_key.objectid != key.objectid)
5949                         break;
5950                 if (found_key.type != key_type)
5951                         break;
5952                 if (found_key.offset < ctx->pos)
5953                         goto next;
5954                 if (key_type == BTRFS_DIR_INDEX_KEY &&
5955                     btrfs_should_delete_dir_index(&del_list,
5956                                                   found_key.offset))
5957                         goto next;
5958
5959                 ctx->pos = found_key.offset;
5960                 is_curr = 1;
5961
5962                 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5963                 di_cur = 0;
5964                 di_total = btrfs_item_size(leaf, item);
5965
5966                 while (di_cur < di_total) {
5967                         struct btrfs_key location;
5968
5969                         if (verify_dir_item(root, leaf, di))
5970                                 break;
5971
5972                         name_len = btrfs_dir_name_len(leaf, di);
5973                         if (name_len <= sizeof(tmp_name)) {
5974                                 name_ptr = tmp_name;
5975                         } else {
5976                                 name_ptr = kmalloc(name_len, GFP_KERNEL);
5977                                 if (!name_ptr) {
5978                                         ret = -ENOMEM;
5979                                         goto err;
5980                                 }
5981                         }
5982                         read_extent_buffer(leaf, name_ptr,
5983                                            (unsigned long)(di + 1), name_len);
5984
5985                         d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
5986                         btrfs_dir_item_key_to_cpu(leaf, di, &location);
5987
5988
5989                         /* is this a reference to our own snapshot? If so
5990                          * skip it.
5991                          *
5992                          * In contrast to old kernels, we insert the snapshot's
5993                          * dir item and dir index after it has been created, so
5994                          * we won't find a reference to our own snapshot. We
5995                          * still keep the following code for backward
5996                          * compatibility.
5997                          */
5998                         if (location.type == BTRFS_ROOT_ITEM_KEY &&
5999                             location.objectid == root->root_key.objectid) {
6000                                 over = 0;
6001                                 goto skip;
6002                         }
6003                         over = !dir_emit(ctx, name_ptr, name_len,
6004                                        location.objectid, d_type);
6005
6006 skip:
6007                         if (name_ptr != tmp_name)
6008                                 kfree(name_ptr);
6009
6010                         if (over)
6011                                 goto nopos;
6012                         emitted = true;
6013                         di_len = btrfs_dir_name_len(leaf, di) +
6014                                  btrfs_dir_data_len(leaf, di) + sizeof(*di);
6015                         di_cur += di_len;
6016                         di = (struct btrfs_dir_item *)((char *)di + di_len);
6017                 }
6018 next:
6019                 path->slots[0]++;
6020         }
6021
6022         if (key_type == BTRFS_DIR_INDEX_KEY) {
6023                 if (is_curr)
6024                         ctx->pos++;
6025                 ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted);
6026                 if (ret)
6027                         goto nopos;
6028         }
6029
6030         /*
6031          * If we haven't emitted any dir entry, we must not touch ctx->pos as
6032          * it was set to the termination value in a previous call. We assume
6033          * that "." and ".." were emitted if we reach this point, and we set
6034          * the termination value as well for an empty directory.
6035          */
6036         if (ctx->pos > 2 && !emitted)
6037                 goto nopos;
6038
6039         /* Reached end of directory/root. Bump pos past the last item. */
6040         ctx->pos++;
6041
6042         /*
6043          * Stop new entries from being returned after we return the last
6044          * entry.
6045          *
6046          * New directory entries are assigned a strictly increasing
6047          * offset.  This means that new entries created during readdir
6048          * are *guaranteed* to be seen in the future by that readdir.
6049          * This has broken buggy programs which operate on names as
6050          * they're returned by readdir.  Until we re-use freed offsets
6051          * we have this hack to stop new entries from being returned
6052          * under the assumption that they'll never reach this huge
6053          * offset.
6054          *
6055          * This is being careful not to overflow 32bit loff_t unless the
6056          * last entry requires it because doing so has broken 32bit apps
6057          * in the past.
6058          */
6059         if (key_type == BTRFS_DIR_INDEX_KEY) {
6060                 if (ctx->pos >= INT_MAX)
6061                         ctx->pos = LLONG_MAX;
6062                 else
6063                         ctx->pos = INT_MAX;
6064         }
6065 nopos:
6066         ret = 0;
6067 err:
6068         if (put)
6069                 btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
6070         btrfs_free_path(path);
6071         return ret;
6072 }
6073
6074 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
6075 {
6076         struct btrfs_root *root = BTRFS_I(inode)->root;
6077         struct btrfs_trans_handle *trans;
6078         int ret = 0;
6079         bool nolock = false;
6080
6081         if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
6082                 return 0;
6083
6084         if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
6085                 nolock = true;
6086
6087         if (wbc->sync_mode == WB_SYNC_ALL) {
6088                 if (nolock)
6089                         trans = btrfs_join_transaction_nolock(root);
6090                 else
6091                         trans = btrfs_join_transaction(root);
6092                 if (IS_ERR(trans))
6093                         return PTR_ERR(trans);
6094                 ret = btrfs_commit_transaction(trans, root);
6095         }
6096         return ret;
6097 }
6098
6099 /*
6100  * This is somewhat expensive, updating the tree every time the
6101  * inode changes.  But, it is most likely to find the inode in cache.
6102  * FIXME: needs more benchmarking... there is no reason other than
6103  * performance to keep or drop this code.
6104  */
6105 static int btrfs_dirty_inode(struct inode *inode)
6106 {
6107         struct btrfs_root *root = BTRFS_I(inode)->root;
6108         struct btrfs_trans_handle *trans;
6109         int ret;
6110
6111         if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
6112                 return 0;
6113
6114         trans = btrfs_join_transaction(root);
6115         if (IS_ERR(trans))
6116                 return PTR_ERR(trans);
6117
6118         ret = btrfs_update_inode(trans, root, inode);
6119         if (ret == -ENOSPC) {
6120                 /* whoops, let's try again with the full transaction */
6121                 btrfs_end_transaction(trans, root);
6122                 trans = btrfs_start_transaction(root, 1);
6123                 if (IS_ERR(trans))
6124                         return PTR_ERR(trans);
6125
6126                 ret = btrfs_update_inode(trans, root, inode);
6127         }
6128         btrfs_end_transaction(trans, root);
6129         if (BTRFS_I(inode)->delayed_node)
6130                 btrfs_balance_delayed_items(root);
6131
6132         return ret;
6133 }
6134
6135 /*
6136  * This is a copy of file_update_time.  We need this so we can return error on
6137  * ENOSPC for updating the inode in the case of file write and mmap writes.
6138  */
6139 static int btrfs_update_time(struct inode *inode, struct timespec *now,
6140                              int flags)
6141 {
6142         struct btrfs_root *root = BTRFS_I(inode)->root;
6143
6144         if (btrfs_root_readonly(root))
6145                 return -EROFS;
6146
6147         if (flags & S_VERSION)
6148                 inode_inc_iversion(inode);
6149         if (flags & S_CTIME)
6150                 inode->i_ctime = *now;
6151         if (flags & S_MTIME)
6152                 inode->i_mtime = *now;
6153         if (flags & S_ATIME)
6154                 inode->i_atime = *now;
6155         return btrfs_dirty_inode(inode);
6156 }
6157
6158 /*
6159  * Find the highest existing sequence number in a directory
6160  * and then set the in-memory index_cnt variable to the first
6161  * free sequence number.
6162  */
6163 static int btrfs_set_inode_index_count(struct inode *inode)
6164 {
6165         struct btrfs_root *root = BTRFS_I(inode)->root;
6166         struct btrfs_key key, found_key;
6167         struct btrfs_path *path;
6168         struct extent_buffer *leaf;
6169         int ret;
6170
6171         key.objectid = btrfs_ino(inode);
6172         key.type = BTRFS_DIR_INDEX_KEY;
6173         key.offset = (u64)-1;
6174
6175         path = btrfs_alloc_path();
6176         if (!path)
6177                 return -ENOMEM;
6178
6179         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6180         if (ret < 0)
6181                 goto out;
6182         /* FIXME: we should be able to handle this */
6183         if (ret == 0)
6184                 goto out;
6185         ret = 0;
6186
6187         /*
6188          * MAGIC NUMBER EXPLANATION:
6189          * Since we search a directory based on f_pos, and '.' and '..'
6190          * have f_pos of 0 and 1 respectively, every other entry has to
6191          * start at 2.
6192          */
6193         if (path->slots[0] == 0) {
6194                 BTRFS_I(inode)->index_cnt = 2;
6195                 goto out;
6196         }
6197
6198         path->slots[0]--;
6199
6200         leaf = path->nodes[0];
6201         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6202
6203         if (found_key.objectid != btrfs_ino(inode) ||
6204             found_key.type != BTRFS_DIR_INDEX_KEY) {
6205                 BTRFS_I(inode)->index_cnt = 2;
6206                 goto out;
6207         }
6208
6209         BTRFS_I(inode)->index_cnt = found_key.offset + 1;
6210 out:
6211         btrfs_free_path(path);
6212         return ret;
6213 }
6214
6215 /*
6216  * Helper to find a free sequence number in a given directory.  The current
6217  * code is very simple; later versions will do smarter things in the btree.
6218  */
6219 int btrfs_set_inode_index(struct inode *dir, u64 *index)
6220 {
6221         int ret = 0;
6222
6223         if (BTRFS_I(dir)->index_cnt == (u64)-1) {
6224                 ret = btrfs_inode_delayed_dir_index_count(dir);
6225                 if (ret) {
6226                         ret = btrfs_set_inode_index_count(dir);
6227                         if (ret)
6228                                 return ret;
6229                 }
6230         }
6231
6232         *index = BTRFS_I(dir)->index_cnt;
6233         BTRFS_I(dir)->index_cnt++;
6234
6235         return ret;
6236 }
6237
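/*
 * Hash the new inode into the VFS inode cache using the same hash value
 * and compare function as btrfs_iget_locked(), so concurrent lookups of
 * the same (objectid, root) pair serialize on the I_NEW flag.
 */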
6238 static int btrfs_insert_inode_locked(struct inode *inode)
6239 {
6240         struct btrfs_iget_args args;
6241         args.location = &BTRFS_I(inode)->location;
6242         args.root = BTRFS_I(inode)->root;
6243
6244         return insert_inode_locked4(inode,
6245                    btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6246                    btrfs_find_actor, &args);
6247 }
6248
6249 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
6250                                      struct btrfs_root *root,
6251                                      struct inode *dir,
6252                                      const char *name, int name_len,
6253                                      u64 ref_objectid, u64 objectid,
6254                                      umode_t mode, u64 *index)
6255 {
6256         struct inode *inode;
6257         struct btrfs_inode_item *inode_item;
6258         struct btrfs_key *location;
6259         struct btrfs_path *path;
6260         struct btrfs_inode_ref *ref;
6261         struct btrfs_key key[2];
6262         u32 sizes[2];
6263         int nitems = name ? 2 : 1;
6264         unsigned long ptr;
6265         int ret;
6266
6267         path = btrfs_alloc_path();
6268         if (!path)
6269                 return ERR_PTR(-ENOMEM);
6270
6271         inode = new_inode(root->fs_info->sb);
6272         if (!inode) {
6273                 btrfs_free_path(path);
6274                 return ERR_PTR(-ENOMEM);
6275         }
6276
6277         /*
6278          * O_TMPFILE: set the link count to 0, so that after this point
6279          * we fill in an inode item with the correct link count.
6280          */
6281         if (!name)
6282                 set_nlink(inode, 0);
6283
6284         /*
6285          * we have to initialize this early, so we can reclaim the inode
6286          * number if we fail afterwards in this function.
6287          */
6288         inode->i_ino = objectid;
6289
6290         if (dir && name) {
6291                 trace_btrfs_inode_request(dir);
6292
6293                 ret = btrfs_set_inode_index(dir, index);
6294                 if (ret) {
6295                         btrfs_free_path(path);
6296                         iput(inode);
6297                         return ERR_PTR(ret);
6298                 }
6299         } else if (dir) {
6300                 *index = 0;
6301         }
6302         /*
6303          * index_cnt is ignored for everything but a dir;
6304          * btrfs_set_inode_index_count has an explanation for the magic
6305          * number.
6306          */
6307         BTRFS_I(inode)->index_cnt = 2;
6308         BTRFS_I(inode)->dir_index = *index;
6309         BTRFS_I(inode)->root = root;
6310         BTRFS_I(inode)->generation = trans->transid;
6311         inode->i_generation = BTRFS_I(inode)->generation;
6312
6313         /*
6314          * We could have gotten an inode number from somebody who was fsynced
6315          * and then removed in this same transaction, so let's just set full
6316          * sync since it will be a full sync anyway and this will blow away the
6317          * old info in the log.
6318          */
6319         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
6320
6321         key[0].objectid = objectid;
6322         key[0].type = BTRFS_INODE_ITEM_KEY;
6323         key[0].offset = 0;
6324
6325         sizes[0] = sizeof(struct btrfs_inode_item);
6326
6327         if (name) {
6328                 /*
6329                  * Start new inodes with an inode_ref. This is slightly more
6330                  * efficient for small numbers of hard links since they will
6331                  * be packed into one item. Extended refs will kick in if we
6332                  * add more hard links than can fit in the ref item.
6333                  */
6334                 key[1].objectid = objectid;
6335                 key[1].type = BTRFS_INODE_REF_KEY;
6336                 key[1].offset = ref_objectid;
6337
6338                 sizes[1] = name_len + sizeof(*ref);
6339         }
6340
6341         location = &BTRFS_I(inode)->location;
6342         location->objectid = objectid;
6343         location->offset = 0;
6344         location->type = BTRFS_INODE_ITEM_KEY;
6345
6346         ret = btrfs_insert_inode_locked(inode);
6347         if (ret < 0)
6348                 goto fail;
6349
6350         path->leave_spinning = 1;
6351         ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
6352         if (ret != 0)
6353                 goto fail_unlock;
6354
6355         inode_init_owner(inode, dir, mode);
6356         inode_set_bytes(inode, 0);
6357
6358         inode->i_mtime = current_time(inode);
6359         inode->i_atime = inode->i_mtime;
6360         inode->i_ctime = inode->i_mtime;
6361         BTRFS_I(inode)->i_otime = inode->i_mtime;
6362
6363         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6364                                   struct btrfs_inode_item);
6365         memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
6366                              sizeof(*inode_item));
6367         fill_inode_item(trans, path->nodes[0], inode_item, inode);
6368
6369         if (name) {
6370                 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6371                                      struct btrfs_inode_ref);
6372                 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
6373                 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
6374                 ptr = (unsigned long)(ref + 1);
6375                 write_extent_buffer(path->nodes[0], name, ptr, name_len);
6376         }
6377
6378         btrfs_mark_buffer_dirty(path->nodes[0]);
6379         btrfs_free_path(path);
6380
6381         btrfs_inherit_iflags(inode, dir);
6382
6383         if (S_ISREG(mode)) {
6384                 if (btrfs_test_opt(root->fs_info, NODATASUM))
6385                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6386                 if (btrfs_test_opt(root->fs_info, NODATACOW))
6387                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6388                                 BTRFS_INODE_NODATASUM;
6389         }
6390
6391         inode_tree_add(inode);
6392
6393         trace_btrfs_inode_new(inode);
6394         btrfs_set_inode_last_trans(trans, inode);
6395
6396         btrfs_update_root_times(trans, root);
6397
6398         ret = btrfs_inode_inherit_props(trans, inode, dir);
6399         if (ret)
6400                 btrfs_err(root->fs_info,
6401                           "error inheriting props for ino %llu (root %llu): %d",
6402                           btrfs_ino(inode), root->root_key.objectid, ret);
6403
6404         return inode;
6405
6406 fail_unlock:
6407         unlock_new_inode(inode);
6408 fail:
6409         if (dir && name)
6410                 BTRFS_I(dir)->index_cnt--;
6411         btrfs_free_path(path);
6412         iput(inode);
6413         return ERR_PTR(ret);
6414 }
6415
6416 /*
6417  * utility function to add 'inode' into 'parent_inode' with
6418  * a given name and a given sequence number.
6419  * if 'add_backref' is true, also insert a backref from the
6420  * inode to the parent directory.
6421  */
6422 int btrfs_add_link(struct btrfs_trans_handle *trans,
6423                    struct inode *parent_inode, struct inode *inode,
6424                    const char *name, int name_len, int add_backref, u64 index)
6425 {
6426         int ret = 0;
6427         struct btrfs_key key;
6428         struct btrfs_root *root = BTRFS_I(parent_inode)->root;
6429         u64 ino = btrfs_ino(inode);
6430         u64 parent_ino = btrfs_ino(parent_inode);
6431
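        /*
         * Linking a subvolume root (ino == BTRFS_FIRST_FREE_OBJECTID) is
         * special: the dir item points at the subvolume's root key, and
         * the back reference is a root ref in the tree of tree roots
         * rather than an inode ref.
         */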
6432         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6433                 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
6434         } else {
6435                 key.objectid = ino;
6436                 key.type = BTRFS_INODE_ITEM_KEY;
6437                 key.offset = 0;
6438         }
6439
6440         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6441                 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
6442                                          key.objectid, root->root_key.objectid,
6443                                          parent_ino, index, name, name_len);
6444         } else if (add_backref) {
6445                 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
6446                                              parent_ino, index);
6447         }
6448
6449         /* Nothing to clean up yet */
6450         if (ret)
6451                 return ret;
6452
6453         ret = btrfs_insert_dir_item(trans, root, name, name_len,
6454                                     parent_inode, &key,
6455                                     btrfs_inode_type(inode), index);
6456         if (ret == -EEXIST || ret == -EOVERFLOW)
6457                 goto fail_dir_item;
6458         else if (ret) {
6459                 btrfs_abort_transaction(trans, ret);
6460                 return ret;
6461         }
6462
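        /*
         * A directory's i_size is the sum of the name lengths of its
         * entries, counted twice: once for the dir item and once for the
         * dir index item, hence name_len * 2.
         */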
6463         btrfs_i_size_write(parent_inode, parent_inode->i_size +
6464                            name_len * 2);
6465         inode_inc_iversion(parent_inode);
6466         parent_inode->i_mtime = parent_inode->i_ctime =
6467                 current_time(parent_inode);
6468         ret = btrfs_update_inode(trans, root, parent_inode);
6469         if (ret)
6470                 btrfs_abort_transaction(trans, ret);
6471         return ret;
6472
6473 fail_dir_item:
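        /*
         * Inserting the dir item failed, so undo whichever back reference
         * was added above (root ref or inode ref) and return the original
         * error.
         */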
6474         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6475                 u64 local_index;
6476                 int err;
6477                 err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
6478                                  key.objectid, root->root_key.objectid,
6479                                  parent_ino, &local_index, name, name_len);
6480
6481         } else if (add_backref) {
6482                 u64 local_index;
6483                 int err;
6484
6485                 err = btrfs_del_inode_ref(trans, root, name, name_len,
6486                                           ino, parent_ino, &local_index);
6487         }
6488         return ret;
6489 }
6490
6491 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
6492                             struct inode *dir, struct dentry *dentry,
6493                             struct inode *inode, int backref, u64 index)
6494 {
6495         int err = btrfs_add_link(trans, dir, inode,
6496                                  dentry->d_name.name, dentry->d_name.len,
6497                                  backref, index);
6498         if (err > 0)
6499                 err = -EEXIST;
6500         return err;
6501 }
6502
6503 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
6504                         umode_t mode, dev_t rdev)
6505 {
6506         struct btrfs_trans_handle *trans;
6507         struct btrfs_root *root = BTRFS_I(dir)->root;
6508         struct inode *inode = NULL;
6509         int err;
6510         int drop_inode = 0;
6511         u64 objectid;
6512         u64 index = 0;
6513
6514         /*
6515          * 2 for inode item and ref
6516          * 2 for dir items
6517          * 1 for xattr if selinux is on
6518          */
6519         trans = btrfs_start_transaction(root, 5);
6520         if (IS_ERR(trans))
6521                 return PTR_ERR(trans);
6522
6523         err = btrfs_find_free_ino(root, &objectid);
6524         if (err)
6525                 goto out_unlock;
6526
6527         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6528                                 dentry->d_name.len, btrfs_ino(dir), objectid,
6529                                 mode, &index);
6530         if (IS_ERR(inode)) {
6531                 err = PTR_ERR(inode);
6532                 goto out_unlock;
6533         }
6534
6535         /*
6536          * If the active LSM wants to access the inode during
6537          * d_instantiate it needs these. Smack checks to see
6538          * if the filesystem supports xattrs by looking at the
6539          * ops vector.
6540          */
6541         inode->i_op = &btrfs_special_inode_operations;
6542         init_special_inode(inode, inode->i_mode, rdev);
6543
6544         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6545         if (err)
6546                 goto out_unlock_inode;
6547
6548         err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
6549         if (err)
6550                 goto out_unlock_inode;
6551
6552         btrfs_update_inode(trans, root, inode);
6553         d_instantiate_new(dentry, inode);
6555
6556 out_unlock:
6557         btrfs_end_transaction(trans, root);
6558         btrfs_balance_delayed_items(root);
6559         btrfs_btree_balance_dirty(root);
6560         if (drop_inode) {
6561                 inode_dec_link_count(inode);
6562                 iput(inode);
6563         }
6564         return err;
6565
6566 out_unlock_inode:
6567         drop_inode = 1;
6568         unlock_new_inode(inode);
6569         goto out_unlock;
6570
6571 }
6572
6573 static int btrfs_create(struct inode *dir, struct dentry *dentry,
6574                         umode_t mode, bool excl)
6575 {
6576         struct btrfs_trans_handle *trans;
6577         struct btrfs_root *root = BTRFS_I(dir)->root;
6578         struct inode *inode = NULL;
6579         int drop_inode_on_err = 0;
6580         int err;
6581         u64 objectid;
6582         u64 index = 0;
6583
6584         /*
6585          * 2 for inode item and ref
6586          * 2 for dir items
6587          * 1 for xattr if selinux is on
6588          */
6589         trans = btrfs_start_transaction(root, 5);
6590         if (IS_ERR(trans))
6591                 return PTR_ERR(trans);
6592
6593         err = btrfs_find_free_ino(root, &objectid);
6594         if (err)
6595                 goto out_unlock;
6596
6597         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6598                                 dentry->d_name.len, btrfs_ino(dir), objectid,
6599                                 mode, &index);
6600         if (IS_ERR(inode)) {
6601                 err = PTR_ERR(inode);
6602                 goto out_unlock;
6603         }
6604         drop_inode_on_err = 1;
6605         /*
6606          * If the active LSM wants to access the inode during
6607          * d_instantiate it needs these. Smack checks to see
6608          * if the filesystem supports xattrs by looking at the
6609          * ops vector.
6610          */
6611         inode->i_fop = &btrfs_file_operations;
6612         inode->i_op = &btrfs_file_inode_operations;
6613         inode->i_mapping->a_ops = &btrfs_aops;
6614
6615         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6616         if (err)
6617                 goto out_unlock_inode;
6618
6619         err = btrfs_update_inode(trans, root, inode);
6620         if (err)
6621                 goto out_unlock_inode;
6622
6623         err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
6624         if (err)
6625                 goto out_unlock_inode;
6626
6627         BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
6628         d_instantiate_new(dentry, inode);
6629
6630 out_unlock:
6631         btrfs_end_transaction(trans, root);
6632         if (err && drop_inode_on_err) {
6633                 inode_dec_link_count(inode);
6634                 iput(inode);
6635         }
6636         btrfs_balance_delayed_items(root);
6637         btrfs_btree_balance_dirty(root);
6638         return err;
6639
6640 out_unlock_inode:
6641         unlock_new_inode(inode);
6642         goto out_unlock;
6643
6644 }
6645
6646 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6647                       struct dentry *dentry)
6648 {
6649         struct btrfs_trans_handle *trans = NULL;
6650         struct btrfs_root *root = BTRFS_I(dir)->root;
6651         struct inode *inode = d_inode(old_dentry);
6652         u64 index;
6653         int err;
6654         int drop_inode = 0;
6655
6656         /* do not allow hard links (sys_link) across subvols of the same device */
6657         if (root->objectid != BTRFS_I(inode)->root->objectid)
6658                 return -EXDEV;
6659
6660         if (inode->i_nlink >= BTRFS_LINK_MAX)
6661                 return -EMLINK;
6662
6663         err = btrfs_set_inode_index(dir, &index);
6664         if (err)
6665                 goto fail;
6666
6667         /*
6668          * 2 items for inode and inode ref
6669          * 2 items for dir items
6670          * 1 item for parent inode
6671          */
6672         trans = btrfs_start_transaction(root, 5);
6673         if (IS_ERR(trans)) {
6674                 err = PTR_ERR(trans);
6675                 trans = NULL;
6676                 goto fail;
6677         }
6678
6679         /* There are now several dir indexes for this inode, clear the cached one. */
6680         BTRFS_I(inode)->dir_index = 0ULL;
6681         inc_nlink(inode);
6682         inode_inc_iversion(inode);
6683         inode->i_ctime = current_time(inode);
6684         ihold(inode);
6685         set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6686
6687         err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
6688
6689         if (err) {
6690                 drop_inode = 1;
6691         } else {
6692                 struct dentry *parent = dentry->d_parent;
6693                 err = btrfs_update_inode(trans, root, inode);
6694                 if (err)
6695                         goto fail;
6696                 if (inode->i_nlink == 1) {
6697                         /*
6698                          * If the new hard link count is 1, it's a file created
6699                          * with the open(2) O_TMPFILE flag.
6700                          */
6701                         err = btrfs_orphan_del(trans, inode);
6702                         if (err)
6703                                 goto fail;
6704                 }
6705                 d_instantiate(dentry, inode);
6706                 btrfs_log_new_name(trans, inode, NULL, parent);
6707         }
6708
6709         btrfs_balance_delayed_items(root);
6710 fail:
6711         if (trans)
6712                 btrfs_end_transaction(trans, root);
6713         if (drop_inode) {
6714                 inode_dec_link_count(inode);
6715                 iput(inode);
6716         }
6717         btrfs_btree_balance_dirty(root);
6718         return err;
6719 }
6720
6721 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
6722 {
6723         struct inode *inode = NULL;
6724         struct btrfs_trans_handle *trans;
6725         struct btrfs_root *root = BTRFS_I(dir)->root;
6726         int err = 0;
6727         int drop_on_err = 0;
6728         u64 objectid = 0;
6729         u64 index = 0;
6730
6731         /*
6732          * 2 items for inode and ref
6733          * 2 items for dir items
6734          * 1 for xattr if selinux is on
6735          */
6736         trans = btrfs_start_transaction(root, 5);
6737         if (IS_ERR(trans))
6738                 return PTR_ERR(trans);
6739
6740         err = btrfs_find_free_ino(root, &objectid);
6741         if (err)
6742                 goto out_fail;
6743
6744         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6745                                 dentry->d_name.len, btrfs_ino(dir), objectid,
6746                                 S_IFDIR | mode, &index);
6747         if (IS_ERR(inode)) {
6748                 err = PTR_ERR(inode);
6749                 goto out_fail;
6750         }
6751
6752         drop_on_err = 1;
6753         /* these must be set before we unlock the inode */
6754         inode->i_op = &btrfs_dir_inode_operations;
6755         inode->i_fop = &btrfs_dir_file_operations;
6756
6757         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6758         if (err)
6759                 goto out_fail_inode;
6760
6761         btrfs_i_size_write(inode, 0);
6762         err = btrfs_update_inode(trans, root, inode);
6763         if (err)
6764                 goto out_fail_inode;
6765
6766         err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
6767                              dentry->d_name.len, 0, index);
6768         if (err)
6769                 goto out_fail_inode;
6770
6771         d_instantiate_new(dentry, inode);
6772         drop_on_err = 0;
6773
6774 out_fail:
6775         btrfs_end_transaction(trans, root);
6776         if (drop_on_err) {
6777                 inode_dec_link_count(inode);
6778                 iput(inode);
6779         }
6780         btrfs_balance_delayed_items(root);
6781         btrfs_btree_balance_dirty(root);
6782         return err;
6783
6784 out_fail_inode:
6785         unlock_new_inode(inode);
6786         goto out_fail;
6787 }
6788
6789 /* Find the next extent map after a given one; the caller must hold the tree lock */
6790 static struct extent_map *next_extent_map(struct extent_map *em)
6791 {
6792         struct rb_node *next;
6793
6794         next = rb_next(&em->rb_node);
6795         if (!next)
6796                 return NULL;
6797         return container_of(next, struct extent_map, rb_node);
6798 }
6799
6800 static struct extent_map *prev_extent_map(struct extent_map *em)
6801 {
6802         struct rb_node *prev;
6803
6804         prev = rb_prev(&em->rb_node);
6805         if (!prev)
6806                 return NULL;
6807         return container_of(prev, struct extent_map, rb_node);
6808 }
6809
6810 /* helper for btrfs_get_extent.  Given an existing extent in the tree
6811  * that is the nearest extent to map_start, and a new extent that we
6812  * want to insert, deal with any overlap and insert the best-fitting
6813  * trimmed extent into the tree.
6814  */
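     /*
      * Worked example (illustrative numbers only): if em spans [0, 16K),
      * the nearest existing extent ends at 4K, and the next one starts
      * at 12K, then the window computed below is [4K, 12K): em is
      * trimmed to start 4K / len 8K, and for an uncompressed on-disk
      * extent its block_start is advanced by the same 4K (start_diff).
      */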
6815 static int merge_extent_mapping(struct extent_map_tree *em_tree,
6816                                 struct extent_map *existing,
6817                                 struct extent_map *em,
6818                                 u64 map_start)
6819 {
6820         struct extent_map *prev;
6821         struct extent_map *next;
6822         u64 start;
6823         u64 end;
6824         u64 start_diff;
6825
6826         BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
6827
6828         if (existing->start > map_start) {
6829                 next = existing;
6830                 prev = prev_extent_map(next);
6831         } else {
6832                 prev = existing;
6833                 next = next_extent_map(prev);
6834         }
6835
6836         start = prev ? extent_map_end(prev) : em->start;
6837         start = max_t(u64, start, em->start);
6838         end = next ? next->start : extent_map_end(em);
6839         end = min_t(u64, end, extent_map_end(em));
6840         start_diff = start - em->start;
6841         em->start = start;
6842         em->len = end - start;
6843         if (em->block_start < EXTENT_MAP_LAST_BYTE &&
6844             !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
6845                 em->block_start += start_diff;
6846                 em->block_len -= start_diff;
6847         }
6848         return add_extent_mapping(em_tree, em, 0);
6849 }
6850
6851 static noinline int uncompress_inline(struct btrfs_path *path,
6852                                       struct page *page,
6853                                       size_t pg_offset, u64 extent_offset,
6854                                       struct btrfs_file_extent_item *item)
6855 {
6856         int ret;
6857         struct extent_buffer *leaf = path->nodes[0];
6858         char *tmp;
6859         size_t max_size;
6860         unsigned long inline_size;
6861         unsigned long ptr;
6862         int compress_type;
6863
6864         WARN_ON(pg_offset != 0);
6865         compress_type = btrfs_file_extent_compression(leaf, item);
6866         max_size = btrfs_file_extent_ram_bytes(leaf, item);
6867         inline_size = btrfs_file_extent_inline_item_len(leaf,
6868                                         btrfs_item_nr(path->slots[0]));
6869         tmp = kmalloc(inline_size, GFP_NOFS);
6870         if (!tmp)
6871                 return -ENOMEM;
6872         ptr = btrfs_file_extent_inline_start(item);
6873
6874         read_extent_buffer(leaf, tmp, ptr, inline_size);
6875
6876         max_size = min_t(unsigned long, PAGE_SIZE, max_size);
6877         ret = btrfs_decompress(compress_type, tmp, page,
6878                                extent_offset, inline_size, max_size);
6879
6880         /*
6881          * decompression code contains a memset to fill in any space between the end
6882          * of the uncompressed data and the end of max_size in case the decompressed
6883          * data ends up shorter than ram_bytes.  That doesn't cover the hole between
6884          * the end of an inline extent and the beginning of the next block, so we
6885          * cover that region here.
6886          */
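             /*
              * For example (sizes are illustrative): a 2K inline extent
              * decompressed into a 4K page fills [0, 2K); the memset below
              * then clears the remaining [2K, 4K) so no stale page contents
              * are exposed.
              */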
6887
6888         if (max_size + pg_offset < PAGE_SIZE) {
6889                 char *map = kmap(page);
6890                 memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
6891                 kunmap(page);
6892         }
6893         kfree(tmp);
6894         return ret;
6895 }
6896
6897 /*
6898  * a bit scary, this does extent mapping from logical file offset to the disk.
6899  * the ugly parts come from merging extents from the disk with the in-ram
6900  * representation.  This gets more complex because of the data=ordered code,
6901  * where the in-ram extents might be locked pending data=ordered completion.
6902  *
6903  * This also copies inline extents directly into the page.
6904  */
6905
6906 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
6907                                     size_t pg_offset, u64 start, u64 len,
6908                                     int create)
6909 {
6910         int ret;
6911         int err = 0;
6912         u64 extent_start = 0;
6913         u64 extent_end = 0;
6914         u64 objectid = btrfs_ino(inode);
6915         u32 found_type;
6916         struct btrfs_path *path = NULL;
6917         struct btrfs_root *root = BTRFS_I(inode)->root;
6918         struct btrfs_file_extent_item *item;
6919         struct extent_buffer *leaf;
6920         struct btrfs_key found_key;
6921         struct extent_map *em = NULL;
6922         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
6923         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6924         struct btrfs_trans_handle *trans = NULL;
6925         const bool new_inline = !page || create;
6926
6927 again:
6928         read_lock(&em_tree->lock);
6929         em = lookup_extent_mapping(em_tree, start, len);
6930         if (em)
6931                 em->bdev = root->fs_info->fs_devices->latest_bdev;
6932         read_unlock(&em_tree->lock);
6933
6934         if (em) {
6935                 if (em->start > start || em->start + em->len <= start)
6936                         free_extent_map(em);
6937                 else if (em->block_start == EXTENT_MAP_INLINE && page)
6938                         free_extent_map(em);
6939                 else
6940                         goto out;
6941         }
6942         em = alloc_extent_map();
6943         if (!em) {
6944                 err = -ENOMEM;
6945                 goto out;
6946         }
6947         em->bdev = root->fs_info->fs_devices->latest_bdev;
6948         em->start = EXTENT_MAP_HOLE;
6949         em->orig_start = EXTENT_MAP_HOLE;
6950         em->len = (u64)-1;
6951         em->block_len = (u64)-1;
6952
6953         if (!path) {
6954                 path = btrfs_alloc_path();
6955                 if (!path) {
6956                         err = -ENOMEM;
6957                         goto out;
6958                 }
6959                 /*
6960                  * Chances are we'll be called again, so go ahead and do
6961                  * readahead
6962                  */
6963                 path->reada = READA_FORWARD;
6964         }
6965
6966         ret = btrfs_lookup_file_extent(trans, root, path,
6967                                        objectid, start, trans != NULL);
6968         if (ret < 0) {
6969                 err = ret;
6970                 goto out;
6971         }
6972
6973         if (ret != 0) {
6974                 if (path->slots[0] == 0)
6975                         goto not_found;
6976                 path->slots[0]--;
6977         }
6978
6979         leaf = path->nodes[0];
6980         item = btrfs_item_ptr(leaf, path->slots[0],
6981                               struct btrfs_file_extent_item);
6982         /* are we inside the extent that was found? */
6983         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6984         found_type = found_key.type;
6985         if (found_key.objectid != objectid ||
6986             found_type != BTRFS_EXTENT_DATA_KEY) {
6987                 /*
6988                  * If we back up past the first extent we want to move forward
6989                  * and see if there is an extent in front of us, otherwise we'll
6990                  * say there is a hole for our whole search range which can
6991                  * cause problems.
6992                  */
6993                 extent_end = start;
6994                 goto next;
6995         }
6996
6997         found_type = btrfs_file_extent_type(leaf, item);
6998         extent_start = found_key.offset;
6999         if (found_type == BTRFS_FILE_EXTENT_REG ||
7000             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7001                 /* Only a regular file can have regular/prealloc extents */
7002                 if (!S_ISREG(inode->i_mode)) {
7003                         err = -EUCLEAN;
7004                         btrfs_crit(root->fs_info,
7005                 "regular/prealloc extent found for non-regular inode %llu",
7006                                    btrfs_ino(inode));
7007                         goto out;
7008                 }
7009                 extent_end = extent_start +
7010                        btrfs_file_extent_num_bytes(leaf, item);
7011         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
7012                 size_t size;
7013                 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
7014                 extent_end = ALIGN(extent_start + size, root->sectorsize);
7015         }
7016 next:
7017         if (start >= extent_end) {
7018                 path->slots[0]++;
7019                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
7020                         ret = btrfs_next_leaf(root, path);
7021                         if (ret < 0) {
7022                                 err = ret;
7023                                 goto out;
7024                         }
7025                         if (ret > 0)
7026                                 goto not_found;
7027                         leaf = path->nodes[0];
7028                 }
7029                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7030                 if (found_key.objectid != objectid ||
7031                     found_key.type != BTRFS_EXTENT_DATA_KEY)
7032                         goto not_found;
7033                 if (start + len <= found_key.offset)
7034                         goto not_found;
7035                 if (start > found_key.offset)
7036                         goto next;
7037                 em->start = start;
7038                 em->orig_start = start;
7039                 em->len = found_key.offset - start;
7040                 goto not_found_em;
7041         }
7042
7043         btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
7044
7045         if (found_type == BTRFS_FILE_EXTENT_REG ||
7046             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7047                 goto insert;
7048         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
7049                 unsigned long ptr;
7050                 char *map;
7051                 size_t size;
7052                 size_t extent_offset;
7053                 size_t copy_size;
7054
7055                 if (new_inline)
7056                         goto out;
7057
7058                 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
7059                 extent_offset = page_offset(page) + pg_offset - extent_start;
7060                 copy_size = min_t(u64, PAGE_SIZE - pg_offset,
7061                                   size - extent_offset);
7062                 em->start = extent_start + extent_offset;
7063                 em->len = ALIGN(copy_size, root->sectorsize);
7064                 em->orig_block_len = em->len;
7065                 em->orig_start = em->start;
7066                 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
7067                 if (create == 0 && !PageUptodate(page)) {
7068                         if (btrfs_file_extent_compression(leaf, item) !=
7069                             BTRFS_COMPRESS_NONE) {
7070                                 ret = uncompress_inline(path, page, pg_offset,
7071                                                         extent_offset, item);
7072                                 if (ret) {
7073                                         err = ret;
7074                                         goto out;
7075                                 }
7076                         } else {
7077                                 map = kmap(page);
7078                                 read_extent_buffer(leaf, map + pg_offset, ptr,
7079                                                    copy_size);
7080                                 if (pg_offset + copy_size < PAGE_SIZE) {
7081                                         memset(map + pg_offset + copy_size, 0,
7082                                                PAGE_SIZE - pg_offset -
7083                                                copy_size);
7084                                 }
7085                                 kunmap(page);
7086                         }
7087                         flush_dcache_page(page);
7088                 } else if (create && PageUptodate(page)) {
7089                         BUG();
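                             /*
                              * Never reached: the unconditional BUG() above
                              * fires first.  The write-to-inline-extent code
                              * below appears to be a dead leftover.
                              */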
7090                         if (!trans) {
7091                                 kunmap(page);
7092                                 free_extent_map(em);
7093                                 em = NULL;
7094
7095                                 btrfs_release_path(path);
7096                                 trans = btrfs_join_transaction(root);
7097
7098                                 if (IS_ERR(trans))
7099                                         return ERR_CAST(trans);
7100                                 goto again;
7101                         }
7102                         map = kmap(page);
7103                         write_extent_buffer(leaf, map + pg_offset, ptr,
7104                                             copy_size);
7105                         kunmap(page);
7106                         btrfs_mark_buffer_dirty(leaf);
7107                 }
7108                 set_extent_uptodate(io_tree, em->start,
7109                                     extent_map_end(em) - 1, NULL, GFP_NOFS);
7110                 goto insert;
7111         }
7112 not_found:
7113         em->start = start;
7114         em->orig_start = start;
7115         em->len = len;
7116 not_found_em:
7117         em->block_start = EXTENT_MAP_HOLE;
7118         set_bit(EXTENT_FLAG_VACANCY, &em->flags);
7119 insert:
7120         btrfs_release_path(path);
7121         if (em->start > start || extent_map_end(em) <= start) {
7122                 btrfs_err(root->fs_info,
7123                           "bad extent! em: [%llu %llu] passed [%llu %llu]",
7124                           em->start, em->len, start, len);
7125                 err = -EIO;
7126                 goto out;
7127         }
7128
7129         err = 0;
7130         write_lock(&em_tree->lock);
7131         ret = add_extent_mapping(em_tree, em, 0);
7132         /* it is possible that someone inserted the extent into the tree
7133          * while we had the lock dropped.  It is also possible that
7134          * an overlapping map exists in the tree
7135          */
7136         if (ret == -EEXIST) {
7137                 struct extent_map *existing;
7138
7139                 ret = 0;
7140
7141                 existing = search_extent_mapping(em_tree, start, len);
7142                 /*
7143                  * existing will always be non-NULL, since there must be
7144                  * an extent causing the -EEXIST.
7145                  */
7146                 if (existing->start == em->start &&
7147                     extent_map_end(existing) == extent_map_end(em) &&
7148                     em->block_start == existing->block_start) {
7149                         /*
7150                          * these two extents are the same, it happens
7151                          * with inlines especially
7152                          */
7153                         free_extent_map(em);
7154                         em = existing;
7155                         err = 0;
7156
7157                 } else if (start >= extent_map_end(existing) ||
7158                     start <= existing->start) {
7159                         /*
7160                          * The existing extent map is the nearest one that
7161                          * overlaps the [start, start + len) range.
7162                          */
7163                         err = merge_extent_mapping(em_tree, existing,
7164                                                    em, start);
7165                         free_extent_map(existing);
7166                         if (err) {
7167                                 free_extent_map(em);
7168                                 em = NULL;
7169                         }
7170                 } else {
7171                         free_extent_map(em);
7172                         em = existing;
7173                         err = 0;
7174                 }
7175         }
7176         write_unlock(&em_tree->lock);
7177 out:
7178
7179         trace_btrfs_get_extent(root, em);
7180
7181         btrfs_free_path(path);
7182         if (trans) {
7183                 ret = btrfs_end_transaction(trans, root);
7184                 if (!err)
7185                         err = ret;
7186         }
7187         if (err) {
7188                 free_extent_map(em);
7189                 return ERR_PTR(err);
7190         }
7191         BUG_ON(!em); /* Error is always set */
7192         return em;
7193 }
7194
7195 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
7196                                            size_t pg_offset, u64 start, u64 len,
7197                                            int create)
7198 {
7199         struct extent_map *em;
7200         struct extent_map *hole_em = NULL;
7201         u64 range_start = start;
7202         u64 end;
7203         u64 found;
7204         u64 found_end;
7205         int err = 0;
7206
7207         em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
7208         if (IS_ERR(em))
7209                 return em;
7210         if (em) {
7211                 /*
7212                  * if our em maps to
7213                  * -  a hole or
7214                  * -  a pre-alloc extent,
7215                  * there might actually be delalloc bytes behind it.
7216                  */
7217                 if (em->block_start != EXTENT_MAP_HOLE &&
7218                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7219                         return em;
7220                 else
7221                         hole_em = em;
7222         }
7223
7224         /* check to see if we've wrapped (len == -1 or similar) */
7225         end = start + len;
7226         if (end < start)
7227                 end = (u64)-1;
7228         else
7229                 end -= 1;
7230
7231         em = NULL;
7232
7233         /* ok, we didn't find anything, let's look for delalloc */
7234         found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
7235                                  end, len, EXTENT_DELALLOC, 1);
7236         found_end = range_start + found;
7237         if (found_end < range_start)
7238                 found_end = (u64)-1;
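             /*
              * At this point count_range_bits() has moved range_start up to
              * the first delalloc byte it found (if any), and 'found' holds
              * the number of contiguous delalloc bytes from there.
              */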
7239
7240         /*
7241          * we didn't find anything useful, return
7242          * the original results from get_extent()
7243          */
7244         if (range_start > end || found_end <= start) {
7245                 em = hole_em;
7246                 hole_em = NULL;
7247                 goto out;
7248         }
7249
7250         /* adjust the range_start to make sure it doesn't
7251          * go backwards from the start they passed in
7252          */
7253         range_start = max(start, range_start);
7254         found = found_end - range_start;
7255
7256         if (found > 0) {
7257                 u64 hole_start = start;
7258                 u64 hole_len = len;
7259
7260                 em = alloc_extent_map();
7261                 if (!em) {
7262                         err = -ENOMEM;
7263                         goto out;
7264                 }
7265                 /*
7266                  * when btrfs_get_extent can't find anything it
7267                  * returns one huge hole
7268                  *
7269                  * make sure what it found really fits our range, and
7270                  * adjust to make sure it is based on the start from
7271                  * the caller
7272                  */
7273                 if (hole_em) {
7274                         u64 calc_end = extent_map_end(hole_em);
7275
7276                         if (calc_end <= start || (hole_em->start > end)) {
7277                                 free_extent_map(hole_em);
7278                                 hole_em = NULL;
7279                         } else {
7280                                 hole_start = max(hole_em->start, start);
7281                                 hole_len = calc_end - hole_start;
7282                         }
7283                 }
7284                 em->bdev = NULL;
7285                 if (hole_em && range_start > hole_start) {
7286                         /* our hole starts before our delalloc, so we
7287                          * have to return just the parts of the hole
7288                          * that go until the delalloc starts
7289                          */
7290                         em->len = min(hole_len,
7291                                       range_start - hole_start);
7292                         em->start = hole_start;
7293                         em->orig_start = hole_start;
7294                         /*
7295                          * don't adjust block start at all,
7296                          * it is fixed at EXTENT_MAP_HOLE
7297                          */
7298                         em->block_start = hole_em->block_start;
7299                         em->block_len = hole_len;
7300                         if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
7301                                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7302                 } else {
7303                         em->start = range_start;
7304                         em->len = found;
7305                         em->orig_start = range_start;
7306                         em->block_start = EXTENT_MAP_DELALLOC;
7307                         em->block_len = found;
7308                 }
7309         } else if (hole_em) {
7310                 return hole_em;
7311         }
7312 out:
7313
7314         free_extent_map(hole_em);
7315         if (err) {
7316                 free_extent_map(em);
7317                 return ERR_PTR(err);
7318         }
7319         return em;
7320 }
7321
7322 static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
7323                                                   const u64 start,
7324                                                   const u64 len,
7325                                                   const u64 orig_start,
7326                                                   const u64 block_start,
7327                                                   const u64 block_len,
7328                                                   const u64 orig_block_len,
7329                                                   const u64 ram_bytes,
7330                                                   const int type)
7331 {
7332         struct extent_map *em = NULL;
7333         int ret;
7334
7335         if (type != BTRFS_ORDERED_NOCOW) {
7336                 em = create_pinned_em(inode, start, len, orig_start,
7337                                       block_start, block_len, orig_block_len,
7338                                       ram_bytes, type);
7339                 if (IS_ERR(em))
7340                         goto out;
7341         }
7342         ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
7343                                            len, block_len, type);
7344         if (ret) {
7345                 if (em) {
7346                         free_extent_map(em);
7347                         btrfs_drop_extent_cache(inode, start,
7348                                                 start + len - 1, 0);
7349                 }
7350                 em = ERR_PTR(ret);
7351         }
7352  out:
7353
7354         return em;
7355 }
7356
7357 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
7358                                                   u64 start, u64 len)
7359 {
7360         struct btrfs_root *root = BTRFS_I(inode)->root;
7361         struct extent_map *em;
7362         struct btrfs_key ins;
7363         u64 alloc_hint;
7364         int ret;
7365
7366         alloc_hint = get_extent_allocation_hint(inode, start, len);
7367         ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0,
7368                                    alloc_hint, &ins, 1, 1);
7369         if (ret)
7370                 return ERR_PTR(ret);
7371
7372         em = btrfs_create_dio_extent(inode, start, ins.offset, start,
7373                                      ins.objectid, ins.offset, ins.offset,
7374                                      ins.offset, 0);
7375         btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
7376         if (IS_ERR(em))
7377                 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
7378
7379         return em;
7380 }
7381
7382 /*
7383  * returns 1 when the nocow is safe, < 0 on error, 0 if the
7384  * block must be cow'd
7385  */
7386 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7387                               u64 *orig_start, u64 *orig_block_len,
7388                               u64 *ram_bytes)
7389 {
7390         struct btrfs_trans_handle *trans;
7391         struct btrfs_path *path;
7392         int ret;
7393         struct extent_buffer *leaf;
7394         struct btrfs_root *root = BTRFS_I(inode)->root;
7395         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7396         struct btrfs_file_extent_item *fi;
7397         struct btrfs_key key;
7398         u64 disk_bytenr;
7399         u64 backref_offset;
7400         u64 extent_end;
7401         u64 num_bytes;
7402         int slot;
7403         int found_type;
7404         bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
7405
7406         path = btrfs_alloc_path();
7407         if (!path)
7408                 return -ENOMEM;
7409
7410         ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
7411                                        offset, 0);
7412         if (ret < 0)
7413                 goto out;
7414
7415         slot = path->slots[0];
7416         if (ret == 1) {
7417                 if (slot == 0) {
7418                         /* can't find the item, must cow */
7419                         ret = 0;
7420                         goto out;
7421                 }
7422                 slot--;
7423         }
7424         ret = 0;
7425         leaf = path->nodes[0];
7426         btrfs_item_key_to_cpu(leaf, &key, slot);
7427         if (key.objectid != btrfs_ino(inode) ||
7428             key.type != BTRFS_EXTENT_DATA_KEY) {
7429                 /* not our file or wrong item type, must cow */
7430                 goto out;
7431         }
7432
7433         if (key.offset > offset) {
7434                 /* Wrong offset, must cow */
7435                 goto out;
7436         }
7437
7438         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
7439         found_type = btrfs_file_extent_type(leaf, fi);
7440         if (found_type != BTRFS_FILE_EXTENT_REG &&
7441             found_type != BTRFS_FILE_EXTENT_PREALLOC) {
7442                 /* not a regular extent, must cow */
7443                 goto out;
7444         }
7445
7446         if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
7447                 goto out;
7448
7449         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
7450         if (extent_end <= offset)
7451                 goto out;
7452
7453         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7454         if (disk_bytenr == 0)
7455                 goto out;
7456
7457         if (btrfs_file_extent_compression(leaf, fi) ||
7458             btrfs_file_extent_encryption(leaf, fi) ||
7459             btrfs_file_extent_other_encoding(leaf, fi))
7460                 goto out;
7461
7462         backref_offset = btrfs_file_extent_offset(leaf, fi);
7463
7464         if (orig_start) {
7465                 *orig_start = key.offset - backref_offset;
7466                 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
7467                 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
7468         }
7469
7470         if (btrfs_extent_readonly(root, disk_bytenr))
7471                 goto out;
7472
7473         num_bytes = min(offset + *len, extent_end) - offset;
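             /*
              * Illustrative numbers: if the extent covers [0, 1M) and the
              * write is [768K, 768K + 512K), only 256K of the write lies
              * inside this extent, so num_bytes (and eventually *len) is
              * trimmed to 256K.
              */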
7474         if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7475                 u64 range_end;
7476
7477                 range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
7478                 ret = test_range_bit(io_tree, offset, range_end,
7479                                      EXTENT_DELALLOC, 0, NULL);
7480                 if (ret) {
7481                         ret = -EAGAIN;
7482                         goto out;
7483                 }
7484         }
7485
7486         btrfs_release_path(path);
7487
7488         /*
7489          * look for other files referencing this extent, if we
7490          * find any we must cow
7491          */
7492         trans = btrfs_join_transaction(root);
7493         if (IS_ERR(trans)) {
7494                 ret = 0;
7495                 goto out;
7496         }
7497
7498         ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
7499                                     key.offset - backref_offset, disk_bytenr);
7500         btrfs_end_transaction(trans, root);
7501         if (ret) {
7502                 ret = 0;
7503                 goto out;
7504         }
7505
7506         /*
7507          * adjust disk_bytenr and num_bytes to cover just the bytes
7508          * in this extent we are about to write.  If there
7509          * are any csums in that range we have to cow in order
7510          * to keep the csums correct
7511          */
7512         disk_bytenr += backref_offset;
7513         disk_bytenr += offset - key.offset;
7514         if (csum_exist_in_range(root, disk_bytenr, num_bytes))
7515                 goto out;
7516         /*
7517          * all of the above have passed, it is safe to overwrite this extent
7518          * without cow
7519          */
7520         *len = num_bytes;
7521         ret = 1;
7522 out:
7523         btrfs_free_path(path);
7524         return ret;
7525 }
7526
7527 bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
7528 {
7529         struct radix_tree_root *root = &inode->i_mapping->page_tree;
7530         int found = false;
7531         void **pagep = NULL;
7532         struct page *page = NULL;
7533         unsigned long start_idx;
7534         unsigned long end_idx;
7535
7536         start_idx = start >> PAGE_SHIFT;
7537
7538         /*
7539          * end is the last byte in the last page.  end == start is legal
7540          */
7541         end_idx = end >> PAGE_SHIFT;
7542
7543         rcu_read_lock();
7544
7545         /* Most of the code in this while loop is lifted from
7546          * find_get_page.  It's been modified to begin searching from a given
7547          * index and return just the first page found in that range.  If the
7548          * found idx is less than or equal to the end idx then we know that
7549          * a page exists.  If no pages are found or if those pages are
7550          * outside of the range then we're fine (yay!) */
7551         while (page == NULL &&
7552                radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
7553                 page = radix_tree_deref_slot(pagep);
7554                 if (unlikely(!page))
7555                         break;
7556
7557                 if (radix_tree_exception(page)) {
7558                         if (radix_tree_deref_retry(page)) {
7559                                 page = NULL;
7560                                 continue;
7561                         }
7562                         /*
7563                          * Otherwise, shmem/tmpfs must be storing a swap entry
7564                          * here as an exceptional entry: so return it without
7565                          * attempting to raise page count.
7566                          */
7567                         page = NULL;
7568                         break; /* TODO: Is this relevant for this use case? */
7569                 }
7570
7571                 if (!page_cache_get_speculative(page)) {
7572                         page = NULL;
7573                         continue;
7574                 }
7575
7576                 /*
7577                  * Has the page moved?
7578                  * This is part of the lockless pagecache protocol. See
7579                  * include/linux/pagemap.h for details.
7580                  */
7581                 if (unlikely(page != *pagep)) {
7582                         put_page(page);
7583                         page = NULL;
7584                 }
7585         }
7586
7587         if (page) {
7588                 if (page->index <= end_idx)
7589                         found = true;
7590                 put_page(page);
7591         }
7592
7593         rcu_read_unlock();
7594         return found;
7595 }
7596
7597 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7598                               struct extent_state **cached_state, int writing)
7599 {
7600         struct btrfs_ordered_extent *ordered;
7601         int ret = 0;
7602
7603         while (1) {
7604                 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7605                                  cached_state);
7606                 /*
7607                  * We're concerned with the entire range that we're going to be
7608                  * doing DIO to, so we need to make sure there's no ordered
7609                  * extents in this range.
7610                  */
7611                 ordered = btrfs_lookup_ordered_range(inode, lockstart,
7612                                                      lockend - lockstart + 1);
7613
7614                 /*
7615                  * We need to make sure there are no buffered pages in this
7616                  * range either, we could have raced between the invalidate in
7617                  * generic_file_direct_write and locking the extent.  The
7618                  * invalidate needs to happen so that reads after a write do not
7619                  * get stale data.
7620                  */
7621                 if (!ordered &&
7622                     (!writing ||
7623                      !btrfs_page_exists_in_range(inode, lockstart, lockend)))
7624                         break;
7625
7626                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7627                                      cached_state, GFP_NOFS);
7628
7629                 if (ordered) {
7630                         /*
7631                          * If we are doing a DIO read and the ordered extent we
7632                          * found is for a buffered write, we can not wait for it
7633                          * to complete and retry, because if we do so we can
7634                          * deadlock with concurrent buffered writes on page
7635                          * locks. This happens only if our DIO read covers more
7636                          * than one extent map, if at this point it has already
7637                          * created an ordered extent for a previous extent map
7638                          * and locked its range in the inode's io tree, and a
7639                          * concurrent write against that previous extent map's
7640                          * range and this range started (we unlock the ranges
7641                          * in the io tree only when the bios complete and
7642                          * buffered writes always lock pages before attempting
7643                          * to lock range in the io tree).
7644                          */
7645                         if (writing ||
7646                             test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7647                                 btrfs_start_ordered_extent(inode, ordered, 1);
7648                         else
7649                                 ret = -ENOTBLK;
7650                         btrfs_put_ordered_extent(ordered);
7651                 } else {
7652                         /*
7653                          * We could trigger writeback for this range (and wait
7654                          * for it to complete) and then invalidate the pages for
7655                          * this range (through invalidate_inode_pages2_range()),
7656                          * but that can lead us to a deadlock with a concurrent
7657                          * call to readpages() (a buffered read or a defrag call
7658                          * triggered a readahead) on a page lock due to an
7659                          * ordered dio extent we created before but did not have
7660                          * yet a corresponding bio submitted (whence it cannot
7661                          * complete), which makes readpages() wait for that
7662                          * ordered extent to complete while holding a lock on
7663                          * that page.
7664                          */
7665                         ret = -ENOTBLK;
7666                 }
7667
7668                 if (ret)
7669                         break;
7670
7671                 cond_resched();
7672         }
7673
7674         return ret;
7675 }
7676
7677 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
7678                                            u64 len, u64 orig_start,
7679                                            u64 block_start, u64 block_len,
7680                                            u64 orig_block_len, u64 ram_bytes,
7681                                            int type)
7682 {
7683         struct extent_map_tree *em_tree;
7684         struct extent_map *em;
7685         struct btrfs_root *root = BTRFS_I(inode)->root;
7686         int ret;
7687
7688         em_tree = &BTRFS_I(inode)->extent_tree;
7689         em = alloc_extent_map();
7690         if (!em)
7691                 return ERR_PTR(-ENOMEM);
7692
7693         em->start = start;
7694         em->orig_start = orig_start;
7695         em->mod_start = start;
7696         em->mod_len = len;
7697         em->len = len;
7698         em->block_len = block_len;
7699         em->block_start = block_start;
7700         em->bdev = root->fs_info->fs_devices->latest_bdev;
7701         em->orig_block_len = orig_block_len;
7702         em->ram_bytes = ram_bytes;
7703         em->generation = -1;
7704         set_bit(EXTENT_FLAG_PINNED, &em->flags);
7705         if (type == BTRFS_ORDERED_PREALLOC)
7706                 set_bit(EXTENT_FLAG_FILLING, &em->flags);
7707
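             /*
              * add_extent_mapping() can keep returning -EEXIST if a racing
              * task re-inserts an overlapping extent map after our
              * btrfs_drop_extent_cache() call, so retry until the pinned
              * em actually makes it into the tree.
              */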
7708         do {
7709                 btrfs_drop_extent_cache(inode, em->start,
7710                                 em->start + em->len - 1, 0);
7711                 write_lock(&em_tree->lock);
7712                 ret = add_extent_mapping(em_tree, em, 1);
7713                 write_unlock(&em_tree->lock);
7714         } while (ret == -EEXIST);
7715
7716         if (ret) {
7717                 free_extent_map(em);
7718                 return ERR_PTR(ret);
7719         }
7720
7721         return em;
7722 }
7723
7724 static void adjust_dio_outstanding_extents(struct inode *inode,
7725                                            struct btrfs_dio_data *dio_data,
7726                                            const u64 len)
7727 {
7728         unsigned num_extents;
7729
7730         num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
7731                                            BTRFS_MAX_EXTENT_SIZE);
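             /*
              * e.g. (assuming BTRFS_MAX_EXTENT_SIZE is 128M) a 256M + 4K
              * dio write counts as 3 extents here, since reservations are
              * accounted in BTRFS_MAX_EXTENT_SIZE sized pieces.
              */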
7732         /*
7733          * If we have an outstanding_extents count still set then we're
7734          * within our reservation, otherwise we need to adjust our inode
7735          * counter appropriately.
7736          */
7737         if (dio_data->outstanding_extents >= num_extents) {
7738                 dio_data->outstanding_extents -= num_extents;
7739         } else {
7740                 /*
7741                  * If the dio write length has been split because there was
7742                  * no large enough contiguous space, we need to compensate
7743                  * our inode counter appropriately.
7744                  */
7745                 u64 num_needed = num_extents - dio_data->outstanding_extents;
7746
7747                 spin_lock(&BTRFS_I(inode)->lock);
7748                 BTRFS_I(inode)->outstanding_extents += num_needed;
7749                 spin_unlock(&BTRFS_I(inode)->lock);
7750         }
7751 }
7752
7753 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7754                                    struct buffer_head *bh_result, int create)
7755 {
7756         struct extent_map *em;
7757         struct btrfs_root *root = BTRFS_I(inode)->root;
7758         struct extent_state *cached_state = NULL;
7759         struct btrfs_dio_data *dio_data = NULL;
7760         u64 start = iblock << inode->i_blkbits;
7761         u64 lockstart, lockend;
7762         u64 len = bh_result->b_size;
7763         int unlock_bits = EXTENT_LOCKED;
7764         int ret = 0;
7765
7766         if (create)
7767                 unlock_bits |= EXTENT_DIRTY;
7768         else
7769                 len = min_t(u64, len, root->sectorsize);
7770
7771         lockstart = start;
7772         lockend = start + len - 1;
7773
7774         if (current->journal_info) {
7775                 /*
7776                  * Need to pull our outstanding extents and set journal_info
7777                  * to NULL so that anything that needs to check if there's a
7778                  * transaction doesn't get confused.
7779                  */
7780                 dio_data = current->journal_info;
7781                 current->journal_info = NULL;
7782         }
7783
7784         /*
7785          * If this errors out it's because we couldn't invalidate pagecache for
7786          * this range and we need to fall back to buffered.
7787          */
7788         if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
7789                                create)) {
7790                 ret = -ENOTBLK;
7791                 goto err;
7792         }
7793
7794         em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
7795         if (IS_ERR(em)) {
7796                 ret = PTR_ERR(em);
7797                 goto unlock_err;
7798         }
7799
7800         /*
7801          * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
7802          * io.  INLINE is special, and we could probably kludge it in here, but
7803          * it's still buffered so for safety lets just fall back to the generic
7804          * buffered path.
7805          *
7806          * For COMPRESSED we _have_ to read the entire extent in so we can
7807          * decompress it, so there will be buffering required no matter what we
7808          * do, so go ahead and fallback to buffered.
7809          *
7810          * We return -ENOTBLK because that's what makes DIO go ahead and go back
7811          * to buffered IO.  Don't blame me, this is the price we pay for using
7812          * the generic code.
7813          */
7814         if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7815             em->block_start == EXTENT_MAP_INLINE) {
7816                 free_extent_map(em);
7817                 ret = -ENOTBLK;
7818                 goto unlock_err;
7819         }
7820
7821         /* Just a good old fashioned hole, return */
7822         if (!create && (em->block_start == EXTENT_MAP_HOLE ||
7823                         test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
7824                 free_extent_map(em);
7825                 goto unlock_err;
7826         }
7827
7828         /*
7829          * We don't allocate a new extent in the following cases
7830          *
7831          * 1) The inode is marked as NODATACOW.  In this case we'll just use the
7832          * existing extent.
7833          * 2) The extent is marked as PREALLOC.  We're good to go here and can
7834          * just use the extent.
7835          *
7836          */
7837         if (!create) {
7838                 len = min(len, em->len - (start - em->start));
7839                 lockstart = start + len;
7840                 goto unlock;
7841         }
7842
7843         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7844             ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7845              em->block_start != EXTENT_MAP_HOLE)) {
7846                 int type;
7847                 u64 block_start, orig_start, orig_block_len, ram_bytes;
7848
7849                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7850                         type = BTRFS_ORDERED_PREALLOC;
7851                 else
7852                         type = BTRFS_ORDERED_NOCOW;
7853                 len = min(len, em->len - (start - em->start));
7854                 block_start = em->block_start + (start - em->start);
7855
7856                 if (can_nocow_extent(inode, start, &len, &orig_start,
7857                                      &orig_block_len, &ram_bytes) == 1 &&
7858                     btrfs_inc_nocow_writers(root->fs_info, block_start)) {
7859                         struct extent_map *em2;
7860
7861                         em2 = btrfs_create_dio_extent(inode, start, len,
7862                                                       orig_start, block_start,
7863                                                       len, orig_block_len,
7864                                                       ram_bytes, type);
7865                         btrfs_dec_nocow_writers(root->fs_info, block_start);
7866                         if (type == BTRFS_ORDERED_PREALLOC) {
7867                                 free_extent_map(em);
7868                                 em = em2;
7869                         }
7870                         if (em2 && IS_ERR(em2)) {
7871                                 ret = PTR_ERR(em2);
7872                                 goto unlock_err;
7873                         }
7874                         /*
7875                          * For inode marked NODATACOW or extent marked PREALLOC,
7876                          * use the existing or preallocated extent, so we do not
7877                          * need to adjust btrfs_space_info's bytes_may_use.
7878                          */
7879                         btrfs_free_reserved_data_space_noquota(inode,
7880                                         start, len);
7881                         goto unlock;
7882                 }
7883         }
7884
7885         /*
7886          * this will cow the extent, reset the len in case we changed
7887          * it above
7888          */
7889         len = bh_result->b_size;
7890         free_extent_map(em);
7891         em = btrfs_new_extent_direct(inode, start, len);
7892         if (IS_ERR(em)) {
7893                 ret = PTR_ERR(em);
7894                 goto unlock_err;
7895         }
7896         len = min(len, em->len - (start - em->start));
7897 unlock:
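             /*
              * Hand the mapping back in the units the generic dio code
              * expects: b_blocknr is in 1 << i_blkbits sized blocks,
              * b_size is in bytes.
              */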
7898         bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7899                 inode->i_blkbits;
7900         bh_result->b_size = len;
7901         bh_result->b_bdev = em->bdev;
7902         set_buffer_mapped(bh_result);
7903         if (create) {
7904                 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7905                         set_buffer_new(bh_result);
7906
7907                 /*
7908                  * Need to update the i_size under the extent lock so buffered
7909                  * readers will get the updated i_size when we unlock.
7910                  */
7911                 if (start + len > i_size_read(inode))
7912                         i_size_write(inode, start + len);
7913
7914                 adjust_dio_outstanding_extents(inode, dio_data, len);
7915                 WARN_ON(dio_data->reserve < len);
7916                 dio_data->reserve -= len;
7917                 dio_data->unsubmitted_oe_range_end = start + len;
7918                 current->journal_info = dio_data;
7919         }
7920
7921         /*
7922          * In the case of write we need to clear and unlock the entire range,
7923          * in the case of read we need to unlock only the end area that we
7924          * aren't using if there is any left over space.
7925          */
7926         if (lockstart < lockend) {
7927                 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7928                                  lockend, unlock_bits, 1, 0,
7929                                  &cached_state, GFP_NOFS);
7930         } else {
7931                 free_extent_state(cached_state);
7932         }
7933
7934         free_extent_map(em);
7935
7936         return 0;
7937
7938 unlock_err:
7939         clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7940                          unlock_bits, 1, 0, &cached_state, GFP_NOFS);
7941 err:
7942         if (dio_data)
7943                 current->journal_info = dio_data;
7944         /*
7945          * Compensate the delalloc release we do in btrfs_direct_IO() when we
7946          * write less data than expected, so that we don't underflow our inode's
7947          * outstanding extents counter.
7948          */
7949         if (create && dio_data)
7950                 adjust_dio_outstanding_extents(inode, dio_data, len);
7951
7952         return ret;
7953 }
7954
7955 static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
7956                                         int mirror_num)
7957 {
7958         struct btrfs_root *root = BTRFS_I(inode)->root;
7959         int ret;
7960
7961         BUG_ON(bio_op(bio) == REQ_OP_WRITE);
7962
7963         bio_get(bio);
7964
7965         ret = btrfs_bio_wq_end_io(root->fs_info, bio,
7966                                   BTRFS_WQ_ENDIO_DIO_REPAIR);
7967         if (ret)
7968                 goto err;
7969
7970         ret = btrfs_map_bio(root, bio, mirror_num, 0);
7971 err:
7972         bio_put(bio);
7973         return ret;
7974 }
7975
7976 static int btrfs_check_dio_repairable(struct inode *inode,
7977                                       struct bio *failed_bio,
7978                                       struct io_failure_record *failrec,
7979                                       int failed_mirror)
7980 {
7981         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7982         int num_copies;
7983
7984         num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
7985         if (num_copies == 1) {
7986                 /*
7987                  * we only have a single copy of the data, so don't bother with
7988                  * all the retry and error correction code that follows. no
7989                  * matter what the error is, it is very likely to persist.
7990                  */
7991                 btrfs_debug(fs_info,
7992                         "Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
7993                         num_copies, failrec->this_mirror, failed_mirror);
7994                 return 0;
7995         }
7996
7997         failrec->failed_mirror = failed_mirror;
7998         failrec->this_mirror++;
7999         if (failrec->this_mirror == failed_mirror)
8000                 failrec->this_mirror++;
8001
8002         if (failrec->this_mirror > num_copies) {
8003                 btrfs_debug(fs_info,
8004                         "Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
8005                         num_copies, failrec->this_mirror, failed_mirror);
8006                 return 0;
8007         }
8008
8009         return 1;
8010 }
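/*
 * Walk-through of the mirror rotation above, with assumed values: on a
 * two-copy profile num_copies == 2.  Starting from this_mirror == 0 with
 * failed_mirror == 1, this_mirror is bumped to 1, collides with
 * failed_mirror and is bumped again to 2, so the repair read goes to
 * mirror 2.  Another failure would push this_mirror to 3 > num_copies and
 * the repair is given up.
 */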
8011
8012 static int dio_read_error(struct inode *inode, struct bio *failed_bio,
8013                         struct page *page, unsigned int pgoff,
8014                         u64 start, u64 end, int failed_mirror,
8015                         bio_end_io_t *repair_endio, void *repair_arg)
8016 {
8017         struct io_failure_record *failrec;
8018         struct bio *bio;
8019         int isector;
8020         int read_mode;
8021         int ret;
8022
8023         BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
8024
8025         ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
8026         if (ret)
8027                 return ret;
8028
8029         ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
8030                                          failed_mirror);
8031         if (!ret) {
8032                 free_io_failure(inode, failrec);
8033                 return -EIO;
8034         }
8035
8036         if ((failed_bio->bi_vcnt > 1)
8037                 || (failed_bio->bi_io_vec->bv_len
8038                         > BTRFS_I(inode)->root->sectorsize))
8039                 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
8040         else
8041                 read_mode = READ_SYNC;
8042
8043         isector = start - btrfs_io_bio(failed_bio)->logical;
8044         isector >>= inode->i_sb->s_blocksize_bits;
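        /*
         * Worked example of the arithmetic above (values assumed): with a
         * 4KiB block size (s_blocksize_bits == 12), a failed bio whose
         * logical start is 1MiB and a bad block at start == 1MiB + 8KiB
         * give isector = 8192 >> 12 = 2, i.e. the repair read targets the
         * third block of the original bio.
         */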
8045         bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
8046                                 pgoff, isector, repair_endio, repair_arg);
8047         if (!bio) {
8048                 free_io_failure(inode, failrec);
8049                 return -EIO;
8050         }
8051         bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
8052
8053         btrfs_debug(BTRFS_I(inode)->root->fs_info,
8054                     "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
8055                     read_mode, failrec->this_mirror, failrec->in_validation);
8056
8057         ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
8058         if (ret) {
8059                 free_io_failure(inode, failrec);
8060                 bio_put(bio);
8061         }
8062
8063         return ret;
8064 }
8065
8066 struct btrfs_retry_complete {
8067         struct completion done;
8068         struct inode *inode;
8069         u64 start;
8070         int uptodate;
8071 };
8072
8073 static void btrfs_retry_endio_nocsum(struct bio *bio)
8074 {
8075         struct btrfs_retry_complete *done = bio->bi_private;
8076         struct inode *inode;
8077         struct bio_vec *bvec;
8078         int i;
8079
8080         if (bio->bi_error)
8081                 goto end;
8082
8083         ASSERT(bio->bi_vcnt == 1);
8084         inode = bio->bi_io_vec->bv_page->mapping->host;
8085         ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
8086
8087         done->uptodate = 1;
8088         bio_for_each_segment_all(bvec, bio, i)
8089                 clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
8090 end:
8091         complete(&done->done);
8092         bio_put(bio);
8093 }
8094
8095 static int __btrfs_correct_data_nocsum(struct inode *inode,
8096                                        struct btrfs_io_bio *io_bio)
8097 {
8098         struct btrfs_fs_info *fs_info;
8099         struct bio_vec *bvec;
8100         struct btrfs_retry_complete done;
8101         u64 start;
8102         unsigned int pgoff;
8103         u32 sectorsize;
8104         int nr_sectors;
8105         int i;
8106         int ret;
8107
8108         fs_info = BTRFS_I(inode)->root->fs_info;
8109         sectorsize = BTRFS_I(inode)->root->sectorsize;
8110
8111         start = io_bio->logical;
8112         done.inode = inode;
8113
8114         bio_for_each_segment_all(bvec, &io_bio->bio, i) {
8115                 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
8116                 pgoff = bvec->bv_offset;
8117
8118 next_block_or_try_again:
8119                 done.uptodate = 0;
8120                 done.start = start;
8121                 init_completion(&done.done);
8122
8123                 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
8124                                 pgoff, start, start + sectorsize - 1,
8125                                 io_bio->mirror_num,
8126                                 btrfs_retry_endio_nocsum, &done);
8127                 if (ret)
8128                         return ret;
8129
8130                 wait_for_completion(&done.done);
8131
8132                 if (!done.uptodate) {
8133                         /* We might have another mirror, so try again */
8134                         goto next_block_or_try_again;
8135                 }
8136
8137                 start += sectorsize;
8138
8139                 nr_sectors--;
8140                 if (nr_sectors) {
8141                         pgoff += sectorsize;
8142                         ASSERT(pgoff < PAGE_SIZE);
8143                         goto next_block_or_try_again;
8144                 }
8145         }
8146
8147         return 0;
8148 }
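/*
 * The helper above handles nodatasum reads: for every block (assuming a
 * 4KiB sectorsize) it submits a synchronous single-sector repair read and
 * waits for the completion.  A block that is still not uptodate is retried
 * against the next mirror; dio_read_error() returns -EIO once the mirrors
 * are exhausted, aborting the repair.  Only a successful read advances to
 * the next block.
 */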
8149
8150 static void btrfs_retry_endio(struct bio *bio)
8151 {
8152         struct btrfs_retry_complete *done = bio->bi_private;
8153         struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8154         struct inode *inode;
8155         struct bio_vec *bvec;
8156         u64 start;
8157         int uptodate;
8158         int ret;
8159         int i;
8160
8161         if (bio->bi_error)
8162                 goto end;
8163
8164         uptodate = 1;
8165
8166         start = done->start;
8167
8168         ASSERT(bio->bi_vcnt == 1);
8169         inode = bio->bi_io_vec->bv_page->mapping->host;
8170         ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
8171
8172         bio_for_each_segment_all(bvec, bio, i) {
8173                 ret = __readpage_endio_check(done->inode, io_bio, i,
8174                                         bvec->bv_page, bvec->bv_offset,
8175                                         done->start, bvec->bv_len);
8176                 if (!ret)
8177                         clean_io_failure(done->inode, done->start,
8178                                         bvec->bv_page, bvec->bv_offset);
8179                 else
8180                         uptodate = 0;
8181         }
8182
8183         done->uptodate = uptodate;
8184 end:
8185         complete(&done->done);
8186         bio_put(bio);
8187 }
8188
8189 static int __btrfs_subio_endio_read(struct inode *inode,
8190                                     struct btrfs_io_bio *io_bio, int err)
8191 {
8192         struct btrfs_fs_info *fs_info;
8193         struct bio_vec *bvec;
8194         struct btrfs_retry_complete done;
8195         u64 start;
8196         u64 offset = 0;
8197         u32 sectorsize;
8198         int nr_sectors;
8199         unsigned int pgoff;
8200         int csum_pos;
8201         int i;
8202         int ret;
8203
8204         fs_info = BTRFS_I(inode)->root->fs_info;
8205         sectorsize = BTRFS_I(inode)->root->sectorsize;
8206
8207         err = 0;
8208         start = io_bio->logical;
8209         done.inode = inode;
8210
8211         bio_for_each_segment_all(bvec, &io_bio->bio, i) {
8212                 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
8213
8214                 pgoff = bvec->bv_offset;
8215 next_block:
8216                 csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
8217                 ret = __readpage_endio_check(inode, io_bio, csum_pos,
8218                                         bvec->bv_page, pgoff, start,
8219                                         sectorsize);
8220                 if (likely(!ret))
8221                         goto next;
8222 try_again:
8223                 done.uptodate = 0;
8224                 done.start = start;
8225                 init_completion(&done.done);
8226
8227                 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
8228                                 pgoff, start, start + sectorsize - 1,
8229                                 io_bio->mirror_num,
8230                                 btrfs_retry_endio, &done);
8231                 if (ret) {
8232                         err = ret;
8233                         goto next;
8234                 }
8235
8236                 wait_for_completion(&done.done);
8237
8238                 if (!done.uptodate) {
8239                         /* We might have another mirror, so try again */
8240                         goto try_again;
8241                 }
8242 next:
8243                 offset += sectorsize;
8244                 start += sectorsize;
8245
8246                 ASSERT(nr_sectors);
8247
8248                 nr_sectors--;
8249                 if (nr_sectors) {
8250                         pgoff += sectorsize;
8251                         ASSERT(pgoff < PAGE_SIZE);
8252                         goto next_block;
8253                 }
8254         }
8255
8256         return err;
8257 }
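/*
 * In the loop above csum_pos is the block index of the current sector
 * within the whole bio, so with an assumed 4KiB sectorsize the sector at
 * byte offset 20480 into the bio is checked against csum slot
 * 20480 / 4096 == 5.  Sectors that fail verification are retried mirror
 * by mirror through dio_read_error(); only sectors that never verify
 * leave err set.
 */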
8258
8259 static int btrfs_subio_endio_read(struct inode *inode,
8260                                   struct btrfs_io_bio *io_bio, int err)
8261 {
8262         bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
8263
8264         if (skip_csum) {
8265                 if (unlikely(err))
8266                         return __btrfs_correct_data_nocsum(inode, io_bio);
8267                 else
8268                         return 0;
8269         } else {
8270                 return __btrfs_subio_endio_read(inode, io_bio, err);
8271         }
8272 }
8273
8274 static void btrfs_endio_direct_read(struct bio *bio)
8275 {
8276         struct btrfs_dio_private *dip = bio->bi_private;
8277         struct inode *inode = dip->inode;
8278         struct bio *dio_bio;
8279         struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8280         int err = bio->bi_error;
8281
8282         if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
8283                 err = btrfs_subio_endio_read(inode, io_bio, err);
8284
8285         unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
8286                       dip->logical_offset + dip->bytes - 1);
8287         dio_bio = dip->dio_bio;
8288
8289         kfree(dip);
8290
8291         dio_bio->bi_error = bio->bi_error;
8292         dio_end_io(dio_bio, bio->bi_error);
8293
8294         if (io_bio->end_io)
8295                 io_bio->end_io(io_bio, err);
8296         bio_put(bio);
8297 }
8298
8299 static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
8300                                                     const u64 offset,
8301                                                     const u64 bytes,
8302                                                     const int uptodate)
8303 {
8304         struct btrfs_root *root = BTRFS_I(inode)->root;
8305         struct btrfs_ordered_extent *ordered = NULL;
8306         u64 ordered_offset = offset;
8307         u64 ordered_bytes = bytes;
8308         int ret;
8309
8310 again:
8311         ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
8312                                                    &ordered_offset,
8313                                                    ordered_bytes,
8314                                                    uptodate);
8315         if (!ret)
8316                 goto out_test;
8317
8318         btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
8319                         finish_ordered_fn, NULL, NULL);
8320         btrfs_queue_work(root->fs_info->endio_write_workers,
8321                          &ordered->work);
8322 out_test:
8323         /*
8324          * Our bio might span multiple ordered extents.  If we haven't
8325          * completed the accounting for the whole dio, go back and try again.
8326          */
8327         if (ordered_offset < offset + bytes) {
8328                 ordered_bytes = offset + bytes - ordered_offset;
8329                 ordered = NULL;
8330                 goto again;
8331         }
8332 }
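/*
 * Example of the completion loop above (sizes assumed): a 1MiB dio write
 * at offset 0 that was split into two 512KiB ordered extents finishes in
 * two passes.  The first btrfs_dec_test_first_ordered_pending() call
 * completes the first extent and advances ordered_offset to 512K, still
 * below offset + bytes, so we loop and finish the second extent too.
 */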
8333
8334 static void btrfs_endio_direct_write(struct bio *bio)
8335 {
8336         struct btrfs_dio_private *dip = bio->bi_private;
8337         struct bio *dio_bio = dip->dio_bio;
8338
8339         btrfs_endio_direct_write_update_ordered(dip->inode,
8340                                                 dip->logical_offset,
8341                                                 dip->bytes,
8342                                                 !bio->bi_error);
8343
8344         kfree(dip);
8345
8346         dio_bio->bi_error = bio->bi_error;
8347         dio_end_io(dio_bio, bio->bi_error);
8348         bio_put(bio);
8349 }
8350
8351 static int __btrfs_submit_bio_start_direct_io(struct inode *inode,
8352                                     struct bio *bio, int mirror_num,
8353                                     unsigned long bio_flags, u64 offset)
8354 {
8355         int ret;
8356         struct btrfs_root *root = BTRFS_I(inode)->root;
8357         ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
8358         BUG_ON(ret); /* -ENOMEM */
8359         return 0;
8360 }
8361
8362 static void btrfs_end_dio_bio(struct bio *bio)
8363 {
8364         struct btrfs_dio_private *dip = bio->bi_private;
8365         int err = bio->bi_error;
8366
8367         if (err)
8368                 btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
8369                            "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
8370                            btrfs_ino(dip->inode), bio_op(bio), bio->bi_opf,
8371                            (unsigned long long)bio->bi_iter.bi_sector,
8372                            bio->bi_iter.bi_size, err);
8373
8374         if (dip->subio_endio)
8375                 err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
8376
8377         if (err) {
8378                 dip->errors = 1;
8379
8380                 /*
8381                  * Before the atomic variable goes to zero, we must make
8382                  * sure dip->errors is perceived to be set.
8383                  */
8384                 smp_mb__before_atomic();
8385         }
8386
8387         /* if there are more bios still pending for this dio, just exit */
8388         if (!atomic_dec_and_test(&dip->pending_bios))
8389                 goto out;
8390
8391         if (dip->errors) {
8392                 bio_io_error(dip->orig_bio);
8393         } else {
8394                 dip->dio_bio->bi_error = 0;
8395                 bio_endio(dip->orig_bio);
8396         }
8397 out:
8398         bio_put(bio);
8399 }
8400
8401 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
8402                                        u64 first_sector, gfp_t gfp_flags)
8403 {
8404         struct bio *bio;
8405         bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
8406         if (bio)
8407                 bio_associate_current(bio);
8408         return bio;
8409 }
8410
8411 static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
8412                                                  struct inode *inode,
8413                                                  struct btrfs_dio_private *dip,
8414                                                  struct bio *bio,
8415                                                  u64 file_offset)
8416 {
8417         struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8418         struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
8419         int ret;
8420
8421         /*
8422          * We load all the csum data we need when we submit
8423          * the first bio, to reduce csum tree searches and
8424          * contention.
8425          */
8426         if (dip->logical_offset == file_offset) {
8427                 ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
8428                                                 file_offset);
8429                 if (ret)
8430                         return ret;
8431         }
8432
8433         if (bio == dip->orig_bio)
8434                 return 0;
8435
8436         file_offset -= dip->logical_offset;
8437         file_offset >>= inode->i_sb->s_blocksize_bits;
8438         io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
8439
8440         return 0;
8441 }
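/*
 * Illustration of the csum pointer math above (values assumed): with 4KiB
 * blocks and 4-byte crc32c checksums, a split bio starting at
 * file_offset == dip->logical_offset + 24K is 24576 >> 12 == 6 blocks into
 * the dio, so io_bio->csum points at the 7th u32 of the csum array loaded
 * for the original bio.
 */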
8442
8443 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
8444                                          u64 file_offset, int skip_sum,
8445                                          int async_submit)
8446 {
8447         struct btrfs_dio_private *dip = bio->bi_private;
8448         bool write = bio_op(bio) == REQ_OP_WRITE;
8449         struct btrfs_root *root = BTRFS_I(inode)->root;
8450         int ret;
8451
8452         if (async_submit)
8453                 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
8454
8455         bio_get(bio);
8456
8457         if (!write) {
8458                 ret = btrfs_bio_wq_end_io(root->fs_info, bio,
8459                                 BTRFS_WQ_ENDIO_DATA);
8460                 if (ret)
8461                         goto err;
8462         }
8463
8464         if (skip_sum)
8465                 goto map;
8466
8467         if (write && async_submit) {
8468                 ret = btrfs_wq_submit_bio(root->fs_info,
8469                                    inode, bio, 0, 0, file_offset,
8470                                    __btrfs_submit_bio_start_direct_io,
8471                                    __btrfs_submit_bio_done);
8472                 goto err;
8473         } else if (write) {
8474                 /*
8475                  * If we aren't doing async submit, calculate the csum of the
8476                  * bio now.
8477                  */
8478                 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
8479                 if (ret)
8480                         goto err;
8481         } else {
8482                 ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
8483                                                      file_offset);
8484                 if (ret)
8485                         goto err;
8486         }
8487 map:
8488         ret = btrfs_map_bio(root, bio, 0, async_submit);
8489 err:
8490         bio_put(bio);
8491         return ret;
8492 }
8493
8494 static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
8495                                     int skip_sum)
8496 {
8497         struct inode *inode = dip->inode;
8498         struct btrfs_root *root = BTRFS_I(inode)->root;
8499         struct bio *bio;
8500         struct bio *orig_bio = dip->orig_bio;
8501         struct bio_vec *bvec = orig_bio->bi_io_vec;
8502         u64 start_sector = orig_bio->bi_iter.bi_sector;
8503         u64 file_offset = dip->logical_offset;
8504         u64 submit_len = 0;
8505         u64 map_length;
8506         u32 blocksize = root->sectorsize;
8507         int async_submit = 0;
8508         int nr_sectors;
8509         int ret;
8510         int i;
8511
8512         map_length = orig_bio->bi_iter.bi_size;
8513         ret = btrfs_map_block(root->fs_info, bio_op(orig_bio),
8514                               start_sector << 9, &map_length, NULL, 0);
8515         if (ret)
8516                 return -EIO;
8517
8518         if (map_length >= orig_bio->bi_iter.bi_size) {
8519                 bio = orig_bio;
8520                 dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
8521                 goto submit;
8522         }
8523
8524         /* async crcs make it difficult to collect full stripe writes. */
8525         if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
8526                 async_submit = 0;
8527         else
8528                 async_submit = 1;
8529
8530         bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
8531         if (!bio)
8532                 return -ENOMEM;
8533
8534         bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio));
8535         bio->bi_private = dip;
8536         bio->bi_end_io = btrfs_end_dio_bio;
8537         btrfs_io_bio(bio)->logical = file_offset;
8538
8539         while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
8540                 nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len);
8541                 i = 0;
8542 next_block:
8543                 if (unlikely(map_length < submit_len + blocksize ||
8544                     bio_add_page(bio, bvec->bv_page, blocksize,
8545                             bvec->bv_offset + (i * blocksize)) < blocksize)) {
8546                         /*
8547                          * Increment the count before we submit the bio so
8548                          * we know the end IO handler can't run before we
8549                          * have incremented it. Otherwise, the dip might get
8550                          * freed before we're done setting it up.
8551                          */
8552                         atomic_inc(&dip->pending_bios);
8553                         ret = __btrfs_submit_dio_bio(bio, inode,
8554                                                      file_offset, skip_sum,
8555                                                      async_submit);
8556                         if (ret) {
8557                                 bio_put(bio);
8558                                 atomic_dec(&dip->pending_bios);
8559                                 goto out_err;
8560                         }
8561
8562                         start_sector += submit_len >> 9;
8563                         file_offset += submit_len;
8564
8565                         submit_len = 0;
8566
8567                         bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
8568                                                   start_sector, GFP_NOFS);
8569                         if (!bio)
8570                                 goto out_err;
8571                         bio_set_op_attrs(bio, bio_op(orig_bio),
8572                                          bio_flags(orig_bio));
8573                         bio->bi_private = dip;
8574                         bio->bi_end_io = btrfs_end_dio_bio;
8575                         btrfs_io_bio(bio)->logical = file_offset;
8576
8577                         map_length = orig_bio->bi_iter.bi_size;
8578                         ret = btrfs_map_block(root->fs_info, bio_op(orig_bio),
8579                                               start_sector << 9,
8580                                               &map_length, NULL, 0);
8581                         if (ret) {
8582                                 bio_put(bio);
8583                                 goto out_err;
8584                         }
8585
8586                         goto next_block;
8587                 } else {
8588                         submit_len += blocksize;
8589                         if (--nr_sectors) {
8590                                 i++;
8591                                 goto next_block;
8592                         }
8593                         bvec++;
8594                 }
8595         }
8596
8597 submit:
8598         ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
8599                                      async_submit);
8600         if (!ret)
8601                 return 0;
8602
8603         if (bio != orig_bio)
8604                 bio_put(bio);
8605 out_err:
8606         dip->errors = 1;
8607         /*
8608          * Before the atomic variable goes to zero, we must
8609          * make sure dip->errors is perceived to be set.
8610          */
8611         smp_mb__before_atomic();
8612         if (atomic_dec_and_test(&dip->pending_bios))
8613                 bio_io_error(dip->orig_bio);
8614
8615         /* bio_end_io() will handle the error, so we needn't return it */
8616         return 0;
8617 }
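/*
 * Summary of the splitting above: btrfs_map_block() reports in map_length
 * how many bytes can be issued contiguously from start_sector.  Whenever
 * the next block would cross that limit (for instance a stripe boundary
 * on a multi-device profile), the current bio is submitted, with
 * pending_bios incremented first so the dip cannot be freed early, and a
 * fresh bio is started at the new start_sector.
 */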
8618
8619 static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
8620                                 loff_t file_offset)
8621 {
8622         struct btrfs_dio_private *dip = NULL;
8623         struct bio *io_bio = NULL;
8624         struct btrfs_io_bio *btrfs_bio;
8625         int skip_sum;
8626         bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
8627         int ret = 0;
8628
8629         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
8630
8631         io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
8632         if (!io_bio) {
8633                 ret = -ENOMEM;
8634                 goto free_ordered;
8635         }
8636
8637         dip = kzalloc(sizeof(*dip), GFP_NOFS);
8638         if (!dip) {
8639                 ret = -ENOMEM;
8640                 goto free_ordered;
8641         }
8642
8643         dip->private = dio_bio->bi_private;
8644         dip->inode = inode;
8645         dip->logical_offset = file_offset;
8646         dip->bytes = dio_bio->bi_iter.bi_size;
8647         dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
8648         io_bio->bi_private = dip;
8649         dip->orig_bio = io_bio;
8650         dip->dio_bio = dio_bio;
8651         atomic_set(&dip->pending_bios, 1);
8652         btrfs_bio = btrfs_io_bio(io_bio);
8653         btrfs_bio->logical = file_offset;
8654
8655         if (write) {
8656                 io_bio->bi_end_io = btrfs_endio_direct_write;
8657         } else {
8658                 io_bio->bi_end_io = btrfs_endio_direct_read;
8659                 dip->subio_endio = btrfs_subio_endio_read;
8660         }
8661
8662         /*
8663          * Reset the range for unsubmitted ordered extents (to a 0 length range)
8664          * even if we fail to submit a bio, because in that case we do the
8665          * corresponding error handling below and it must not be done a second
8666          * time by btrfs_direct_IO().
8667          */
8668         if (write) {
8669                 struct btrfs_dio_data *dio_data = current->journal_info;
8670
8671                 dio_data->unsubmitted_oe_range_end = dip->logical_offset +
8672                         dip->bytes;
8673                 dio_data->unsubmitted_oe_range_start =
8674                         dio_data->unsubmitted_oe_range_end;
8675         }
8676
8677         ret = btrfs_submit_direct_hook(dip, skip_sum);
8678         if (!ret)
8679                 return;
8680
8681         if (btrfs_bio->end_io)
8682                 btrfs_bio->end_io(btrfs_bio, ret);
8683
8684 free_ordered:
8685         /*
8686          * If we arrived here it means either we failed to submit the dip,
8687          * we failed to clone the dio_bio, or we failed to allocate the
8688          * dip. If we cloned the dio_bio and allocated the dip, we can just
8689          * call bio_endio against our io_bio so that we get proper resource
8690          * cleanup if we fail to submit the dip; otherwise, we must do the
8691          * same as btrfs_endio_direct_[write|read] because we can't call these
8692          * callbacks - they require an allocated dip and a clone of dio_bio.
8693          */
8694         if (io_bio && dip) {
8695                 io_bio->bi_error = -EIO;
8696                 bio_endio(io_bio);
8697                 /*
8698                  * The end io callbacks free our dip, do the final put on io_bio
8699                  * and all the cleanup and final put for dio_bio (through
8700                  * dio_end_io()).
8701                  */
8702                 dip = NULL;
8703                 io_bio = NULL;
8704         } else {
8705                 if (write)
8706                         btrfs_endio_direct_write_update_ordered(inode,
8707                                                 file_offset,
8708                                                 dio_bio->bi_iter.bi_size,
8709                                                 0);
8710                 else
8711                         unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
8712                               file_offset + dio_bio->bi_iter.bi_size - 1);
8713
8714                 dio_bio->bi_error = -EIO;
8715                 /*
8716                  * Releases and cleans up our dio_bio, no need to bio_put()
8717                  * nor bio_endio()/bio_io_error() against dio_bio.
8718                  */
8719                 dio_end_io(dio_bio, ret);
8720         }
8721         if (io_bio)
8722                 bio_put(io_bio);
8723         kfree(dip);
8724 }
8725
8726 static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,
8727                         const struct iov_iter *iter, loff_t offset)
8728 {
8729         int seg;
8730         int i;
8731         unsigned blocksize_mask = root->sectorsize - 1;
8732         ssize_t retval = -EINVAL;
8733
8734         if (offset & blocksize_mask)
8735                 goto out;
8736
8737         if (iov_iter_alignment(iter) & blocksize_mask)
8738                 goto out;
8739
8740         /* If this is a write we don't need any further checks */
8741         if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
8742                 return 0;
8743         /*
8744          * Check to make sure we don't have duplicate iov_base's in this
8745          * iovec; if so return -EINVAL, otherwise we'll get csum errors
8746          * when reading back.
8747          */
8748         for (seg = 0; seg < iter->nr_segs; seg++) {
8749                 for (i = seg + 1; i < iter->nr_segs; i++) {
8750                         if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
8751                                 goto out;
8752                 }
8753         }
8754         retval = 0;
8755 out:
8756         return retval;
8757 }
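/*
 * Alignment example for the checks above (assuming a 4KiB sectorsize, so
 * blocksize_mask == 0xfff): an offset of 8192 passes since
 * 8192 & 0xfff == 0, while an offset of 6144 fails, making
 * btrfs_direct_IO() return 0 and fall back to buffered IO.
 */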
8758
8759 static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8760 {
8761         struct file *file = iocb->ki_filp;
8762         struct inode *inode = file->f_mapping->host;
8763         struct btrfs_root *root = BTRFS_I(inode)->root;
8764         struct btrfs_dio_data dio_data = { 0 };
8765         loff_t offset = iocb->ki_pos;
8766         size_t count = 0;
8767         int flags = 0;
8768         bool wakeup = true;
8769         bool relock = false;
8770         ssize_t ret;
8771
8772         if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset))
8773                 return 0;
8774
8775         inode_dio_begin(inode);
8776         smp_mb__after_atomic();
8777
8778         /*
8779          * The generic stuff only does filemap_write_and_wait_range, which
8780          * isn't enough if we've written compressed pages to this area, so
8781          * we need to flush the dirty pages again to make absolutely sure
8782          * that any outstanding dirty pages are on disk.
8783          */
8784         count = iov_iter_count(iter);
8785         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8786                      &BTRFS_I(inode)->runtime_flags))
8787                 filemap_fdatawrite_range(inode->i_mapping, offset,
8788                                          offset + count - 1);
8789
8790         if (iov_iter_rw(iter) == WRITE) {
8791                 /*
8792                  * If the write DIO is beyond the EOF, we need to update
8793                  * the i_size, but it is protected by i_mutex. So we
8794                  * cannot unlock the i_mutex in this case.
8795                  */
8796                 if (offset + count <= inode->i_size) {
8797                         inode_unlock(inode);
8798                         relock = true;
8799                 }
8800                 ret = btrfs_delalloc_reserve_space(inode, offset, count);
8801                 if (ret)
8802                         goto out;
8803                 dio_data.outstanding_extents = div64_u64(count +
8804                                                 BTRFS_MAX_EXTENT_SIZE - 1,
8805                                                 BTRFS_MAX_EXTENT_SIZE);
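                /*
                 * The div64_u64() above is a ceiling division over
                 * BTRFS_MAX_EXTENT_SIZE (128MiB).  E.g. a 200MiB dio write
                 * reserves (200M + 128M - 1) / 128M == 2 outstanding
                 * extents, and a 1-byte write still reserves 1; sizes are
                 * illustrative.
                 */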
8806
8807                 /*
8808                  * We need to know how many extents we reserved so that we can
8809                  * do the accounting properly if we go over the number we
8810                  * originally calculated.  Abuse current->journal_info for this.
8811                  */
8812                 dio_data.reserve = round_up(count, root->sectorsize);
8813                 dio_data.unsubmitted_oe_range_start = (u64)offset;
8814                 dio_data.unsubmitted_oe_range_end = (u64)offset;
8815                 current->journal_info = &dio_data;
8816                 down_read(&BTRFS_I(inode)->dio_sem);
8817         } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
8818                                      &BTRFS_I(inode)->runtime_flags)) {
8819                 inode_dio_end(inode);
8820                 flags = DIO_LOCKING | DIO_SKIP_HOLES;
8821                 wakeup = false;
8822         }
8823
8824         ret = __blockdev_direct_IO(iocb, inode,
8825                                    BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
8826                                    iter, btrfs_get_blocks_direct, NULL,
8827                                    btrfs_submit_direct, flags);
8828         if (iov_iter_rw(iter) == WRITE) {
8829                 up_read(&BTRFS_I(inode)->dio_sem);
8830                 current->journal_info = NULL;
8831                 if (ret < 0 && ret != -EIOCBQUEUED) {
8832                         if (dio_data.reserve)
8833                                 btrfs_delalloc_release_space(inode, offset,
8834                                                              dio_data.reserve);
8835                         /*
8836                          * On error we might have left some ordered extents
8837                          * without submitting corresponding bios for them, so
8838                          * clean them up to avoid other tasks getting them
8839                          * and waiting for them to complete forever.
8840                          */
8841                         if (dio_data.unsubmitted_oe_range_start <
8842                             dio_data.unsubmitted_oe_range_end)
8843                                 btrfs_endio_direct_write_update_ordered(inode,
8844                                         dio_data.unsubmitted_oe_range_start,
8845                                         dio_data.unsubmitted_oe_range_end -
8846                                         dio_data.unsubmitted_oe_range_start,
8847                                         0);
8848                 } else if (ret >= 0 && (size_t)ret < count)
8849                         btrfs_delalloc_release_space(inode, offset,
8850                                                      count - (size_t)ret);
8851         }
8852 out:
8853         if (wakeup)
8854                 inode_dio_end(inode);
8855         if (relock)
8856                 inode_lock(inode);
8857
8858         return ret;
8859 }
8860
8861 #define BTRFS_FIEMAP_FLAGS      (FIEMAP_FLAG_SYNC)
8862
8863 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8864                 __u64 start, __u64 len)
8865 {
8866         int     ret;
8867
8868         ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
8869         if (ret)
8870                 return ret;
8871
8872         return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
8873 }
8874
8875 int btrfs_readpage(struct file *file, struct page *page)
8876 {
8877         struct extent_io_tree *tree;
8878         tree = &BTRFS_I(page->mapping->host)->io_tree;
8879         return extent_read_full_page(tree, page, btrfs_get_extent, 0);
8880 }
8881
8882 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
8883 {
8884         struct extent_io_tree *tree;
8885         struct inode *inode = page->mapping->host;
8886         int ret;
8887
8888         if (current->flags & PF_MEMALLOC) {
8889                 redirty_page_for_writepage(wbc, page);
8890                 unlock_page(page);
8891                 return 0;
8892         }
8893
8894         /*
8895          * If we are under memory pressure we will call this directly from the
8896          * VM, so we need to make sure we have the inode referenced for the ordered
8897          * extent.  If not just return like we didn't do anything.
8898          */
8899         if (!igrab(inode)) {
8900                 redirty_page_for_writepage(wbc, page);
8901                 return AOP_WRITEPAGE_ACTIVATE;
8902         }
8903         tree = &BTRFS_I(page->mapping->host)->io_tree;
8904         ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
8905         btrfs_add_delayed_iput(inode);
8906         return ret;
8907 }
8908
8909 static int btrfs_writepages(struct address_space *mapping,
8910                             struct writeback_control *wbc)
8911 {
8912         struct extent_io_tree *tree;
8913
8914         tree = &BTRFS_I(mapping->host)->io_tree;
8915         return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
8916 }
8917
8918 static int
8919 btrfs_readpages(struct file *file, struct address_space *mapping,
8920                 struct list_head *pages, unsigned nr_pages)
8921 {
8922         struct extent_io_tree *tree;
8923         tree = &BTRFS_I(mapping->host)->io_tree;
8924         return extent_readpages(tree, mapping, pages, nr_pages,
8925                                 btrfs_get_extent);
8926 }
8927 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8928 {
8929         struct extent_io_tree *tree;
8930         struct extent_map_tree *map;
8931         int ret;
8932
8933         tree = &BTRFS_I(page->mapping->host)->io_tree;
8934         map = &BTRFS_I(page->mapping->host)->extent_tree;
8935         ret = try_release_extent_mapping(map, tree, page, gfp_flags);
8936         if (ret == 1) {
8937                 ClearPagePrivate(page);
8938                 set_page_private(page, 0);
8939                 put_page(page);
8940         }
8941         return ret;
8942 }
8943
8944 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8945 {
8946         if (PageWriteback(page) || PageDirty(page))
8947                 return 0;
8948         return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
8949 }
8950
8951 static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8952                                  unsigned int length)
8953 {
8954         struct inode *inode = page->mapping->host;
8955         struct extent_io_tree *tree;
8956         struct btrfs_ordered_extent *ordered;
8957         struct extent_state *cached_state = NULL;
8958         u64 page_start = page_offset(page);
8959         u64 page_end = page_start + PAGE_SIZE - 1;
8960         u64 start;
8961         u64 end;
8962         int inode_evicting = inode->i_state & I_FREEING;
8963
8964         /*
8965          * we have the page locked, so new writeback can't start,
8966          * and the dirty bit won't be cleared while we are here.
8967          *
8968          * Wait for IO on this page so that we can safely clear
8969          * the PagePrivate2 bit and do ordered accounting
8970          */
8971         wait_on_page_writeback(page);
8972
8973         tree = &BTRFS_I(inode)->io_tree;
8974         if (offset) {
8975                 btrfs_releasepage(page, GFP_NOFS);
8976                 return;
8977         }
8978
8979         if (!inode_evicting)
8980                 lock_extent_bits(tree, page_start, page_end, &cached_state);
8981 again:
8982         start = page_start;
8983         ordered = btrfs_lookup_ordered_range(inode, start,
8984                                         page_end - start + 1);
8985         if (ordered) {
8986                 end = min(page_end, ordered->file_offset + ordered->len - 1);
8987                 /*
8988                  * IO on this page will never be started, so we need
8989                  * to account for any ordered extents now
8990                  */
8991                 if (!inode_evicting)
8992                         clear_extent_bit(tree, start, end,
8993                                          EXTENT_DIRTY | EXTENT_DELALLOC |
8994                                          EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8995                                          EXTENT_DEFRAG, 1, 0, &cached_state,
8996                                          GFP_NOFS);
8997                 /*
8998                  * whoever cleared the private bit is responsible
8999                  * for the finish_ordered_io
9000                  */
9001                 if (TestClearPagePrivate2(page)) {
9002                         struct btrfs_ordered_inode_tree *tree;
9003                         u64 new_len;
9004
9005                         tree = &BTRFS_I(inode)->ordered_tree;
9006
9007                         spin_lock_irq(&tree->lock);
9008                         set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
9009                         new_len = start - ordered->file_offset;
9010                         if (new_len < ordered->truncated_len)
9011                                 ordered->truncated_len = new_len;
9012                         spin_unlock_irq(&tree->lock);
9013
9014                         if (btrfs_dec_test_ordered_pending(inode, &ordered,
9015                                                            start,
9016                                                            end - start + 1, 1))
9017                                 btrfs_finish_ordered_io(ordered);
9018                 }
9019                 btrfs_put_ordered_extent(ordered);
9020                 if (!inode_evicting) {
9021                         cached_state = NULL;
9022                         lock_extent_bits(tree, start, end,
9023                                          &cached_state);
9024                 }
9025
9026                 start = end + 1;
9027                 if (start < page_end)
9028                         goto again;
9029         }
9030
9031         /*
9032          * Qgroup reserved space handler
9033          * Page here will be either
9034          * 1) Already written to disk
9035          *    In this case, its reserved space is released from data rsv map
9036          *    and will be freed by delayed_ref handler finally.
9037          *    So even if we call qgroup_free_data(), it won't decrease reserved
9038          *    space.
9039          * 2) Not written to disk
9040          *    This means the reserved space should be freed here. However,
9041          *    if a truncate invalidates the page (by clearing PageDirty)
9042          *    and the page is accounted for while allocating the extent
9043          *    in btrfs_check_data_free_space(), we let the delayed_ref
9044          *    handler free the entire extent.
9045          */
9046         if (PageDirty(page))
9047                 btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
9048         if (!inode_evicting) {
9049                 clear_extent_bit(tree, page_start, page_end,
9050                                  EXTENT_LOCKED | EXTENT_DIRTY |
9051                                  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
9052                                  EXTENT_DEFRAG, 1, 1,
9053                                  &cached_state, GFP_NOFS);
9054
9055                 __btrfs_releasepage(page, GFP_NOFS);
9056         }
9057
9058         ClearPageChecked(page);
9059         if (PagePrivate(page)) {
9060                 ClearPagePrivate(page);
9061                 set_page_private(page, 0);
9062                 put_page(page);
9063         }
9064 }
9065
9066 /*
9067  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
9068  * called from a page fault handler when a page is first dirtied. Hence we must
9069  * be careful to check for EOF conditions here. We set the page up correctly
9070  * for a written page which means we get ENOSPC checking when writing into
9071  * holes and correct delalloc and unwritten extent mapping on filesystems that
9072  * support these features.
9073  *
9074  * We are not allowed to take the i_mutex here so we have to play games to
9075  * protect against truncate races as the page could now be beyond EOF.  Because
9076  * vmtruncate() writes the inode size before removing pages, once we have the
9077  * page lock we can determine safely if the page is beyond EOF. If it is not
9078  * beyond EOF, then the page is guaranteed safe against truncation until we
9079  * unlock the page.
9080  */
9081 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
9082 {
9083         struct page *page = vmf->page;
9084         struct inode *inode = file_inode(vma->vm_file);
9085         struct btrfs_root *root = BTRFS_I(inode)->root;
9086         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
9087         struct btrfs_ordered_extent *ordered;
9088         struct extent_state *cached_state = NULL;
9089         char *kaddr;
9090         unsigned long zero_start;
9091         loff_t size;
9092         int ret;
9093         int reserved = 0;
9094         u64 reserved_space;
9095         u64 page_start;
9096         u64 page_end;
9097         u64 end;
9098
9099         reserved_space = PAGE_SIZE;
9100
9101         sb_start_pagefault(inode->i_sb);
9102         page_start = page_offset(page);
9103         page_end = page_start + PAGE_SIZE - 1;
9104         end = page_end;
9105
9106         /*
9107          * Reserving delalloc space after obtaining the page lock can lead to
9108          * deadlock. For example, if a dirty page is locked by this function
9109          * and the call to btrfs_delalloc_reserve_space() ends up triggering
9110          * dirty page write out, then the btrfs_writepage() function could
9111          * end up waiting indefinitely to get a lock on the page currently
9112          * being processed by btrfs_page_mkwrite() function.
9113          */
9114         ret = btrfs_delalloc_reserve_space(inode, page_start,
9115                                            reserved_space);
9116         if (!ret) {
9117                 ret = file_update_time(vma->vm_file);
9118                 reserved = 1;
9119         }
9120         if (ret) {
9121                 if (ret == -ENOMEM)
9122                         ret = VM_FAULT_OOM;
9123                 else /* -ENOSPC, -EIO, etc */
9124                         ret = VM_FAULT_SIGBUS;
9125                 if (reserved)
9126                         goto out;
9127                 goto out_noreserve;
9128         }
9129
9130         ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
9131 again:
9132         lock_page(page);
9133         size = i_size_read(inode);
9134
9135         if ((page->mapping != inode->i_mapping) ||
9136             (page_start >= size)) {
9137                 /* page got truncated out from underneath us */
9138                 goto out_unlock;
9139         }
9140         wait_on_page_writeback(page);
9141
9142         lock_extent_bits(io_tree, page_start, page_end, &cached_state);
9143         set_page_extent_mapped(page);
9144
9145         /*
9146          * we can't set the delalloc bits if there are pending ordered
9147          * extents.  Drop our locks and wait for them to finish
9148          */
9149         ordered = btrfs_lookup_ordered_range(inode, page_start, page_end);
9150         if (ordered) {
9151                 unlock_extent_cached(io_tree, page_start, page_end,
9152                                      &cached_state, GFP_NOFS);
9153                 unlock_page(page);
9154                 btrfs_start_ordered_extent(inode, ordered, 1);
9155                 btrfs_put_ordered_extent(ordered);
9156                 goto again;
9157         }
9158
9159         if (page->index == ((size - 1) >> PAGE_SHIFT)) {
9160                 reserved_space = round_up(size - page_start, root->sectorsize);
9161                 if (reserved_space < PAGE_SIZE) {
9162                         end = page_start + reserved_space - 1;
9163                         spin_lock(&BTRFS_I(inode)->lock);
9164                         BTRFS_I(inode)->outstanding_extents++;
9165                         spin_unlock(&BTRFS_I(inode)->lock);
9166                         btrfs_delalloc_release_space(inode, page_start,
9167                                                 PAGE_SIZE - reserved_space);
9168                 }
9169         }
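        /*
         * Worked example for the trim above (numbers assumed): with a 4KiB
         * sectorsize and i_size == page_start + 1K, reserved_space becomes
         * round_up(1K, 4K) == 4K, so on 4KiB pages nothing is released; on
         * a 64KiB-page system the same math would keep 4K and give back
         * the remaining 60K of the reservation.
         */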
9170
9171         /*
9172          * XXX - page_mkwrite gets called every time the page is dirtied, even
9173          * if it was already dirty, so for space accounting reasons we need to
9174          * clear any delalloc bits for the range we are fixing to save.  There
9175          * is probably a better way to do this, but for now keep consistent with
9176          * prepare_pages in the normal write path.
9177          */
9178         clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
9179                           EXTENT_DIRTY | EXTENT_DELALLOC |
9180                           EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
9181                           0, 0, &cached_state, GFP_NOFS);
9182
9183         ret = btrfs_set_extent_delalloc(inode, page_start, end,
9184                                         &cached_state, 0);
9185         if (ret) {
9186                 unlock_extent_cached(io_tree, page_start, page_end,
9187                                      &cached_state, GFP_NOFS);
9188                 ret = VM_FAULT_SIGBUS;
9189                 goto out_unlock;
9190         }
9191         ret = 0;
9192
9193         /* page is wholly or partially inside EOF */
9194         if (page_start + PAGE_SIZE > size)
9195                 zero_start = size & ~PAGE_MASK;
9196         else
9197                 zero_start = PAGE_SIZE;
9198
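        /*
         * Example of the zero_start math above (numbers assumed): with
         * i_size == 10000 and 4KiB pages, the third page (page_start ==
         * 8192) is partial, so zero_start = 10000 & ~PAGE_MASK == 1808 and
         * the trailing 4096 - 1808 bytes are zeroed below.
         */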
9199         if (zero_start != PAGE_SIZE) {
9200                 kaddr = kmap(page);
9201                 memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
9202                 flush_dcache_page(page);
9203                 kunmap(page);
9204         }
9205         ClearPageChecked(page);
9206         set_page_dirty(page);
9207         SetPageUptodate(page);
9208
9209         BTRFS_I(inode)->last_trans = root->fs_info->generation;
9210         BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
9211         BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
9212
9213         unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
9214
9215 out_unlock:
9216         if (!ret) {
9217                 sb_end_pagefault(inode->i_sb);
9218                 return VM_FAULT_LOCKED;
9219         }
9220         unlock_page(page);
9221 out:
9222         btrfs_delalloc_release_space(inode, page_start, reserved_space);
9223 out_noreserve:
9224         sb_end_pagefault(inode->i_sb);
9225         return ret;
9226 }
9227
9228 static int btrfs_truncate(struct inode *inode)
9229 {
9230         struct btrfs_root *root = BTRFS_I(inode)->root;
9231         struct btrfs_block_rsv *rsv;
9232         int ret = 0;
9233         int err = 0;
9234         struct btrfs_trans_handle *trans;
9235         u64 mask = root->sectorsize - 1;
9236         u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
9237
9238         ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
9239                                        (u64)-1);
9240         if (ret)
9241                 return ret;
9242
9243         /*
9244          * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
9245          * 3 things going on here
9246          *
9247          * 1) We need to reserve space for our orphan item and the space to
9248          * delete our orphan item.  Lord knows we don't want to have a dangling
9249          * orphan item because we didn't reserve space to remove it.
9250          *
9251          * 2) We need to reserve space to update our inode.
9252          *
9253          * 3) We need to have something to cache all the space that is going to
9254          * be freed up by the truncate operation, but also have some slack
9255          * space reserved in case it uses space during the truncate (thank you
9256          * very much snapshotting).
9257          *
9258          * And we need these to all be separate.  The fact is we can use a lot of
9259          * space doing the truncate, and we have no earthly idea how much space
9260          * we will use, so we need the truncate reservation to be separate so it
9261          * doesn't end up using space reserved for updating the inode or
9262          * removing the orphan item.  We also need to be able to stop the
9263          * transaction and start a new one, which means we need to be able to
9264          * update the inode several times, and we have no way of knowing how
9265          * many times that will be, so we can't just reserve 1 item for the
9266          * entirety of the operation, so that has to be done separately as well.
9267          * Then there is the orphan item, which does indeed need to be held on
9268          * to for the whole operation, and we need nobody to touch this reserved
9269          * space except the orphan code.
9270          *
9271          * So that leaves us with
9272          *
9273          * 1) root->orphan_block_rsv - for the orphan deletion.
9274          * 2) rsv - for the truncate reservation, which we will steal from the
9275          * transaction reservation.
9276          * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
9277          * updating the inode.
9278          */
9279         rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
9280         if (!rsv)
9281                 return -ENOMEM;
9282         rsv->size = min_size;
9283         rsv->failfast = 1;
9284
9285         /*
9286          * 1 for the truncate slack space
9287          * 1 for updating the inode.
9288          */
9289         trans = btrfs_start_transaction(root, 2);
9290         if (IS_ERR(trans)) {
9291                 err = PTR_ERR(trans);
9292                 goto out;
9293         }
9294
9295         /* Migrate the slack space for the truncate to our reserve */
9296         ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
9297                                       min_size, 0);
9298         BUG_ON(ret);
9299
9300         /*
9301          * So if we truncate and then write and fsync we normally would just
9302          * write the extents that changed, which is a problem if we need to
9303          * first truncate that entire inode.  So set this flag so we write out
9304          * all of the extents in the inode to the sync log so we're completely
9305          * safe.
9306          */
9307         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
9308         trans->block_rsv = rsv;
9309
9310         while (1) {
9311                 ret = btrfs_truncate_inode_items(trans, root, inode,
9312                                                  inode->i_size,
9313                                                  BTRFS_EXTENT_DATA_KEY);
9314                 if (ret != -ENOSPC && ret != -EAGAIN) {
9315                         err = ret;
9316                         break;
9317                 }
9318
9319                 trans->block_rsv = &root->fs_info->trans_block_rsv;
9320                 ret = btrfs_update_inode(trans, root, inode);
9321                 if (ret) {
9322                         err = ret;
9323                         break;
9324                 }
9325
9326                 btrfs_end_transaction(trans, root);
9327                 btrfs_btree_balance_dirty(root);
9328
9329                 trans = btrfs_start_transaction(root, 2);
9330                 if (IS_ERR(trans)) {
9331                         ret = err = PTR_ERR(trans);
9332                         trans = NULL;
9333                         break;
9334                 }
9335
9336                 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
9337                                               rsv, min_size, 0);
9338                 BUG_ON(ret);    /* shouldn't happen */
9339                 trans->block_rsv = rsv;
9340         }
9341
9342         if (ret == 0 && inode->i_nlink > 0) {
9343                 trans->block_rsv = root->orphan_block_rsv;
9344                 ret = btrfs_orphan_del(trans, inode);
9345                 if (ret)
9346                         err = ret;
9347         }
9348
9349         if (trans) {
9350                 trans->block_rsv = &root->fs_info->trans_block_rsv;
9351                 ret = btrfs_update_inode(trans, root, inode);
9352                 if (ret && !err)
9353                         err = ret;
9354
9355                 ret = btrfs_end_transaction(trans, root);
9356                 btrfs_btree_balance_dirty(root);
9357         }
9358 out:
9359         btrfs_free_block_rsv(root, rsv);
9360
9361         if (ret && !err)
9362                 err = ret;
9363
9364         return err;
9365 }
9366
9367 /*
9368  * create a new subvolume directory/inode (helper for the ioctl).
9369  */
9370 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
9371                              struct btrfs_root *new_root,
9372                              struct btrfs_root *parent_root,
9373                              u64 new_dirid)
9374 {
9375         struct inode *inode;
9376         int err;
9377         u64 index = 0;
9378
9379         inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
9380                                 new_dirid, new_dirid,
9381                                 S_IFDIR | (~current_umask() & S_IRWXUGO),
9382                                 &index);
9383         if (IS_ERR(inode))
9384                 return PTR_ERR(inode);
9385         inode->i_op = &btrfs_dir_inode_operations;
9386         inode->i_fop = &btrfs_dir_file_operations;
9387
9388         set_nlink(inode, 1);
9389         btrfs_i_size_write(inode, 0);
9390         unlock_new_inode(inode);
9391
9392         err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
9393         if (err)
9394                 btrfs_err(new_root->fs_info,
9395                           "error inheriting subvolume %llu properties: %d",
9396                           new_root->root_key.objectid, err);
9397
9398         err = btrfs_update_inode(trans, new_root, inode);
9399
9400         iput(inode);
9401         return err;
9402 }
9403
9404 struct inode *btrfs_alloc_inode(struct super_block *sb)
9405 {
9406         struct btrfs_inode *ei;
9407         struct inode *inode;
9408
9409         ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
9410         if (!ei)
9411                 return NULL;
9412
9413         ei->root = NULL;
9414         ei->generation = 0;
9415         ei->last_trans = 0;
9416         ei->last_sub_trans = 0;
9417         ei->logged_trans = 0;
9418         ei->delalloc_bytes = 0;
9419         ei->defrag_bytes = 0;
9420         ei->disk_i_size = 0;
9421         ei->flags = 0;
9422         ei->csum_bytes = 0;
9423         ei->index_cnt = (u64)-1;
9424         ei->dir_index = 0;
9425         ei->last_unlink_trans = 0;
9426         ei->last_log_commit = 0;
9427         ei->delayed_iput_count = 0;
9428
9429         spin_lock_init(&ei->lock);
9430         ei->outstanding_extents = 0;
9431         ei->reserved_extents = 0;
9432
9433         ei->runtime_flags = 0;
9434         ei->force_compress = BTRFS_COMPRESS_NONE;
9435
9436         ei->delayed_node = NULL;
9437
9438         ei->i_otime.tv_sec = 0;
9439         ei->i_otime.tv_nsec = 0;
9440
9441         inode = &ei->vfs_inode;
9442         extent_map_tree_init(&ei->extent_tree);
9443         extent_io_tree_init(&ei->io_tree, &inode->i_data);
9444         extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
9445         ei->io_tree.track_uptodate = 1;
9446         ei->io_failure_tree.track_uptodate = 1;
9447         atomic_set(&ei->sync_writers, 0);
9448         mutex_init(&ei->log_mutex);
9449         mutex_init(&ei->delalloc_mutex);
9450         btrfs_ordered_inode_tree_init(&ei->ordered_tree);
9451         INIT_LIST_HEAD(&ei->delalloc_inodes);
9452         INIT_LIST_HEAD(&ei->delayed_iput);
9453         RB_CLEAR_NODE(&ei->rb_node);
9454         init_rwsem(&ei->dio_sem);
9455
9456         return inode;
9457 }
9458
9459 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
9460 void btrfs_test_destroy_inode(struct inode *inode)
9461 {
9462         btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
9463         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9464 }
9465 #endif
9466
9467 static void btrfs_i_callback(struct rcu_head *head)
9468 {
9469         struct inode *inode = container_of(head, struct inode, i_rcu);
9470         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9471 }
9472
9473 void btrfs_destroy_inode(struct inode *inode)
9474 {
9475         struct btrfs_ordered_extent *ordered;
9476         struct btrfs_root *root = BTRFS_I(inode)->root;
9477
9478         WARN_ON(!hlist_empty(&inode->i_dentry));
9479         WARN_ON(inode->i_data.nrpages);
9480         WARN_ON(BTRFS_I(inode)->outstanding_extents);
9481         WARN_ON(BTRFS_I(inode)->reserved_extents);
9482         WARN_ON(BTRFS_I(inode)->delalloc_bytes);
9483         WARN_ON(BTRFS_I(inode)->csum_bytes);
9484         WARN_ON(BTRFS_I(inode)->defrag_bytes);
9485
9486         /*
9487          * This can happen when we create an inode, but somebody else also
9488          * created the same inode and we need to destroy the one we already
9489          * created.
9490          */
9491         if (!root)
9492                 goto free;
9493
9494         if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
9495                      &BTRFS_I(inode)->runtime_flags)) {
9496                 btrfs_info(root->fs_info, "inode %llu still on the orphan list",
9497                         btrfs_ino(inode));
9498                 atomic_dec(&root->orphan_inodes);
9499         }
9500
9501         while (1) {
9502                 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
9503                 if (!ordered)
9504                         break;
9505                 btrfs_err(root->fs_info,
9506                           "found ordered extent %llu %llu on inode cleanup",
9507                           ordered->file_offset, ordered->len);
9508                 btrfs_remove_ordered_extent(inode, ordered);
9509                 /* once for the lookup reference ... */
9510                 btrfs_put_ordered_extent(ordered);
9511                 /* ... and once for the initial ref, freeing the extent */
9512                 btrfs_put_ordered_extent(ordered);
9513         }
9514         btrfs_qgroup_check_reserved_leak(inode);
9515         inode_tree_del(inode);
9516         btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
9517 free:
9518         call_rcu(&inode->i_rcu, btrfs_i_callback);
9519 }
9520
9521 int btrfs_drop_inode(struct inode *inode)
9522 {
9523         struct btrfs_root *root = BTRFS_I(inode)->root;
9524
9525         if (root == NULL)
9526                 return 1;
9527
9528         /* the snapshot/subvolume tree is being deleted */
9529         if (btrfs_root_refs(&root->root_item) == 0)
9530                 return 1;
9531
9532         return generic_drop_inode(inode);
9533 }
9534
9535 static void init_once(void *foo)
9536 {
9537         struct btrfs_inode *ei = (struct btrfs_inode *) foo;
9538
9539         inode_init_once(&ei->vfs_inode);
9540 }
9541
9542 void btrfs_destroy_cachep(void)
9543 {
9544         /*
9545          * Make sure all delayed rcu free inodes are flushed before we
9546          * destroy cache.
9547          */
9548         rcu_barrier();
9549         kmem_cache_destroy(btrfs_inode_cachep);
9550         kmem_cache_destroy(btrfs_trans_handle_cachep);
9551         kmem_cache_destroy(btrfs_transaction_cachep);
9552         kmem_cache_destroy(btrfs_path_cachep);
9553         kmem_cache_destroy(btrfs_free_space_cachep);
9554 }
9555
9556 int btrfs_init_cachep(void)
9557 {
9558         btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
9559                         sizeof(struct btrfs_inode), 0,
9560                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
9561                         init_once);
9562         if (!btrfs_inode_cachep)
9563                 goto fail;
9564
9565         btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
9566                         sizeof(struct btrfs_trans_handle), 0,
9567                         SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
9568         if (!btrfs_trans_handle_cachep)
9569                 goto fail;
9570
9571         btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
9572                         sizeof(struct btrfs_transaction), 0,
9573                         SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
9574         if (!btrfs_transaction_cachep)
9575                 goto fail;
9576
9577         btrfs_path_cachep = kmem_cache_create("btrfs_path",
9578                         sizeof(struct btrfs_path), 0,
9579                         SLAB_MEM_SPREAD, NULL);
9580         if (!btrfs_path_cachep)
9581                 goto fail;
9582
9583         btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
9584                         sizeof(struct btrfs_free_space), 0,
9585                         SLAB_MEM_SPREAD, NULL);
9586         if (!btrfs_free_space_cachep)
9587                 goto fail;
9588
9589         return 0;
9590 fail:
9591         btrfs_destroy_cachep();
9592         return -ENOMEM;
9593 }
9594
9595 static int btrfs_getattr(struct vfsmount *mnt,
9596                          struct dentry *dentry, struct kstat *stat)
9597 {
9598         u64 delalloc_bytes;
9599         struct inode *inode = d_inode(dentry);
9600         u32 blocksize = inode->i_sb->s_blocksize;
9601
9602         generic_fillattr(inode, stat);
9603         stat->dev = BTRFS_I(inode)->root->anon_dev;
9604
9605         spin_lock(&BTRFS_I(inode)->lock);
9606         delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
9607         spin_unlock(&BTRFS_I(inode)->lock);
9608         stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
9609                         ALIGN(delalloc_bytes, blocksize)) >> 9;
9610         return 0;
9611 }
9612
9613 static int btrfs_rename_exchange(struct inode *old_dir,
9614                               struct dentry *old_dentry,
9615                               struct inode *new_dir,
9616                               struct dentry *new_dentry)
9617 {
9618         struct btrfs_trans_handle *trans;
9619         struct btrfs_root *root = BTRFS_I(old_dir)->root;
9620         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9621         struct inode *new_inode = new_dentry->d_inode;
9622         struct inode *old_inode = old_dentry->d_inode;
9623         struct timespec ctime = current_time(old_inode);
9624         struct dentry *parent;
9625         u64 old_ino = btrfs_ino(old_inode);
9626         u64 new_ino = btrfs_ino(new_inode);
9627         u64 old_idx = 0;
9628         u64 new_idx = 0;
9629         u64 root_objectid;
9630         int ret;
9631         int ret2;
9632         bool root_log_pinned = false;
9633         bool dest_log_pinned = false;
9634
9635         /*
9636          * For non-subvolumes, allow an exchange only within one subvolume, in
9637          * the same inode namespace. Two subvolumes (represented as directories)
9638          * can be exchanged as they're logical links and have fixed inode numbers.
9639          */
9640         if (root != dest &&
9641             (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
9642              new_ino != BTRFS_FIRST_FREE_OBJECTID))
9643                 return -EXDEV;
9644
9645         /* close the race window with snapshot create/destroy ioctl */
9646         if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
9647             new_ino == BTRFS_FIRST_FREE_OBJECTID)
9648                 down_read(&dest->fs_info->subvol_sem);
9649
9650         /*
9651          * We want to reserve the absolute worst case amount of items.  So if
9652          * both inodes are subvols and we need to unlink them then that would
9653          * require 4 item modifications, but if they are both normal inodes it
9654          * would require 5 item modifications, so we'll assume they're normal
9655          * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
9656          * should cover the worst case number of items we'll modify.
9657          */
9658         trans = btrfs_start_transaction(root, 12);
9659         if (IS_ERR(trans)) {
9660                 ret = PTR_ERR(trans);
9661                 goto out_notrans;
9662         }
9663
9664         if (dest != root)
9665                 btrfs_record_root_in_trans(trans, dest);
9666
9667         /*
9668          * We need to find a free sequence number both in the source and
9669          * in the destination directory for the exchange.
9670          */
9671         ret = btrfs_set_inode_index(new_dir, &old_idx);
9672         if (ret)
9673                 goto out_fail;
9674         ret = btrfs_set_inode_index(old_dir, &new_idx);
9675         if (ret)
9676                 goto out_fail;
9677
9678         BTRFS_I(old_inode)->dir_index = 0ULL;
9679         BTRFS_I(new_inode)->dir_index = 0ULL;
9680
9681         /* Reference for the source. */
9682         if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9683                 /* force full log commit if subvolume involved. */
9684                 btrfs_set_log_full_commit(root->fs_info, trans);
9685         } else {
9686                 btrfs_pin_log_trans(root);
9687                 root_log_pinned = true;
9688                 ret = btrfs_insert_inode_ref(trans, dest,
9689                                              new_dentry->d_name.name,
9690                                              new_dentry->d_name.len,
9691                                              old_ino,
9692                                              btrfs_ino(new_dir), old_idx);
9693                 if (ret)
9694                         goto out_fail;
9695         }
9696
9697         /* And now for the dest. */
9698         if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9699                 /* force full log commit if subvolume involved. */
9700                 btrfs_set_log_full_commit(dest->fs_info, trans);
9701         } else {
9702                 btrfs_pin_log_trans(dest);
9703                 dest_log_pinned = true;
9704                 ret = btrfs_insert_inode_ref(trans, root,
9705                                              old_dentry->d_name.name,
9706                                              old_dentry->d_name.len,
9707                                              new_ino,
9708                                              btrfs_ino(old_dir), new_idx);
9709                 if (ret)
9710                         goto out_fail;
9711         }
9712
9713         /* Update inode version and ctime/mtime. */
9714         inode_inc_iversion(old_dir);
9715         inode_inc_iversion(new_dir);
9716         inode_inc_iversion(old_inode);
9717         inode_inc_iversion(new_inode);
9718         old_dir->i_ctime = old_dir->i_mtime = ctime;
9719         new_dir->i_ctime = new_dir->i_mtime = ctime;
9720         old_inode->i_ctime = ctime;
9721         new_inode->i_ctime = ctime;
9722
9723         if (old_dentry->d_parent != new_dentry->d_parent) {
9724                 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
9725                 btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
9726         }
9727
9728         /* src is a subvolume */
9729         if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9730                 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
9731                 ret = btrfs_unlink_subvol(trans, root, old_dir,
9732                                           root_objectid,
9733                                           old_dentry->d_name.name,
9734                                           old_dentry->d_name.len);
9735         } else { /* src is an inode */
9736                 ret = __btrfs_unlink_inode(trans, root, old_dir,
9737                                            old_dentry->d_inode,
9738                                            old_dentry->d_name.name,
9739                                            old_dentry->d_name.len);
9740                 if (!ret)
9741                         ret = btrfs_update_inode(trans, root, old_inode);
9742         }
9743         if (ret) {
9744                 btrfs_abort_transaction(trans, ret);
9745                 goto out_fail;
9746         }
9747
9748         /* dest is a subvolume */
9749         if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9750                 root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
9751                 ret = btrfs_unlink_subvol(trans, dest, new_dir,
9752                                           root_objectid,
9753                                           new_dentry->d_name.name,
9754                                           new_dentry->d_name.len);
9755         } else { /* dest is an inode */
9756                 ret = __btrfs_unlink_inode(trans, dest, new_dir,
9757                                            new_dentry->d_inode,
9758                                            new_dentry->d_name.name,
9759                                            new_dentry->d_name.len);
9760                 if (!ret)
9761                         ret = btrfs_update_inode(trans, dest, new_inode);
9762         }
9763         if (ret) {
9764                 btrfs_abort_transaction(trans, ret);
9765                 goto out_fail;
9766         }
9767
9768         ret = btrfs_add_link(trans, new_dir, old_inode,
9769                              new_dentry->d_name.name,
9770                              new_dentry->d_name.len, 0, old_idx);
9771         if (ret) {
9772                 btrfs_abort_transaction(trans, ret);
9773                 goto out_fail;
9774         }
9775
9776         ret = btrfs_add_link(trans, old_dir, new_inode,
9777                              old_dentry->d_name.name,
9778                              old_dentry->d_name.len, 0, new_idx);
9779         if (ret) {
9780                 btrfs_abort_transaction(trans, ret);
9781                 goto out_fail;
9782         }
9783
9784         if (old_inode->i_nlink == 1)
9785                 BTRFS_I(old_inode)->dir_index = old_idx;
9786         if (new_inode->i_nlink == 1)
9787                 BTRFS_I(new_inode)->dir_index = new_idx;
9788
9789         if (root_log_pinned) {
9790                 parent = new_dentry->d_parent;
9791                 btrfs_log_new_name(trans, old_inode, old_dir, parent);
9792                 btrfs_end_log_trans(root);
9793                 root_log_pinned = false;
9794         }
9795         if (dest_log_pinned) {
9796                 parent = old_dentry->d_parent;
9797                 btrfs_log_new_name(trans, new_inode, new_dir, parent);
9798                 btrfs_end_log_trans(dest);
9799                 dest_log_pinned = false;
9800         }
9801 out_fail:
9802         /*
9803          * If we have pinned a log and an error happened, we unpin tasks
9804          * trying to sync the log and force them to fall back to a transaction
9805          * commit if the log currently contains any of the inodes involved in
9806          * this rename operation (to ensure we do not persist a log with an
9807          * inconsistent state for any of these inodes, or one that leads to
9808          * inconsistencies when replayed). If the transaction was aborted, the
9809          * abort reason is propagated to userspace when attempting to commit
9810          * the transaction. If the log does not contain any of these inodes, we
9811          * allow the tasks to sync it.
9812          */
9813         if (ret && (root_log_pinned || dest_log_pinned)) {
9814                 if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
9815                     btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
9816                     btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
9817                     (new_inode &&
9818                      btrfs_inode_in_log(new_inode, root->fs_info->generation)))
9819                         btrfs_set_log_full_commit(root->fs_info, trans);
9820
9821                 if (root_log_pinned) {
9822                         btrfs_end_log_trans(root);
9823                         root_log_pinned = false;
9824                 }
9825                 if (dest_log_pinned) {
9826                         btrfs_end_log_trans(dest);
9827                         dest_log_pinned = false;
9828                 }
9829         }
9830         ret2 = btrfs_end_transaction(trans, root);
9831         ret = ret ? ret : ret2;
9832 out_notrans:
9833         if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
9834             old_ino == BTRFS_FIRST_FREE_OBJECTID)
9835                 up_read(&root->fs_info->subvol_sem);
9836
9837         return ret;
9838 }
9839
9840 static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
9841                                      struct btrfs_root *root,
9842                                      struct inode *dir,
9843                                      struct dentry *dentry)
9844 {
9845         int ret;
9846         struct inode *inode;
9847         u64 objectid;
9848         u64 index;
9849
9850         ret = btrfs_find_free_ino(root, &objectid);
9851         if (ret)
9852                 return ret;
9853
9854         inode = btrfs_new_inode(trans, root, dir,
9855                                 dentry->d_name.name,
9856                                 dentry->d_name.len,
9857                                 btrfs_ino(dir),
9858                                 objectid,
9859                                 S_IFCHR | WHITEOUT_MODE,
9860                                 &index);
9861
9862         if (IS_ERR(inode)) {
9863                 ret = PTR_ERR(inode);
9864                 return ret;
9865         }
9866
9867         inode->i_op = &btrfs_special_inode_operations;
9868         init_special_inode(inode, inode->i_mode,
9869                 WHITEOUT_DEV);
9870
9871         ret = btrfs_init_inode_security(trans, inode, dir,
9872                                 &dentry->d_name);
9873         if (ret)
9874                 goto out;
9875
9876         ret = btrfs_add_nondir(trans, dir, dentry,
9877                                 inode, 0, index);
9878         if (ret)
9879                 goto out;
9880
9881         ret = btrfs_update_inode(trans, root, inode);
9882 out:
9883         unlock_new_inode(inode);
9884         if (ret)
9885                 inode_dec_link_count(inode);
9886         iput(inode);
9887
9888         return ret;
9889 }
9890
9891 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9892                            struct inode *new_dir, struct dentry *new_dentry,
9893                            unsigned int flags)
9894 {
9895         struct btrfs_trans_handle *trans;
9896         unsigned int trans_num_items;
9897         struct btrfs_root *root = BTRFS_I(old_dir)->root;
9898         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9899         struct inode *new_inode = d_inode(new_dentry);
9900         struct inode *old_inode = d_inode(old_dentry);
9901         u64 index = 0;
9902         u64 root_objectid;
9903         int ret;
9904         u64 old_ino = btrfs_ino(old_inode);
9905         bool log_pinned = false;
9906
9907         if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9908                 return -EPERM;
9909
9910         /* we only allow rename subvolume link between subvolumes */
9911         if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9912                 return -EXDEV;
9913
9914         if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9915             (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
9916                 return -ENOTEMPTY;
9917
9918         if (S_ISDIR(old_inode->i_mode) && new_inode &&
9919             new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
9920                 return -ENOTEMPTY;
9921
9922
9923         /* check for collisions, even if the name isn't there */
9924         ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
9925                              new_dentry->d_name.name,
9926                              new_dentry->d_name.len);
9927
9928         if (ret) {
9929                 if (ret == -EEXIST) {
9930                         /* we shouldn't get -EEXIST without a new_inode */
9931                         if (WARN_ON(!new_inode))
9932                                 return ret;
9933                 } else {
9934                         /* maybe -EOVERFLOW */
9935                         return ret;
9936                 }
9937         }
9940         ret = 0;
9941
9942         /*
9943          * we're using rename to replace one file with another.  Start IO on it
9944          * now so  we don't add too much work to the end of the transaction
9945          */
9946         if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9947                 filemap_flush(old_inode->i_mapping);
9948
9949         /* close the race window with snapshot create/destroy ioctl */
9950         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9951                 down_read(&root->fs_info->subvol_sem);
9952         /*
9953          * We want to reserve the absolute worst case amount of items.  So if
9954          * both inodes are subvols and we need to unlink them then that would
9955          * require 4 item modifications, but if they are both normal inodes it
9956          * would require 5 item modifications, so we'll assume they are normal
9957          * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
9958          * should cover the worst case number of items we'll modify.
9959          * If our rename has the whiteout flag, we need 5 more units for the
9960          * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
9961          * when selinux is enabled).
9962          */
9963         trans_num_items = 11;
9964         if (flags & RENAME_WHITEOUT)
9965                 trans_num_items += 5;
9966         trans = btrfs_start_transaction(root, trans_num_items);
9967         if (IS_ERR(trans)) {
9968                 ret = PTR_ERR(trans);
9969                 goto out_notrans;
9970         }
9971
9972         if (dest != root)
9973                 btrfs_record_root_in_trans(trans, dest);
9974
9975         ret = btrfs_set_inode_index(new_dir, &index);
9976         if (ret)
9977                 goto out_fail;
9978
9979         BTRFS_I(old_inode)->dir_index = 0ULL;
9980         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9981                 /* force full log commit if subvolume involved. */
9982                 btrfs_set_log_full_commit(root->fs_info, trans);
9983         } else {
9984                 btrfs_pin_log_trans(root);
9985                 log_pinned = true;
9986                 ret = btrfs_insert_inode_ref(trans, dest,
9987                                              new_dentry->d_name.name,
9988                                              new_dentry->d_name.len,
9989                                              old_ino,
9990                                              btrfs_ino(new_dir), index);
9991                 if (ret)
9992                         goto out_fail;
9993         }
9994
9995         inode_inc_iversion(old_dir);
9996         inode_inc_iversion(new_dir);
9997         inode_inc_iversion(old_inode);
9998         old_dir->i_ctime = old_dir->i_mtime =
9999         new_dir->i_ctime = new_dir->i_mtime =
10000         old_inode->i_ctime = current_time(old_dir);
10001
10002         if (old_dentry->d_parent != new_dentry->d_parent)
10003                 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
10004
10005         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
10006                 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
10007                 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
10008                                         old_dentry->d_name.name,
10009                                         old_dentry->d_name.len);
10010         } else {
10011                 ret = __btrfs_unlink_inode(trans, root, old_dir,
10012                                         d_inode(old_dentry),
10013                                         old_dentry->d_name.name,
10014                                         old_dentry->d_name.len);
10015                 if (!ret)
10016                         ret = btrfs_update_inode(trans, root, old_inode);
10017         }
10018         if (ret) {
10019                 btrfs_abort_transaction(trans, ret);
10020                 goto out_fail;
10021         }
10022
10023         if (new_inode) {
10024                 inode_inc_iversion(new_inode);
10025                 new_inode->i_ctime = current_time(new_inode);
10026                 if (unlikely(btrfs_ino(new_inode) ==
10027                              BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
10028                         root_objectid = BTRFS_I(new_inode)->location.objectid;
10029                         ret = btrfs_unlink_subvol(trans, dest, new_dir,
10030                                                 root_objectid,
10031                                                 new_dentry->d_name.name,
10032                                                 new_dentry->d_name.len);
10033                         BUG_ON(new_inode->i_nlink == 0);
10034                 } else {
10035                         ret = btrfs_unlink_inode(trans, dest, new_dir,
10036                                                  d_inode(new_dentry),
10037                                                  new_dentry->d_name.name,
10038                                                  new_dentry->d_name.len);
10039                 }
10040                 if (!ret && new_inode->i_nlink == 0)
10041                         ret = btrfs_orphan_add(trans, d_inode(new_dentry));
10042                 if (ret) {
10043                         btrfs_abort_transaction(trans, ret);
10044                         goto out_fail;
10045                 }
10046         }
10047
10048         ret = btrfs_add_link(trans, new_dir, old_inode,
10049                              new_dentry->d_name.name,
10050                              new_dentry->d_name.len, 0, index);
10051         if (ret) {
10052                 btrfs_abort_transaction(trans, ret);
10053                 goto out_fail;
10054         }
10055
10056         if (old_inode->i_nlink == 1)
10057                 BTRFS_I(old_inode)->dir_index = index;
10058
10059         if (log_pinned) {
10060                 struct dentry *parent = new_dentry->d_parent;
10061
10062                 btrfs_log_new_name(trans, old_inode, old_dir, parent);
10063                 btrfs_end_log_trans(root);
10064                 log_pinned = false;
10065         }
10066
10067         if (flags & RENAME_WHITEOUT) {
10068                 ret = btrfs_whiteout_for_rename(trans, root, old_dir,
10069                                                 old_dentry);
10070
10071                 if (ret) {
10072                         btrfs_abort_transaction(trans, ret);
10073                         goto out_fail;
10074                 }
10075         }
10076 out_fail:
10077         /*
10078          * If we have pinned the log and an error happened, we unpin tasks
10079          * trying to sync the log and force them to fall back to a transaction
10080          * commit if the log currently contains any of the inodes involved in
10081          * this rename operation (to ensure we do not persist a log with an
10082          * inconsistent state for any of these inodes, or one that leads to
10083          * inconsistencies when replayed). If the transaction was aborted, the
10084          * abort reason is propagated to userspace when attempting to commit
10085          * the transaction. If the log does not contain any of these inodes, we
10086          * allow the tasks to sync it.
10087          */
10088         if (ret && log_pinned) {
10089                 if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
10090                     btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
10091                     btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
10092                     (new_inode &&
10093                      btrfs_inode_in_log(new_inode, root->fs_info->generation)))
10094                         btrfs_set_log_full_commit(root->fs_info, trans);
10095
10096                 btrfs_end_log_trans(root);
10097                 log_pinned = false;
10098         }
10099         btrfs_end_transaction(trans, root);
10100 out_notrans:
10101         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
10102                 up_read(&root->fs_info->subvol_sem);
10103
10104         return ret;
10105 }
10106
10107 static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
10108                          struct inode *new_dir, struct dentry *new_dentry,
10109                          unsigned int flags)
10110 {
10111         if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
10112                 return -EINVAL;
10113
10114         if (flags & RENAME_EXCHANGE)
10115                 return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
10116                                           new_dentry);
10117
10118         return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
10119 }
10120
10121 static void btrfs_run_delalloc_work(struct btrfs_work *work)
10122 {
10123         struct btrfs_delalloc_work *delalloc_work;
10124         struct inode *inode;
10125
10126         delalloc_work = container_of(work, struct btrfs_delalloc_work,
10127                                      work);
10128         inode = delalloc_work->inode;
10129         filemap_flush(inode->i_mapping);
10130         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
10131                                 &BTRFS_I(inode)->runtime_flags))
10132                 filemap_flush(inode->i_mapping);
10133
10134         if (delalloc_work->delay_iput)
10135                 btrfs_add_delayed_iput(inode);
10136         else
10137                 iput(inode);
10138         complete(&delalloc_work->completion);
10139 }
10140
10141 struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
10142                                                     int delay_iput)
10143 {
10144         struct btrfs_delalloc_work *work;
10145
10146         work = kmalloc(sizeof(*work), GFP_NOFS);
10147         if (!work)
10148                 return NULL;
10149
10150         init_completion(&work->completion);
10151         INIT_LIST_HEAD(&work->list);
10152         work->inode = inode;
10153         work->delay_iput = delay_iput;
10154         WARN_ON_ONCE(!inode);
10155         btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
10156                         btrfs_run_delalloc_work, NULL, NULL);
10157
10158         return work;
10159 }
10160
10161 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
10162 {
10163         wait_for_completion(&work->completion);
10164         kfree(work);
10165 }
10166
10167 /*
10168  * some fairly slow code that needs optimization. This walks the list
10169  * of all the inodes with pending delalloc and forces them to disk.
10170  */
10171 static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
10172                                    int nr)
10173 {
10174         struct btrfs_inode *binode;
10175         struct inode *inode;
10176         struct btrfs_delalloc_work *work, *next;
10177         struct list_head works;
10178         struct list_head splice;
10179         int ret = 0;
10180
10181         INIT_LIST_HEAD(&works);
10182         INIT_LIST_HEAD(&splice);
10183
10184         mutex_lock(&root->delalloc_mutex);
10185         spin_lock(&root->delalloc_lock);
10186         list_splice_init(&root->delalloc_inodes, &splice);
10187         while (!list_empty(&splice)) {
10188                 binode = list_entry(splice.next, struct btrfs_inode,
10189                                     delalloc_inodes);
10190
10191                 list_move_tail(&binode->delalloc_inodes,
10192                                &root->delalloc_inodes);
10193                 inode = igrab(&binode->vfs_inode);
10194                 if (!inode) {
10195                         cond_resched_lock(&root->delalloc_lock);
10196                         continue;
10197                 }
10198                 spin_unlock(&root->delalloc_lock);
10199
10200                 work = btrfs_alloc_delalloc_work(inode, delay_iput);
10201                 if (!work) {
10202                         if (delay_iput)
10203                                 btrfs_add_delayed_iput(inode);
10204                         else
10205                                 iput(inode);
10206                         ret = -ENOMEM;
10207                         goto out;
10208                 }
10209                 list_add_tail(&work->list, &works);
10210                 btrfs_queue_work(root->fs_info->flush_workers,
10211                                  &work->work);
10212                 ret++;
10213                 if (nr != -1 && ret >= nr)
10214                         goto out;
10215                 cond_resched();
10216                 spin_lock(&root->delalloc_lock);
10217         }
10218         spin_unlock(&root->delalloc_lock);
10219
10220 out:
10221         list_for_each_entry_safe(work, next, &works, list) {
10222                 list_del_init(&work->list);
10223                 btrfs_wait_and_free_delalloc_work(work);
10224         }
10225
10226         if (!list_empty_careful(&splice)) {
10227                 spin_lock(&root->delalloc_lock);
10228                 list_splice_tail(&splice, &root->delalloc_inodes);
10229                 spin_unlock(&root->delalloc_lock);
10230         }
10231         mutex_unlock(&root->delalloc_mutex);
10232         return ret;
10233 }
10234
10235 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
10236 {
10237         int ret;
10238
10239         if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
10240                 return -EROFS;
10241
10242         ret = __start_delalloc_inodes(root, delay_iput, -1);
10243         if (ret > 0)
10244                 ret = 0;
10245         /*
10246          * the filemap_flush will queue IO into the worker threads, but
10247          * we have to make sure the IO is actually started and that
10248          * ordered extents get created before we return
10249          */
10250         atomic_inc(&root->fs_info->async_submit_draining);
10251         while (atomic_read(&root->fs_info->nr_async_submits) ||
10252               atomic_read(&root->fs_info->async_delalloc_pages)) {
10253                 wait_event(root->fs_info->async_submit_wait,
10254                    (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
10255                     atomic_read(&root->fs_info->async_delalloc_pages) == 0));
10256         }
10257         atomic_dec(&root->fs_info->async_submit_draining);
10258         return ret;
10259 }
10260
10261 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
10262                                int nr)
10263 {
10264         struct btrfs_root *root;
10265         struct list_head splice;
10266         int ret;
10267
10268         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
10269                 return -EROFS;
10270
10271         INIT_LIST_HEAD(&splice);
10272
10273         mutex_lock(&fs_info->delalloc_root_mutex);
10274         spin_lock(&fs_info->delalloc_root_lock);
10275         list_splice_init(&fs_info->delalloc_roots, &splice);
10276         while (!list_empty(&splice) && nr) {
10277                 root = list_first_entry(&splice, struct btrfs_root,
10278                                         delalloc_root);
10279                 root = btrfs_grab_fs_root(root);
10280                 BUG_ON(!root);
10281                 list_move_tail(&root->delalloc_root,
10282                                &fs_info->delalloc_roots);
10283                 spin_unlock(&fs_info->delalloc_root_lock);
10284
10285                 ret = __start_delalloc_inodes(root, delay_iput, nr);
10286                 btrfs_put_fs_root(root);
10287                 if (ret < 0)
10288                         goto out;
10289
10290                 if (nr != -1) {
10291                         nr -= ret;
10292                         WARN_ON(nr < 0);
10293                 }
10294                 spin_lock(&fs_info->delalloc_root_lock);
10295         }
10296         spin_unlock(&fs_info->delalloc_root_lock);
10297
10298         ret = 0;
10299         atomic_inc(&fs_info->async_submit_draining);
10300         while (atomic_read(&fs_info->nr_async_submits) ||
10301               atomic_read(&fs_info->async_delalloc_pages)) {
10302                 wait_event(fs_info->async_submit_wait,
10303                    (atomic_read(&fs_info->nr_async_submits) == 0 &&
10304                     atomic_read(&fs_info->async_delalloc_pages) == 0));
10305         }
10306         atomic_dec(&fs_info->async_submit_draining);
10307 out:
10308         if (!list_empty_careful(&splice)) {
10309                 spin_lock(&fs_info->delalloc_root_lock);
10310                 list_splice_tail(&splice, &fs_info->delalloc_roots);
10311                 spin_unlock(&fs_info->delalloc_root_lock);
10312         }
10313         mutex_unlock(&fs_info->delalloc_root_mutex);
10314         return ret;
10315 }
10316
10317 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
10318                          const char *symname)
10319 {
10320         struct btrfs_trans_handle *trans;
10321         struct btrfs_root *root = BTRFS_I(dir)->root;
10322         struct btrfs_path *path;
10323         struct btrfs_key key;
10324         struct inode *inode = NULL;
10325         int err;
10326         int drop_inode = 0;
10327         u64 objectid;
10328         u64 index = 0;
10329         int name_len;
10330         int datasize;
10331         unsigned long ptr;
10332         struct btrfs_file_extent_item *ei;
10333         struct extent_buffer *leaf;
10334
10335         name_len = strlen(symname);
10336         if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
10337                 return -ENAMETOOLONG;
10338
10339         /*
10340          * 2 items for inode item and ref
10341          * 2 items for dir items
10342          * 1 item for updating parent inode item
10343          * 1 item for the inline extent item
10344          * 1 item for xattr if selinux is on
10345          */
10346         trans = btrfs_start_transaction(root, 7);
10347         if (IS_ERR(trans))
10348                 return PTR_ERR(trans);
10349
10350         err = btrfs_find_free_ino(root, &objectid);
10351         if (err)
10352                 goto out_unlock;
10353
10354         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
10355                                 dentry->d_name.len, btrfs_ino(dir), objectid,
10356                                 S_IFLNK|S_IRWXUGO, &index);
10357         if (IS_ERR(inode)) {
10358                 err = PTR_ERR(inode);
10359                 goto out_unlock;
10360         }
10361
10362         /*
10363          * If the active LSM wants to access the inode during
10364          * d_instantiate it needs these. Smack checks to see
10365          * if the filesystem supports xattrs by looking at the
10366          * ops vector.
10367          */
10368         inode->i_fop = &btrfs_file_operations;
10369         inode->i_op = &btrfs_file_inode_operations;
10370         inode->i_mapping->a_ops = &btrfs_aops;
10371         BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
10372
10373         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
10374         if (err)
10375                 goto out_unlock_inode;
10376
10377         path = btrfs_alloc_path();
10378         if (!path) {
10379                 err = -ENOMEM;
10380                 goto out_unlock_inode;
10381         }
10382         key.objectid = btrfs_ino(inode);
10383         key.offset = 0;
10384         key.type = BTRFS_EXTENT_DATA_KEY;
10385         datasize = btrfs_file_extent_calc_inline_size(name_len);
10386         err = btrfs_insert_empty_item(trans, root, path, &key,
10387                                       datasize);
10388         if (err) {
10389                 btrfs_free_path(path);
10390                 goto out_unlock_inode;
10391         }
10392         leaf = path->nodes[0];
10393         ei = btrfs_item_ptr(leaf, path->slots[0],
10394                             struct btrfs_file_extent_item);
10395         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
10396         btrfs_set_file_extent_type(leaf, ei,
10397                                    BTRFS_FILE_EXTENT_INLINE);
10398         btrfs_set_file_extent_encryption(leaf, ei, 0);
10399         btrfs_set_file_extent_compression(leaf, ei, 0);
10400         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
10401         btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
10402
10403         ptr = btrfs_file_extent_inline_start(ei);
10404         write_extent_buffer(leaf, symname, ptr, name_len);
10405         btrfs_mark_buffer_dirty(leaf);
10406         btrfs_free_path(path);
10407
10408         inode->i_op = &btrfs_symlink_inode_operations;
10409         inode_nohighmem(inode);
10410         inode->i_mapping->a_ops = &btrfs_symlink_aops;
10411         inode_set_bytes(inode, name_len);
10412         btrfs_i_size_write(inode, name_len);
10413         err = btrfs_update_inode(trans, root, inode);
10414         /*
10415          * Last step: add the directory indexes for our symlink inode.  Doing
10416          * this last avoids extra cleanup of those indexes if an error happens
10417          * elsewhere above.
10418          */
10419         if (!err)
10420                 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
10421         if (err) {
10422                 drop_inode = 1;
10423                 goto out_unlock_inode;
10424         }
10425
10426         d_instantiate_new(dentry, inode);
10427
10428 out_unlock:
10429         btrfs_end_transaction(trans, root);
10430         if (drop_inode) {
10431                 inode_dec_link_count(inode);
10432                 iput(inode);
10433         }
10434         btrfs_btree_balance_dirty(root);
10435         return err;
10436
10437 out_unlock_inode:
10438         drop_inode = 1;
10439         unlock_new_inode(inode);
10440         goto out_unlock;
10441 }
10442
10443 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
10444                                        u64 start, u64 num_bytes, u64 min_size,
10445                                        loff_t actual_len, u64 *alloc_hint,
10446                                        struct btrfs_trans_handle *trans)
10447 {
10448         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
10449         struct extent_map *em;
10450         struct btrfs_root *root = BTRFS_I(inode)->root;
10451         struct btrfs_key ins;
10452         u64 cur_offset = start;
10453         u64 i_size;
10454         u64 cur_bytes;
10455         u64 last_alloc = (u64)-1;
10456         int ret = 0;
10457         bool own_trans = true;
10458         u64 end = start + num_bytes - 1;
10459
10460         if (trans)
10461                 own_trans = false;
10462         while (num_bytes > 0) {
10463                 if (own_trans) {
10464                         trans = btrfs_start_transaction(root, 3);
10465                         if (IS_ERR(trans)) {
10466                                 ret = PTR_ERR(trans);
10467                                 break;
10468                         }
10469                 }
10470
10471                 cur_bytes = min_t(u64, num_bytes, SZ_256M);
10472                 cur_bytes = max(cur_bytes, min_size);
10473                 /*
10474                  * If we are severely fragmented we could end up with really
10475                  * small allocations, so if the allocator is returning small
10476                  * chunks, let's make its job easier by only searching for
10477                  * chunks of that size.
10478                  */
10479                 cur_bytes = min(cur_bytes, last_alloc);
10480                 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
10481                                 min_size, 0, *alloc_hint, &ins, 1, 0);
10482                 if (ret) {
10483                         if (own_trans)
10484                                 btrfs_end_transaction(trans, root);
10485                         break;
10486                 }
10487                 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
10488
10489                 last_alloc = ins.offset;
10490                 ret = insert_reserved_file_extent(trans, inode,
10491                                                   cur_offset, ins.objectid,
10492                                                   ins.offset, ins.offset,
10493                                                   ins.offset, 0, 0, 0,
10494                                                   BTRFS_FILE_EXTENT_PREALLOC);
10495                 if (ret) {
10496                         btrfs_free_reserved_extent(root, ins.objectid,
10497                                                    ins.offset, 0);
10498                         btrfs_abort_transaction(trans, ret);
10499                         if (own_trans)
10500                                 btrfs_end_transaction(trans, root);
10501                         break;
10502                 }
10503
10504                 btrfs_drop_extent_cache(inode, cur_offset,
10505                                         cur_offset + ins.offset - 1, 0);
10506
10507                 em = alloc_extent_map();
10508                 if (!em) {
10509                         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
10510                                 &BTRFS_I(inode)->runtime_flags);
10511                         goto next;
10512                 }
10513
10514                 em->start = cur_offset;
10515                 em->orig_start = cur_offset;
10516                 em->len = ins.offset;
10517                 em->block_start = ins.objectid;
10518                 em->block_len = ins.offset;
10519                 em->orig_block_len = ins.offset;
10520                 em->ram_bytes = ins.offset;
10521                 em->bdev = root->fs_info->fs_devices->latest_bdev;
10522                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
10523                 em->generation = trans->transid;
10524
10525                 while (1) {
10526                         write_lock(&em_tree->lock);
10527                         ret = add_extent_mapping(em_tree, em, 1);
10528                         write_unlock(&em_tree->lock);
10529                         if (ret != -EEXIST)
10530                                 break;
10531                         btrfs_drop_extent_cache(inode, cur_offset,
10532                                                 cur_offset + ins.offset - 1,
10533                                                 0);
10534                 }
10535                 free_extent_map(em);
10536 next:
10537                 num_bytes -= ins.offset;
10538                 cur_offset += ins.offset;
10539                 *alloc_hint = ins.objectid + ins.offset;
10540
10541                 inode_inc_iversion(inode);
10542                 inode->i_ctime = current_time(inode);
10543                 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
10544                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
10545                     (actual_len > inode->i_size) &&
10546                     (cur_offset > inode->i_size)) {
10547                         if (cur_offset > actual_len)
10548                                 i_size = actual_len;
10549                         else
10550                                 i_size = cur_offset;
10551                         i_size_write(inode, i_size);
10552                         btrfs_ordered_update_i_size(inode, i_size, NULL);
10553                 }
10554
10555                 ret = btrfs_update_inode(trans, root, inode);
10556
10557                 if (ret) {
10558                         btrfs_abort_transaction(trans, ret);
10559                         if (own_trans)
10560                                 btrfs_end_transaction(trans, root);
10561                         break;
10562                 }
10563
10564                 if (own_trans)
10565                         btrfs_end_transaction(trans, root);
10566         }
10567         if (cur_offset < end)
10568                 btrfs_free_reserved_data_space(inode, cur_offset,
10569                         end - cur_offset + 1);
10570         return ret;
10571 }
10572
10573 int btrfs_prealloc_file_range(struct inode *inode, int mode,
10574                               u64 start, u64 num_bytes, u64 min_size,
10575                               loff_t actual_len, u64 *alloc_hint)
10576 {
10577         return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
10578                                            min_size, actual_len, alloc_hint,
10579                                            NULL);
10580 }
10581
10582 int btrfs_prealloc_file_range_trans(struct inode *inode,
10583                                     struct btrfs_trans_handle *trans, int mode,
10584                                     u64 start, u64 num_bytes, u64 min_size,
10585                                     loff_t actual_len, u64 *alloc_hint)
10586 {
10587         return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
10588                                            min_size, actual_len, alloc_hint, trans);
10589 }
10590
10591 static int btrfs_set_page_dirty(struct page *page)
10592 {
10593         return __set_page_dirty_nobuffers(page);
10594 }
10595
10596 static int btrfs_permission(struct inode *inode, int mask)
10597 {
10598         struct btrfs_root *root = BTRFS_I(inode)->root;
10599         umode_t mode = inode->i_mode;
10600
10601         if (mask & MAY_WRITE &&
10602             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
10603                 if (btrfs_root_readonly(root))
10604                         return -EROFS;
10605                 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
10606                         return -EACCES;
10607         }
10608         return generic_permission(inode, mask);
10609 }
10610
10611 static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
10612 {
10613         struct btrfs_trans_handle *trans;
10614         struct btrfs_root *root = BTRFS_I(dir)->root;
10615         struct inode *inode = NULL;
10616         u64 objectid;
10617         u64 index;
10618         int ret = 0;
10619
10620         /*
10621          * 5 units required for adding orphan entry
10622          */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_find_free_ino(root, &objectid);
	if (ret)
		goto out;

	inode = btrfs_new_inode(trans, root, dir, NULL, 0,
				btrfs_ino(dir), objectid, mode, &index);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		inode = NULL;
		goto out;
	}

	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;

	inode->i_mapping->a_ops = &btrfs_aops;
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;

	ret = btrfs_init_inode_security(trans, inode, dir, NULL);
	if (ret)
		goto out_inode;

	ret = btrfs_update_inode(trans, root, inode);
	if (ret)
		goto out_inode;
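	/*
	 * The new inode has no links yet, so it must sit on the orphan
	 * list; if we crash before it is linked or finally released,
	 * orphan cleanup will delete it on the next mount.
	 */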
	ret = btrfs_orphan_add(trans, inode);
	if (ret)
		goto out_inode;

	/*
	 * We set the number of links to 0 in btrfs_new_inode(), and here we
	 * set it to 1 because d_tmpfile() would otherwise issue a warning
	 * on a zero link count, via:
	 *
	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
	 */
	set_nlink(inode, 1);
	unlock_new_inode(inode);
	d_tmpfile(dentry, inode);
	mark_inode_dirty(inode);

out:
	btrfs_end_transaction(trans, root);
	if (ret)
		iput(inode);
	btrfs_balance_delayed_items(root);
	btrfs_btree_balance_dirty(root);
	return ret;

out_inode:
	unlock_new_inode(inode);
	goto out;
}

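/* Inode operations for regular, writable directories. */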
static const struct inode_operations btrfs_dir_inode_operations = {
	.getattr	= btrfs_getattr,
	.lookup		= btrfs_lookup,
	.create		= btrfs_create,
	.unlink		= btrfs_unlink,
	.link		= btrfs_link,
	.mkdir		= btrfs_mkdir,
	.rmdir		= btrfs_rmdir,
	.rename		= btrfs_rename2,
	.symlink	= btrfs_symlink,
	.setattr	= btrfs_setattr,
	.mknod		= btrfs_mknod,
	.listxattr	= btrfs_listxattr,
	.permission	= btrfs_permission,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
	.tmpfile	= btrfs_tmpfile,
};
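
/*
 * Reduced operations for the read-only placeholder directories that
 * stand in for subvolume roots we can't read (see new_simple_dir()).
 */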
static const struct inode_operations btrfs_dir_ro_inode_operations = {
	.lookup		= btrfs_lookup,
	.permission	= btrfs_permission,
	.update_time	= btrfs_update_time,
};

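/* File operations for open directory handles. */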
static const struct file_operations btrfs_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= btrfs_real_readdir,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
};

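/*
 * Hooks the generic extent_io code calls back into: delalloc fill, bio
 * submission and merging, read/write completion, and extent state
 * (set/clear/merge/split) tracking.
 */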
static const struct extent_io_ops btrfs_extent_io_ops = {
	.fill_delalloc = run_delalloc_range,
	.submit_bio_hook = btrfs_submit_bio_hook,
	.merge_bio_hook = btrfs_merge_bio_hook,
	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
	.writepage_start_hook = btrfs_writepage_start_hook,
	.set_bit_hook = btrfs_set_bit_hook,
	.clear_bit_hook = btrfs_clear_bit_hook,
	.merge_extent_hook = btrfs_merge_extent_hook,
	.split_extent_hook = btrfs_split_extent_hook,
};

/*
 * btrfs doesn't support the bmap operation because swapfiles use bmap
 * to build a mapping of extents in the file.  They assume these extents
 * won't change over the life of the file and use the bmap result to do
 * IO directly to the drive.
 *
 * The btrfs bmap call would return logical addresses that aren't
 * suitable for IO, and those addresses would also change frequently as
 * COW operations happen.  So, swapfile + btrfs == corruption.
 *
 * For now we're avoiding this by dropping bmap.
 */
static const struct address_space_operations btrfs_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.writepages	= btrfs_writepages,
	.readpages	= btrfs_readpages,
	.direct_IO	= btrfs_direct_IO,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
	.set_page_dirty	= btrfs_set_page_dirty,
	.error_remove_page = generic_error_remove_page,
};

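/*
 * Symlink targets are stored like ordinary file data, but only this
 * minimal set of address space operations is needed for them.
 */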
static const struct address_space_operations btrfs_symlink_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
};

static const struct inode_operations btrfs_file_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.listxattr	= btrfs_listxattr,
	.permission	= btrfs_permission,
	.fiemap		= btrfs_fiemap,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
};
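
/* Inode operations for device nodes, FIFOs and sockets (mknod). */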
static const struct inode_operations btrfs_special_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.listxattr	= btrfs_listxattr,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
};
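
/* Symlink targets are read back through the page cache via page_get_link(). */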
static const struct inode_operations btrfs_symlink_inode_operations = {
	.readlink	= generic_readlink,
	.get_link	= page_get_link,
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.listxattr	= btrfs_listxattr,
	.update_time	= btrfs_update_time,
};

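/*
 * d_delete stops us from caching dentries that point into deleted
 * subvolumes; d_release frees the data btrfs hangs off d_fsdata.
 */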
const struct dentry_operations btrfs_dentry_operations = {
	.d_delete	= btrfs_dentry_delete,
	.d_release	= btrfs_dentry_release,
};