fs/btrfs/send.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2012 Alexander Block.  All rights reserved.
   4  */
   5
   6 #include <linux/bsearch.h>
   7 #include <linux/fs.h>
   8 #include <linux/file.h>
   9 #include <linux/sort.h>
  10 #include <linux/mount.h>
  11 #include <linux/xattr.h>
  12 #include <linux/posix_acl_xattr.h>
  13 #include <linux/radix-tree.h>
  14 #include <linux/vmalloc.h>
  15 #include <linux/string.h>
  16 #include <linux/compat.h>
  17 #include <linux/crc32c.h>
  18 #include <linux/fsverity.h>
  19
  20 #include "send.h"
  21 #include "ctree.h"
  22 #include "backref.h"
  23 #include "locking.h"
  24 #include "disk-io.h"
  25 #include "btrfs_inode.h"
  26 #include "transaction.h"
  27 #include "compression.h"
  28 #include "xattr.h"
  29 #include "print-tree.h"
  30
/*
 * Maximum number of references an extent can have in order for us to attempt to
 * issue clone operations instead of write operations. This currently exists to
 * avoid hitting limitations of the backreference walking code (taking a lot of
 * time and using too much memory for extents with a large number of
 * references).
 */
#define SEND_MAX_EXTENT_REFS    64
  38
/*
 * A fs_path is a helper to dynamically build path names with unknown size.
 * It reallocates the internal buffer on demand.
 * It allows fast adding of path elements on the right side (normal path) and
 * fast adding to the left side (reversed path). A reversed path can also be
 * unreversed if needed.
 */
struct fs_path {
        union {
                struct {
                        /* First byte of the path currently stored in buf. */
                        char *start;
                        /* Points at the NUL byte that terminates the path. */
                        char *end;

                        /* Backing storage: inline_buf or a heap allocation. */
                        char *buf;
                        /* Size of buf in bytes (15 bit field). */
                        unsigned short buf_len:15;
                        /* Set when elements are added on the left side. */
                        unsigned short reversed:1;
                        /* Storage inside the struct, used until it is too small. */
                        char inline_buf[];
                };
                /*
                 * Average path length does not exceed 200 bytes, we'll have
                 * better packing in the slab and higher chance to satisfy
                 * an allocation later during send.
                 */
                char pad[256];
        };
};
/* Bytes of the padded struct that are available to inline_buf. */
#define FS_PATH_INLINE_SIZE \
        (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
  67
  68
/*
 * Per-root record; struct send_ctx keeps an array of these in 'clone_roots'.
 * Reused for each extent.
 *
 * NOTE(review): ino/offset/found_refs presumably describe where, and how often,
 * the current extent was found in 'root' - the users of this struct are not
 * visible here, confirm against them.
 */
struct clone_root {
        struct btrfs_root *root;
        u64 ino;
        u64 offset;

        u64 found_refs;
};
  77
/*
 * Name cache sizing constants; the "clean" size is twice the maximum size.
 * NOTE(review): presumably compared against send_ctx::name_cache_size - the
 * code using them is not visible here, confirm.
 */
#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
  80
/*
 * State of one send operation: the output file and the command buffer, the
 * roots involved, the inode currently being processed and the bookkeeping for
 * directory moves/removals that have to be delayed.
 */
struct send_ctx {
        /* File the stream is written to, and the current position in it. */
        struct file *send_filp;
        loff_t send_off;
        /* Buffer in which the command currently being built is assembled. */
        char *send_buf;
        /* Number of bytes of send_buf in use; reset to 0 once a command is sent. */
        u32 send_size;
        /* Capacity of send_buf, checked before an attribute is appended. */
        u32 send_max_size;
        /*
         * Whether BTRFS_SEND_A_DATA attribute was already added to current
         * command (since protocol v2, data must be the last attribute).
         */
        bool put_data;
        struct page **send_buf_pages;
        u64 flags;      /* 'flags' member of btrfs_ioctl_send_args is u64 */
        /* Protocol version compatibility requested */
        u32 proto;

        struct btrfs_root *send_root;
        /* NULL when there is no parent root to compare against. */
        struct btrfs_root *parent_root;
        struct clone_root *clone_roots;
        int clone_roots_cnt;

        /* current state of the compare_tree call */
        struct btrfs_path *left_path;
        struct btrfs_path *right_path;
        struct btrfs_key *cmp_key;

        /*
         * Keep track of the generation of the last transaction that was used
         * for relocating a block group. This is periodically checked in order
         * to detect if a relocation happened since the last check, so that we
         * don't operate on stale extent buffers for nodes (level >= 1) or on
         * stale disk_bytenr values of file extent items.
         */
        u64 last_reloc_trans;

        /*
         * infos of the currently processed inode. In case of deleted inodes,
         * these are the values from the deleted inode.
         */
        u64 cur_ino;
        u64 cur_inode_gen;
        u64 cur_inode_size;
        u64 cur_inode_mode;
        u64 cur_inode_rdev;
        u64 cur_inode_last_extent;
        u64 cur_inode_next_write_offset;
        bool cur_inode_new;
        bool cur_inode_new_gen;
        bool cur_inode_deleted;
        bool ignore_cur_inode;
        bool cur_inode_needs_verity;
        void *verity_descriptor;

        u64 send_progress;

        struct list_head new_refs;
        struct list_head deleted_refs;

        struct radix_tree_root name_cache;
        struct list_head name_cache_list;
        int name_cache_size;

        /*
         * The inode we are currently processing. It's not NULL only when we
         * need to issue write commands for data extents from this inode.
         */
        struct inode *cur_inode;
        struct file_ra_state ra;
        u64 page_cache_clear_start;
        bool clean_page_cache;

        /*
         * We process inodes by their increasing order, so if before an
         * incremental send we reverse the parent/child relationship of
         * directories such that a directory with a lower inode number was
         * the parent of a directory with a higher inode number, and the one
         * becoming the new parent got renamed too, we can't rename/move the
         * directory with lower inode number when we finish processing it - we
         * must process the directory with higher inode number first, then
         * rename/move it and then rename/move the directory with lower inode
         * number. Example follows.
         *
         * Tree state when the first send was performed:
         *
         * .
         * |-- a                   (ino 257)
         *     |-- b               (ino 258)
         *         |
         *         |
         *         |-- c           (ino 259)
         *         |   |-- d       (ino 260)
         *         |
         *         |-- c2          (ino 261)
         *
         * Tree state when the second (incremental) send is performed:
         *
         * .
         * |-- a                   (ino 257)
         *     |-- b               (ino 258)
         *         |-- c2          (ino 261)
         *             |-- d2      (ino 260)
         *                 |-- cc  (ino 259)
         *
         * The sequence of steps that lead to the second state was:
         *
         * mv /a/b/c/d /a/b/c2/d2
         * mv /a/b/c /a/b/c2/d2/cc
         *
         * "c" has lower inode number, but we can't move it (2nd mv operation)
         * before we move "d", which has higher inode number.
         *
         * So we just memorize which move/rename operations must be performed
         * later when their respective parent is processed and moved/renamed.
         */

        /* Indexed by parent directory inode number. */
        struct rb_root pending_dir_moves;

        /*
         * Reverse index, indexed by the inode number of a directory that
         * is waiting for the move/rename of its immediate parent before its
         * own move/rename can be performed.
         */
        struct rb_root waiting_dir_moves;

        /*
         * A directory that is going to be rm'ed might have a child directory
         * which is in the pending directory moves index above. In this case,
         * the directory can only be removed after the move/rename of its child
         * is performed. Example:
         *
         * Parent snapshot:
         *
         * .                        (ino 256)
         * |-- a/                   (ino 257)
         *     |-- b/               (ino 258)
         *         |-- c/           (ino 259)
         *         |   |-- x/       (ino 260)
         *         |
         *         |-- y/           (ino 261)
         *
         * Send snapshot:
         *
         * .                        (ino 256)
         * |-- a/                   (ino 257)
         *     |-- b/               (ino 258)
         *         |-- YY/          (ino 261)
         *              |-- x/      (ino 260)
         *
         * Sequence of steps that lead to the send snapshot:
         * rm -f /a/b/c/foo.txt
         * mv /a/b/y /a/b/YY
         * mv /a/b/c/x /a/b/YY
         * rmdir /a/b/c
         *
         * When the child is processed, its move/rename is delayed until its
         * parent is processed (as explained above), but all other operations
         * like update utimes, chown, chgrp, etc, are performed and the paths
         * that it uses for those operations must use the orphanized name of
         * its parent (the directory we're going to rm later), so we need to
         * memorize that name.
         *
         * Indexed by the inode number of the directory to be deleted.
         */
        struct rb_root orphan_dirs;

        struct rb_root rbtree_new_refs;
        struct rb_root rbtree_deleted_refs;
};
 250
/*
 * Record of a directory move/rename that has to be performed later.
 *
 * NOTE(review): presumably linked into send_ctx::pending_dir_moves through
 * 'node' and keyed by 'parent_ino' (that tree is documented as indexed by the
 * parent directory inode number) - confirm against the insert/lookup helpers.
 */
struct pending_dir_move {
        struct rb_node node;
        struct list_head list;
        u64 parent_ino;
        u64 ino;
        u64 gen;
        struct list_head update_refs;
};
 259
/*
 * Record of a directory whose move/rename is being delayed.
 *
 * NOTE(review): presumably an entry of send_ctx::waiting_dir_moves keyed by
 * 'ino' - confirm against the insert/lookup helpers.
 */
struct waiting_dir_move {
        struct rb_node node;
        u64 ino;
        /*
         * There might be some directory that could not be removed because it
         * was waiting for this directory inode to be moved first. Therefore
         * after this directory is moved, we can try to rmdir the ino rmdir_ino.
         */
        u64 rmdir_ino;
        u64 rmdir_gen;
        bool orphanized;
};
 272
/*
 * Record of a directory that is going to be removed later.
 *
 * NOTE(review): presumably an entry of send_ctx::orphan_dirs, which is
 * documented as indexed by the inode number of the directory to be deleted -
 * confirm against the insert/lookup helpers.
 */
struct orphan_dir_info {
        struct rb_node node;
        u64 ino;
        u64 gen;
        u64 last_dir_index_offset;
};
 279
/* One cached name; the name bytes are stored in the trailing flexible array. */
struct name_cache_entry {
        struct list_head list;
        /*
         * radix_tree has only 32bit entries but we need to handle 64bit inums.
         * We use the lower 32bit of the 64bit inum to store it in the tree. If
         * more than one inum would fall into the same entry, we use radix_list
         * to store the additional entries. radix_list is also used to store
         * entries where two entries have the same inum but different
         * generations.
         */
        struct list_head radix_list;
        u64 ino;
        u64 gen;
        u64 parent_ino;
        u64 parent_gen;
        int ret;
        int need_later_update;
        /* Length of 'name' in bytes. */
        int name_len;
        char name[];
};
 300
/*
 * NOTE(review): presumably the "advance" modes used while walking the two trees
 * during a tree comparison - the code using them is not visible here, confirm.
 */
#define ADVANCE                                                 1
#define ADVANCE_ONLY_NEXT                                       -1
 303
/*
 * Possible results of a tree comparison. inconsistent_snapshot_error() reports
 * them as "new", "deleted", "updated" and "unchanged" respectively.
 */
enum btrfs_compare_tree_result {
        BTRFS_COMPARE_TREE_NEW,
        BTRFS_COMPARE_TREE_DELETED,
        BTRFS_COMPARE_TREE_CHANGED,
        BTRFS_COMPARE_TREE_SAME,
};
 310
 311 __cold
 312 static void inconsistent_snapshot_error(struct send_ctx *sctx,
 313                                         enum btrfs_compare_tree_result result,
 314                                         const char *what)
 315 {
 316         const char *result_string;
 317
 318         switch (result) {
 319         case BTRFS_COMPARE_TREE_NEW:
 320                 result_string = "new";
 321                 break;
 322         case BTRFS_COMPARE_TREE_DELETED:
 323                 result_string = "deleted";
 324                 break;
 325         case BTRFS_COMPARE_TREE_CHANGED:
 326                 result_string = "updated";
 327                 break;
 328         case BTRFS_COMPARE_TREE_SAME:
 329                 ASSERT(0);
 330                 result_string = "unchanged";
 331                 break;
 332         default:
 333                 ASSERT(0);
 334                 result_string = "unexpected";
 335         }
 336
 337         btrfs_err(sctx->send_root->fs_info,
 338                   "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
 339                   result_string, what, sctx->cmp_key->objectid,
 340                   sctx->send_root->root_key.objectid,
 341                   (sctx->parent_root ?
 342                    sctx->parent_root->root_key.objectid : 0));
 343 }
 344
 345 __maybe_unused
 346 static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
 347 {
 348         switch (sctx->proto) {
 349         case 1:  return cmd <= BTRFS_SEND_C_MAX_V1;
 350         case 2:  return cmd <= BTRFS_SEND_C_MAX_V2;
 351         case 3:  return cmd <= BTRFS_SEND_C_MAX_V3;
 352         default: return false;
 353         }
 354 }
 355
/* Forward declarations of static helpers defined later in this file. */
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);

static struct waiting_dir_move *
get_waiting_dir_move(struct send_ctx *sctx, u64 ino);

static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);
 362
 363 static int need_send_hole(struct send_ctx *sctx)
 364 {
 365         return (sctx->parent_root && !sctx->cur_inode_new &&
 366                 !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
 367                 S_ISREG(sctx->cur_inode_mode));
 368 }
 369
 370 static void fs_path_reset(struct fs_path *p)
 371 {
 372         if (p->reversed) {
 373                 p->start = p->buf + p->buf_len - 1;
 374                 p->end = p->start;
 375                 *p->start = 0;
 376         } else {
 377                 p->start = p->buf;
 378                 p->end = p->start;
 379                 *p->start = 0;
 380         }
 381 }
 382
 383 static struct fs_path *fs_path_alloc(void)
 384 {
 385         struct fs_path *p;
 386
 387         p = kmalloc(sizeof(*p), GFP_KERNEL);
 388         if (!p)
 389                 return NULL;
 390         p->reversed = 0;
 391         p->buf = p->inline_buf;
 392         p->buf_len = FS_PATH_INLINE_SIZE;
 393         fs_path_reset(p);
 394         return p;
 395 }
 396
 397 static struct fs_path *fs_path_alloc_reversed(void)
 398 {
 399         struct fs_path *p;
 400
 401         p = fs_path_alloc();
 402         if (!p)
 403                 return NULL;
 404         p->reversed = 1;
 405         fs_path_reset(p);
 406         return p;
 407 }
 408
 409 static void fs_path_free(struct fs_path *p)
 410 {
 411         if (!p)
 412                 return;
 413         if (p->buf != p->inline_buf)
 414                 kfree(p->buf);
 415         kfree(p);
 416 }
 417
 418 static int fs_path_len(struct fs_path *p)
 419 {
 420         return p->end - p->start;
 421 }
 422
 423 static int fs_path_ensure_buf(struct fs_path *p, int len)
 424 {
 425         char *tmp_buf;
 426         int path_len;
 427         int old_buf_len;
 428
 429         len++;
 430
 431         if (p->buf_len >= len)
 432                 return 0;
 433
 434         if (len > PATH_MAX) {
 435                 WARN_ON(1);
 436                 return -ENOMEM;
 437         }
 438
 439         path_len = p->end - p->start;
 440         old_buf_len = p->buf_len;
 441
 442         /*
 443          * First time the inline_buf does not suffice
 444          */
 445         if (p->buf == p->inline_buf) {
 446                 tmp_buf = kmalloc(len, GFP_KERNEL);
 447                 if (tmp_buf)
 448                         memcpy(tmp_buf, p->buf, old_buf_len);
 449         } else {
 450                 tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
 451         }
 452         if (!tmp_buf)
 453                 return -ENOMEM;
 454         p->buf = tmp_buf;
 455         /*
 456          * The real size of the buffer is bigger, this will let the fast path
 457          * happen most of the time
 458          */
 459         p->buf_len = ksize(p->buf);
 460
 461         if (p->reversed) {
 462                 tmp_buf = p->buf + old_buf_len - path_len - 1;
 463                 p->end = p->buf + p->buf_len - 1;
 464                 p->start = p->end - path_len;
 465                 memmove(p->start, tmp_buf, path_len + 1);
 466         } else {
 467                 p->start = p->buf;
 468                 p->end = p->start + path_len;
 469         }
 470         return 0;
 471 }
 472
 473 static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
 474                                    char **prepared)
 475 {
 476         int ret;
 477         int new_len;
 478
 479         new_len = p->end - p->start + name_len;
 480         if (p->start != p->end)
 481                 new_len++;
 482         ret = fs_path_ensure_buf(p, new_len);
 483         if (ret < 0)
 484                 goto out;
 485
 486         if (p->reversed) {
 487                 if (p->start != p->end)
 488                         *--p->start = '/';
 489                 p->start -= name_len;
 490                 *prepared = p->start;
 491         } else {
 492                 if (p->start != p->end)
 493                         *p->end++ = '/';
 494                 *prepared = p->end;
 495                 p->end += name_len;
 496                 *p->end = 0;
 497         }
 498
 499 out:
 500         return ret;
 501 }
 502
 503 static int fs_path_add(struct fs_path *p, const char *name, int name_len)
 504 {
 505         int ret;
 506         char *prepared;
 507
 508         ret = fs_path_prepare_for_add(p, name_len, &prepared);
 509         if (ret < 0)
 510                 goto out;
 511         memcpy(prepared, name, name_len);
 512
 513 out:
 514         return ret;
 515 }
 516
 517 static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
 518 {
 519         int ret;
 520         char *prepared;
 521
 522         ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
 523         if (ret < 0)
 524                 goto out;
 525         memcpy(prepared, p2->start, p2->end - p2->start);
 526
 527 out:
 528         return ret;
 529 }
 530
 531 static int fs_path_add_from_extent_buffer(struct fs_path *p,
 532                                           struct extent_buffer *eb,
 533                                           unsigned long off, int len)
 534 {
 535         int ret;
 536         char *prepared;
 537
 538         ret = fs_path_prepare_for_add(p, len, &prepared);
 539         if (ret < 0)
 540                 goto out;
 541
 542         read_extent_buffer(eb, prepared, off, len);
 543
 544 out:
 545         return ret;
 546 }
 547
 548 static int fs_path_copy(struct fs_path *p, struct fs_path *from)
 549 {
 550         p->reversed = from->reversed;
 551         fs_path_reset(p);
 552
 553         return fs_path_add_path(p, from);
 554 }
 555
 556 static void fs_path_unreverse(struct fs_path *p)
 557 {
 558         char *tmp;
 559         int len;
 560
 561         if (!p->reversed)
 562                 return;
 563
 564         tmp = p->start;
 565         len = p->end - p->start;
 566         p->start = p->buf;
 567         p->end = p->start + len;
 568         memmove(p->start, tmp, len + 1);
 569         p->reversed = 0;
 570 }
 571
 572 static struct btrfs_path *alloc_path_for_send(void)
 573 {
 574         struct btrfs_path *path;
 575
 576         path = btrfs_alloc_path();
 577         if (!path)
 578                 return NULL;
 579         path->search_commit_root = 1;
 580         path->skip_locking = 1;
 581         path->need_commit_sem = 1;
 582         return path;
 583 }
 584
 585 static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
 586 {
 587         int ret;
 588         u32 pos = 0;
 589
 590         while (pos < len) {
 591                 ret = kernel_write(filp, buf + pos, len - pos, off);
 592                 if (ret < 0)
 593                         return ret;
 594                 if (ret == 0)
 595                         return -EIO;
 596                 pos += ret;
 597         }
 598
 599         return 0;
 600 }
 601
 602 static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
 603 {
 604         struct btrfs_tlv_header *hdr;
 605         int total_len = sizeof(*hdr) + len;
 606         int left = sctx->send_max_size - sctx->send_size;
 607
 608         if (WARN_ON_ONCE(sctx->put_data))
 609                 return -EINVAL;
 610
 611         if (unlikely(left < total_len))
 612                 return -EOVERFLOW;
 613
 614         hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
 615         put_unaligned_le16(attr, &hdr->tlv_type);
 616         put_unaligned_le16(len, &hdr->tlv_len);
 617         memcpy(hdr + 1, data, len);
 618         sctx->send_size += total_len;
 619
 620         return 0;
 621 }
 622
 623 #define TLV_PUT_DEFINE_INT(bits) \
 624         static int tlv_put_u##bits(struct send_ctx *sctx,               \
 625                         u##bits attr, u##bits value)                    \
 626         {                                                               \
 627                 __le##bits __tmp = cpu_to_le##bits(value);              \
 628                 return tlv_put(sctx, attr, &__tmp, sizeof(__tmp));      \
 629         }
 630
 631 TLV_PUT_DEFINE_INT(8)
 632 TLV_PUT_DEFINE_INT(32)
 633 TLV_PUT_DEFINE_INT(64)
 634
 635 static int tlv_put_string(struct send_ctx *sctx, u16 attr,
 636                           const char *str, int len)
 637 {
 638         if (len == -1)
 639                 len = strlen(str);
 640         return tlv_put(sctx, attr, str, len);
 641 }
 642
 643 static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
 644                         const u8 *uuid)
 645 {
 646         return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
 647 }
 648
 649 static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
 650                                   struct extent_buffer *eb,
 651                                   struct btrfs_timespec *ts)
 652 {
 653         struct btrfs_timespec bts;
 654         read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
 655         return tlv_put(sctx, attr, &bts, sizeof(bts));
 656 }
 657
 658
 659 #define TLV_PUT(sctx, attrtype, data, attrlen) \
 660         do { \
 661                 ret = tlv_put(sctx, attrtype, data, attrlen); \
 662                 if (ret < 0) \
 663                         goto tlv_put_failure; \
 664         } while (0)
 665
 666 #define TLV_PUT_INT(sctx, attrtype, bits, value) \
 667         do { \
 668                 ret = tlv_put_u##bits(sctx, attrtype, value); \
 669                 if (ret < 0) \
 670                         goto tlv_put_failure; \
 671         } while (0)
 672
 673 #define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
 674 #define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
 675 #define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
 676 #define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
 677 #define TLV_PUT_STRING(sctx, attrtype, str, len) \
 678         do { \
 679                 ret = tlv_put_string(sctx, attrtype, str, len); \
 680                 if (ret < 0) \
 681                         goto tlv_put_failure; \
 682         } while (0)
 683 #define TLV_PUT_PATH(sctx, attrtype, p) \
 684         do { \
 685                 ret = tlv_put_string(sctx, attrtype, p->start, \
 686                         p->end - p->start); \
 687                 if (ret < 0) \
 688                         goto tlv_put_failure; \
 689         } while(0)
 690 #define TLV_PUT_UUID(sctx, attrtype, uuid) \
 691         do { \
 692                 ret = tlv_put_uuid(sctx, attrtype, uuid); \
 693                 if (ret < 0) \
 694                         goto tlv_put_failure; \
 695         } while (0)
 696 #define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
 697         do { \
 698                 ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
 699                 if (ret < 0) \
 700                         goto tlv_put_failure; \
 701         } while (0)
 702
 703 static int send_header(struct send_ctx *sctx)
 704 {
 705         struct btrfs_stream_header hdr;
 706
 707         strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
 708         hdr.version = cpu_to_le32(sctx->proto);
 709         return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
 710                                         &sctx->send_off);
 711 }
 712
 713 /*
 714  * For each command/item we want to send to userspace, we call this function.
 715  */
 716 static int begin_cmd(struct send_ctx *sctx, int cmd)
 717 {
 718         struct btrfs_cmd_header *hdr;
 719
 720         if (WARN_ON(!sctx->send_buf))
 721                 return -EINVAL;
 722
 723         BUG_ON(sctx->send_size);
 724
 725         sctx->send_size += sizeof(*hdr);
 726         hdr = (struct btrfs_cmd_header *)sctx->send_buf;
 727         put_unaligned_le16(cmd, &hdr->cmd);
 728
 729         return 0;
 730 }
 731
 732 static int send_cmd(struct send_ctx *sctx)
 733 {
 734         int ret;
 735         struct btrfs_cmd_header *hdr;
 736         u32 crc;
 737
 738         hdr = (struct btrfs_cmd_header *)sctx->send_buf;
 739         put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
 740         put_unaligned_le32(0, &hdr->crc);
 741
 742         crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
 743         put_unaligned_le32(crc, &hdr->crc);
 744
 745         ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
 746                                         &sctx->send_off);
 747
 748         sctx->send_size = 0;
 749         sctx->put_data = false;
 750
 751         return ret;
 752 }
 753
 754 /*
 755  * Sends a move instruction to user space
 756  */
 757 static int send_rename(struct send_ctx *sctx,
 758                      struct fs_path *from, struct fs_path *to)
 759 {
 760         struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
 761         int ret;
 762
 763         btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start);
 764
 765         ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
 766         if (ret < 0)
 767                 goto out;
 768
 769         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
 770         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
 771
 772         ret = send_cmd(sctx);
 773
 774 tlv_put_failure:
 775 out:
 776         return ret;
 777 }
 778
 779 /*
 780  * Sends a link instruction to user space
 781  */
 782 static int send_link(struct send_ctx *sctx,
 783                      struct fs_path *path, struct fs_path *lnk)
 784 {
 785         struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
 786         int ret;
 787
 788         btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start);
 789
 790         ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
 791         if (ret < 0)
 792                 goto out;
 793
 794         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
 795         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
 796
 797         ret = send_cmd(sctx);
 798
 799 tlv_put_failure:
 800 out:
 801         return ret;
 802 }
 803
 804 /*
 805  * Sends an unlink instruction to user space
 806  */
 807 static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
 808 {
 809         struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
 810         int ret;
 811
 812         btrfs_debug(fs_info, "send_unlink %s", path->start);
 813
 814         ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
 815         if (ret < 0)
 816                 goto out;
 817
 818         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
 819
 820         ret = send_cmd(sctx);
 821
 822 tlv_put_failure:
 823 out:
 824         return ret;
 825 }
 826
 827 /*
 828  * Sends a rmdir instruction to user space
 829  */
 830 static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
 831 {
 832         struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
 833         int ret;
 834
 835         btrfs_debug(fs_info, "send_rmdir %s", path->start);
 836
 837         ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
 838         if (ret < 0)
 839                 goto out;
 840
 841         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
 842
 843         ret = send_cmd(sctx);
 844
 845 tlv_put_failure:
 846 out:
 847         return ret;
 848 }
 849
/*
 * Fields of an inode item as returned by get_inode_info(). Every member holds
 * the value of the inode item field of the same name, except 'fileattr' which
 * holds the unchanged u64 'flags' field of the inode item.
 */
struct btrfs_inode_info {
        u64 size;
        u64 gen;
        u64 mode;
        u64 uid;
        u64 gid;
        u64 rdev;
        u64 fileattr;
        u64 nlink;
};
 860
 861 /*
 862  * Helper function to retrieve some fields from an inode item.
 863  */
 864 static int get_inode_info(struct btrfs_root *root, u64 ino,
 865                           struct btrfs_inode_info *info)
 866 {
 867         int ret;
 868         struct btrfs_path *path;
 869         struct btrfs_inode_item *ii;
 870         struct btrfs_key key;
 871
 872         path = alloc_path_for_send();
 873         if (!path)
 874                 return -ENOMEM;
 875
 876         key.objectid = ino;
 877         key.type = BTRFS_INODE_ITEM_KEY;
 878         key.offset = 0;
 879         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 880         if (ret) {
 881                 if (ret > 0)
 882                         ret = -ENOENT;
 883                 goto out;
 884         }
 885
 886         if (!info)
 887                 goto out;
 888
 889         ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
 890                         struct btrfs_inode_item);
 891         info->size = btrfs_inode_size(path->nodes[0], ii);
 892         info->gen = btrfs_inode_generation(path->nodes[0], ii);
 893         info->mode = btrfs_inode_mode(path->nodes[0], ii);
 894         info->uid = btrfs_inode_uid(path->nodes[0], ii);
 895         info->gid = btrfs_inode_gid(path->nodes[0], ii);
 896         info->rdev = btrfs_inode_rdev(path->nodes[0], ii);
 897         info->nlink = btrfs_inode_nlink(path->nodes[0], ii);
 898         /*
 899          * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
 900          * otherwise logically split to 32/32 parts.
 901          */
 902         info->fileattr = btrfs_inode_flags(path->nodes[0], ii);
 903
 904 out:
 905         btrfs_free_path(path);
 906         return ret;
 907 }
 908
 909 static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
 910 {
 911         int ret;
 912         struct btrfs_inode_info info;
 913
 914         if (!gen)
 915                 return -EPERM;
 916
 917         ret = get_inode_info(root, ino, &info);
 918         if (!ret)
 919                 *gen = info.gen;
 920         return ret;
 921 }
 922
/*
 * Callback type used by iterate_inode_ref(). A non zero return value stops the
 * iteration: negative for an error, 1 to simply stop.
 *
 * NOTE(review): presumably @num is the position of the entry, @dir the inode
 * number of the parent directory, @index the directory index and @p the name
 * or path of the entry - the call site is not visible here, confirm.
 */
typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
                                   struct fs_path *p,
                                   void *ctx);
 926
/*
 * Helper function to iterate the entries in ONE btrfs_inode_ref or
 * btrfs_inode_extref.
 * The iterate callback may return a non zero value to stop iteration. This can
 * be a negative value for error codes or 1 to simply stop it.
 *
 * path must point to the INODE_REF or INODE_EXTREF when called.
 *
 * @found_key: key of the item @path points to; its type selects between the
 *             INODE_REF and INODE_EXTREF layouts.
 * @resolve:   if non zero, each name is expanded with btrfs_ref_to_path()
 *             before being passed to the callback, otherwise only the name
 *             stored in the item is passed.
 *
 * Returns 0 when all entries were iterated, the non zero value returned by
 * the callback if it stopped the iteration, or a negative errno (-ENOMEM on
 * allocation failure, -EINVAL if the path still does not fit after growing
 * the buffer, or an error from the helpers called).
 */
static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
                             struct btrfs_key *found_key, int resolve,
                             iterate_inode_ref_t iterate, void *ctx)
{
        struct extent_buffer *eb = path->nodes[0];
        struct btrfs_inode_ref *iref;
        struct btrfs_inode_extref *extref;
        struct btrfs_path *tmp_path;
        struct fs_path *p;
        u32 cur = 0;
        u32 total;
        int slot = path->slots[0];
        u32 name_len;
        char *start;
        int ret = 0;
        int num = 0;
        int index;
        u64 dir;
        unsigned long name_off;
        unsigned long elem_size;
        unsigned long ptr;

        /* Scratch path, reset and refilled for every entry of the item. */
        p = fs_path_alloc_reversed();
        if (!p)
                return -ENOMEM;

        tmp_path = alloc_path_for_send();
        if (!tmp_path) {
                fs_path_free(p);
                return -ENOMEM;
        }


        /*
         * ptr is the offset of the item data inside the leaf, total its size
         * and elem_size the fixed size header preceding each name.
         */
        if (found_key->type == BTRFS_INODE_REF_KEY) {
                ptr = (unsigned long)btrfs_item_ptr(eb, slot,
                                                    struct btrfs_inode_ref);
                total = btrfs_item_size(eb, slot);
                elem_size = sizeof(*iref);
        } else {
                ptr = btrfs_item_ptr_offset(eb, slot);
                total = btrfs_item_size(eb, slot);
                elem_size = sizeof(*extref);
        }

        while (cur < total) {
                fs_path_reset(p);

                if (found_key->type == BTRFS_INODE_REF_KEY) {
                        iref = (struct btrfs_inode_ref *)(ptr + cur);
                        name_len = btrfs_inode_ref_name_len(eb, iref);
                        /* The name directly follows the ref header. */
                        name_off = (unsigned long)(iref + 1);
                        index = btrfs_inode_ref_index(eb, iref);
                        /* For INODE_REF the parent dir is the key offset. */
                        dir = found_key->offset;
                } else {
                        extref = (struct btrfs_inode_extref *)(ptr + cur);
                        name_len = btrfs_inode_extref_name_len(eb, extref);
                        name_off = (unsigned long)&extref->name;
                        index = btrfs_inode_extref_index(eb, extref);
                        /* For INODE_EXTREF the parent dir is in the item. */
                        dir = btrfs_inode_extref_parent(eb, extref);
                }

                if (resolve) {
                        start = btrfs_ref_to_path(root, tmp_path, name_len,
                                                  name_off, eb, dir,
                                                  p->buf, p->buf_len);
                        if (IS_ERR(start)) {
                                ret = PTR_ERR(start);
                                goto out;
                        }
                        /*
                         * A returned start before the buffer means the path
                         * did not fit; the distance tells how much is missing.
                         */
                        if (start < p->buf) {
                                /* overflow , try again with larger buffer */
                                ret = fs_path_ensure_buf(p,
                                                p->buf_len + p->buf - start);
                                if (ret < 0)
                                        goto out;
                                start = btrfs_ref_to_path(root, tmp_path,
                                                          name_len, name_off,
                                                          eb, dir,
                                                          p->buf, p->buf_len);
                                if (IS_ERR(start)) {
                                        ret = PTR_ERR(start);
                                        goto out;
                                }
                                /* Still too small after growing: give up. */
                                if (unlikely(start < p->buf)) {
                                        btrfs_err(root->fs_info,
                        "send: path ref buffer underflow for key (%llu %u %llu)",
                                                  found_key->objectid,
                                                  found_key->type,
                                                  found_key->offset);
                                        ret = -EINVAL;
                                        goto out;
                                }
                        }
                        p->start = start;
                } else {
                        ret = fs_path_add_from_extent_buffer(p, eb, name_off,
                                                             name_len);
                        if (ret < 0)
                                goto out;
                }

                /* Advance to the next entry before handing this one out. */
                cur += elem_size + name_len;
                ret = iterate(num, dir, index, p, ctx);
                if (ret)
                        goto out;
                num++;
        }

out:
        btrfs_free_path(tmp_path);
        fs_path_free(p);
        return ret;
}
1048
/*
 * Callback invoked by iterate_dir_item() once per entry stored in a dir item.
 *
 * @num:      zero based count of the entries already handed to the callback
 * @di_key:   the key stored inside the dir item entry
 * @name:     start of the entry name (not NUL terminated by the iterator)
 * @name_len: length of @name in bytes
 * @data:     start of the entry data, located right after the name
 * @data_len: length of @data in bytes
 * @ctx:      opaque pointer passed through from the iterate_dir_item() caller
 *
 * A negative return value aborts the iteration with that error, a positive
 * one stops the iteration without an error.
 */
typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
                                  const char *name, int name_len,
                                  const char *data, int data_len,
                                  void *ctx);
1053
/*
 * Helper function to iterate the entries in ONE btrfs_dir_item.
 * The iterate callback may return a non zero value to stop iteration. This can
 * be a negative value for error codes or 1 to simply stop it.
 *
 * path must point to the dir item when called.
 *
 * Returns 0 when all entries were iterated or the callback stopped the
 * iteration with a positive value, otherwise a negative errno: the callback's
 * error, -ENAMETOOLONG or -E2BIG for entries exceeding the size limits
 * checked below, or -ENOMEM.
 */
static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
                            iterate_dir_item_t iterate, void *ctx)
{
        int ret = 0;
        struct extent_buffer *eb;
        struct btrfs_dir_item *di;
        struct btrfs_key di_key;
        char *buf = NULL;
        int buf_len;
        u32 name_len;
        u32 data_len;
        u32 cur;
        u32 len;
        u32 total;
        int slot;
        int num;

        /*
         * Start with a small buffer (PATH_MAX bytes). If later we end up
         * needing more space, which can happen for xattrs on a fs with a leaf
         * size greater than the page size, attempt to increase the buffer.
         * Typically xattr values are small.
         */
        buf_len = PATH_MAX;
        buf = kmalloc(buf_len, GFP_KERNEL);
        if (!buf) {
                ret = -ENOMEM;
                goto out;
        }

        eb = path->nodes[0];
        slot = path->slots[0];
        di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
        cur = 0;
        len = 0;
        total = btrfs_item_size(eb, slot);

        num = 0;
        while (cur < total) {
                name_len = btrfs_dir_name_len(eb, di);
                data_len = btrfs_dir_data_len(eb, di);
                btrfs_dir_item_key_to_cpu(eb, di, &di_key);

                /* Reject entries larger than what their type allows. */
                if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) {
                        if (name_len > XATTR_NAME_MAX) {
                                ret = -ENAMETOOLONG;
                                goto out;
                        }
                        if (name_len + data_len >
                                        BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
                                ret = -E2BIG;
                                goto out;
                        }
                } else {
                        /*
                         * Path too long
                         */
                        if (name_len + data_len > PATH_MAX) {
                                ret = -ENAMETOOLONG;
                                goto out;
                        }
                }

                /*
                 * Grow the buffer if this entry does not fit. The old contents
                 * are not needed, so a vmalloc'ed buffer is simply dropped and
                 * reallocated, while a kmalloc'ed one is first tried with
                 * krealloc() (quietly) before falling back to kvmalloc().
                 */
                if (name_len + data_len > buf_len) {
                        buf_len = name_len + data_len;
                        if (is_vmalloc_addr(buf)) {
                                vfree(buf);
                                buf = NULL;
                        } else {
                                char *tmp = krealloc(buf, buf_len,
                                                GFP_KERNEL | __GFP_NOWARN);

                                /* krealloc() keeps the old buffer on failure. */
                                if (!tmp)
                                        kfree(buf);
                                buf = tmp;
                        }
                        if (!buf) {
                                buf = kvmalloc(buf_len, GFP_KERNEL);
                                if (!buf) {
                                        ret = -ENOMEM;
                                        goto out;
                                }
                        }
                }

                /* Name and data are stored back to back after the header. */
                read_extent_buffer(eb, buf, (unsigned long)(di + 1),
                                name_len + data_len);

                /* Advance to the next entry before handing this one out. */
                len = sizeof(*di) + name_len + data_len;
                di = (struct btrfs_dir_item *)((char *)di + len);
                cur += len;

                ret = iterate(num, &di_key, buf, name_len, buf + name_len,
                              data_len, ctx);
                if (ret < 0)
                        goto out;
                /* A positive value only means "stop", it is not an error. */
                if (ret) {
                        ret = 0;
                        goto out;
                }

                num++;
        }

out:
        /* buf may come from kmalloc() or vmalloc(), kvfree() handles both. */
        kvfree(buf);
        return ret;
}
1169
1170 static int __copy_first_ref(int num, u64 dir, int index,
1171                             struct fs_path *p, void *ctx)
1172 {
1173         int ret;
1174         struct fs_path *pt = ctx;
1175
1176         ret = fs_path_copy(pt, p);
1177         if (ret < 0)
1178                 return ret;
1179
1180         /* we want the first only */
1181         return 1;
1182 }
1183
1184 /*
1185  * Retrieve the first path of an inode. If an inode has more then one
1186  * ref/hardlink, this is ignored.
1187  */
1188 static int get_inode_path(struct btrfs_root *root,
1189                           u64 ino, struct fs_path *path)
1190 {
1191         int ret;
1192         struct btrfs_key key, found_key;
1193         struct btrfs_path *p;
1194
1195         p = alloc_path_for_send();
1196         if (!p)
1197                 return -ENOMEM;
1198
1199         fs_path_reset(path);
1200
1201         key.objectid = ino;
1202         key.type = BTRFS_INODE_REF_KEY;
1203         key.offset = 0;
1204
1205         ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
1206         if (ret < 0)
1207                 goto out;
1208         if (ret) {
1209                 ret = 1;
1210                 goto out;
1211         }
1212         btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
1213         if (found_key.objectid != ino ||
1214             (found_key.type != BTRFS_INODE_REF_KEY &&
1215              found_key.type != BTRFS_INODE_EXTREF_KEY)) {
1216                 ret = -ENOENT;
1217                 goto out;
1218         }
1219
1220         ret = iterate_inode_ref(root, p, &found_key, 1,
1221                                 __copy_first_ref, path);
1222         if (ret < 0)
1223                 goto out;
1224         ret = 0;
1225
1226 out:
1227         btrfs_free_path(p);
1228         return ret;
1229 }
1230
/*
 * State shared between find_extent_clone() and its backref iteration callback
 * __iterate_backrefs() while looking for clone sources of one file extent.
 */
struct backref_ctx {
        /* The send context the search is done for. */
        struct send_ctx *sctx;

        /* number of total found references */
        u64 found;

        /*
         * used for clones found in send_root. clones found behind cur_objectid
         * and cur_offset are not considered as allowed clones.
         */
        u64 cur_objectid;
        u64 cur_offset;

        /* may be truncated in case it's the last extent in a file */
        u64 extent_len;

        /*
         * Just to check for bugs in backref resolving: set once the backref
         * for cur_objectid/cur_offset in the send root was seen.
         */
        int found_itself;
};
1250
1251 static int __clone_root_cmp_bsearch(const void *key, const void *elt)
1252 {
1253         u64 root = (u64)(uintptr_t)key;
1254         const struct clone_root *cr = elt;
1255
1256         if (root < cr->root->root_key.objectid)
1257                 return -1;
1258         if (root > cr->root->root_key.objectid)
1259                 return 1;
1260         return 0;
1261 }
1262
1263 static int __clone_root_cmp_sort(const void *e1, const void *e2)
1264 {
1265         const struct clone_root *cr1 = e1;
1266         const struct clone_root *cr2 = e2;
1267
1268         if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
1269                 return -1;
1270         if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
1271                 return 1;
1272         return 0;
1273 }
1274
1275 /*
1276  * Called for every backref that is found for the current extent.
1277  * Results are collected in sctx->clone_roots->ino/offset/found_refs
1278  */
1279 static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1280 {
1281         struct backref_ctx *bctx = ctx_;
1282         struct clone_root *found;
1283
1284         /* First check if the root is in the list of accepted clone sources */
1285         found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
1286                         bctx->sctx->clone_roots_cnt,
1287                         sizeof(struct clone_root),
1288                         __clone_root_cmp_bsearch);
1289         if (!found)
1290                 return 0;
1291
1292         if (found->root == bctx->sctx->send_root &&
1293             ino == bctx->cur_objectid &&
1294             offset == bctx->cur_offset) {
1295                 bctx->found_itself = 1;
1296         }
1297
1298         /*
1299          * Make sure we don't consider clones from send_root that are
1300          * behind the current inode/offset.
1301          */
1302         if (found->root == bctx->sctx->send_root) {
1303                 /*
1304                  * If the source inode was not yet processed we can't issue a
1305                  * clone operation, as the source extent does not exist yet at
1306                  * the destination of the stream.
1307                  */
1308                 if (ino > bctx->cur_objectid)
1309                         return 0;
1310                 /*
1311                  * We clone from the inode currently being sent as long as the
1312                  * source extent is already processed, otherwise we could try
1313                  * to clone from an extent that does not exist yet at the
1314                  * destination of the stream.
1315                  */
1316                 if (ino == bctx->cur_objectid &&
1317                     offset + bctx->extent_len >
1318                     bctx->sctx->cur_inode_next_write_offset)
1319                         return 0;
1320         }
1321
1322         bctx->found++;
1323         found->found_refs++;
1324         if (ino < found->ino) {
1325                 found->ino = ino;
1326                 found->offset = offset;
1327         } else if (found->ino == ino) {
1328                 /*
1329                  * same extent found more then once in the same file.
1330                  */
1331                 if (found->offset > offset + bctx->extent_len)
1332                         found->offset = offset;
1333         }
1334
1335         return 0;
1336 }
1337
/*
 * Given an inode, offset and extent item, it finds a good clone for a clone
 * instruction. Returns -ENOENT when none could be found. The function makes
 * sure that the returned clone is usable at the point where sending is at the
 * moment. This means, that no clones are accepted which lie behind the current
 * inode+offset.
 *
 * path must point to the extent item when called.
 *
 * @ino:         inode the file extent item belongs to
 * @data_offset: file offset of the file extent item
 * @ino_size:    size of the inode, used to clamp the extent length
 * @found:       set to the chosen clone root when 0 is returned and a clone
 *               source was picked
 *
 * Returns 0 on success or a negative errno. Note that 0 is also returned,
 * without writing to @found, when @data_offset is at or beyond @ino_size.
 */
static int find_extent_clone(struct send_ctx *sctx,
                             struct btrfs_path *path,
                             u64 ino, u64 data_offset,
                             u64 ino_size,
                             struct clone_root **found)
{
        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
        int ret;
        int extent_type;
        u64 logical;
        u64 disk_byte;
        u64 num_bytes;
        u64 extent_item_pos;
        u64 flags = 0;
        struct btrfs_file_extent_item *fi;
        struct extent_buffer *eb = path->nodes[0];
        struct backref_ctx backref_ctx = {0};
        struct clone_root *cur_clone_root;
        struct btrfs_key found_key;
        struct btrfs_path *tmp_path;
        struct btrfs_extent_item *ei;
        int compressed;
        u32 i;

        tmp_path = alloc_path_for_send();
        if (!tmp_path)
                return -ENOMEM;

        /* We only use this path under the commit sem */
        tmp_path->need_commit_sem = 0;

        if (data_offset >= ino_size) {
                /*
                 * There may be extents that lie behind the file's size.
                 * I at least had this in combination with snapshotting while
                 * writing large files.
                 */
                ret = 0;
                goto out;
        }

        fi = btrfs_item_ptr(eb, path->slots[0],
                        struct btrfs_file_extent_item);
        extent_type = btrfs_file_extent_type(eb, fi);
        /* Inline extents are never reported as clone sources. */
        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                ret = -ENOENT;
                goto out;
        }
        compressed = btrfs_file_extent_compression(eb, fi);

        num_bytes = btrfs_file_extent_num_bytes(eb, fi);
        disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
        /* A disk_bytenr of 0 has no extent item to look up backrefs for. */
        if (disk_byte == 0) {
                ret = -ENOENT;
                goto out;
        }
        logical = disk_byte + btrfs_file_extent_offset(eb, fi);

        down_read(&fs_info->commit_root_sem);
        ret = extent_from_logical(fs_info, disk_byte, tmp_path,
                                  &found_key, &flags);
        up_read(&fs_info->commit_root_sem);

        if (ret < 0)
                goto out;
        /* A file extent must never point to a tree block. */
        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                ret = -EIO;
                goto out;
        }

        ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
                            struct btrfs_extent_item);
        /*
         * Backreference walking (iterate_extent_inodes() below) is currently
         * too expensive when an extent has a large number of references, both
         * in time spent and used memory. So for now just fallback to write
         * operations instead of clone operations when an extent has more than
         * a certain amount of references.
         */
        if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
                ret = -ENOENT;
                goto out;
        }
        btrfs_release_path(tmp_path);

        /*
         * Setup the clone roots.
         */
        for (i = 0; i < sctx->clone_roots_cnt; i++) {
                cur_clone_root = sctx->clone_roots + i;
                /* (u64)-1 so that any inode found compares as lower. */
                cur_clone_root->ino = (u64)-1;
                cur_clone_root->offset = 0;
                cur_clone_root->found_refs = 0;
        }

        backref_ctx.sctx = sctx;
        backref_ctx.found = 0;
        backref_ctx.cur_objectid = ino;
        backref_ctx.cur_offset = data_offset;
        backref_ctx.found_itself = 0;
        backref_ctx.extent_len = num_bytes;

        /*
         * The last extent of a file may be too large due to page alignment.
         * We need to adjust extent_len in this case so that the checks in
         * __iterate_backrefs work.
         */
        if (data_offset + num_bytes >= ino_size)
                backref_ctx.extent_len = ino_size - data_offset;

        /*
         * Now collect all backrefs.
         */
        if (compressed == BTRFS_COMPRESS_NONE)
                extent_item_pos = logical - found_key.objectid;
        else
                extent_item_pos = 0;
        ret = iterate_extent_inodes(fs_info, found_key.objectid,
                                    extent_item_pos, 1, __iterate_backrefs,
                                    &backref_ctx, false);

        if (ret < 0)
                goto out;

        down_read(&fs_info->commit_root_sem);
        if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
                /*
                 * A transaction commit for a transaction in which block group
                 * relocation was done just happened.
                 * The disk_bytenr of the file extent item we processed is
                 * possibly stale, referring to the extent's location before
                 * relocation. So act as if we haven't found any clone sources
                 * and fallback to write commands, which will read the correct
                 * data from the new extent location. Otherwise we will fail
                 * below because we haven't found our own back reference or we
                 * could be getting incorrect sources in case the old extent
                 * was already reallocated after the relocation.
                 */
                up_read(&fs_info->commit_root_sem);
                ret = -ENOENT;
                goto out;
        }
        up_read(&fs_info->commit_root_sem);

        if (!backref_ctx.found_itself) {
                /* found a bug in backref code? */
                ret = -EIO;
                btrfs_err(fs_info,
                          "did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu",
                          ino, data_offset, disk_byte, found_key.objectid);
                goto out;
        }

        btrfs_debug(fs_info,
                    "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
                    data_offset, ino, num_bytes, logical);

        if (!backref_ctx.found)
                btrfs_debug(fs_info, "no clones found");

        /*
         * Pick the first clone root that got references, unless the send root
         * got some too, in which case it wins.
         */
        cur_clone_root = NULL;
        for (i = 0; i < sctx->clone_roots_cnt; i++) {
                if (sctx->clone_roots[i].found_refs) {
                        if (!cur_clone_root)
                                cur_clone_root = sctx->clone_roots + i;
                        else if (sctx->clone_roots[i].root == sctx->send_root)
                                /* prefer clones from send_root over others */
                                cur_clone_root = sctx->clone_roots + i;
                }

        }

        if (cur_clone_root) {
                *found = cur_clone_root;
                ret = 0;
        } else {
                ret = -ENOENT;
        }

out:
        btrfs_free_path(tmp_path);
        return ret;
}
1530
1531 static int read_symlink(struct btrfs_root *root,
1532                         u64 ino,
1533                         struct fs_path *dest)
1534 {
1535         int ret;
1536         struct btrfs_path *path;
1537         struct btrfs_key key;
1538         struct btrfs_file_extent_item *ei;
1539         u8 type;
1540         u8 compression;
1541         unsigned long off;
1542         int len;
1543
1544         path = alloc_path_for_send();
1545         if (!path)
1546                 return -ENOMEM;
1547
1548         key.objectid = ino;
1549         key.type = BTRFS_EXTENT_DATA_KEY;
1550         key.offset = 0;
1551         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1552         if (ret < 0)
1553                 goto out;
1554         if (ret) {
1555                 /*
1556                  * An empty symlink inode. Can happen in rare error paths when
1557                  * creating a symlink (transaction committed before the inode
1558                  * eviction handler removed the symlink inode items and a crash
1559                  * happened in between or the subvol was snapshoted in between).
1560                  * Print an informative message to dmesg/syslog so that the user
1561                  * can delete the symlink.
1562                  */
1563                 btrfs_err(root->fs_info,
1564                           "Found empty symlink inode %llu at root %llu",
1565                           ino, root->root_key.objectid);
1566                 ret = -EIO;
1567                 goto out;
1568         }
1569
1570         ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
1571                         struct btrfs_file_extent_item);
1572         type = btrfs_file_extent_type(path->nodes[0], ei);
1573         compression = btrfs_file_extent_compression(path->nodes[0], ei);
1574         BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
1575         BUG_ON(compression);
1576
1577         off = btrfs_file_extent_inline_start(ei);
1578         len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
1579
1580         ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
1581
1582 out:
1583         btrfs_free_path(path);
1584         return ret;
1585 }
1586
1587 /*
1588  * Helper function to generate a file name that is unique in the root of
1589  * send_root and parent_root. This is used to generate names for orphan inodes.
1590  */
1591 static int gen_unique_name(struct send_ctx *sctx,
1592                            u64 ino, u64 gen,
1593                            struct fs_path *dest)
1594 {
1595         int ret = 0;
1596         struct btrfs_path *path;
1597         struct btrfs_dir_item *di;
1598         char tmp[64];
1599         int len;
1600         u64 idx = 0;
1601
1602         path = alloc_path_for_send();
1603         if (!path)
1604                 return -ENOMEM;
1605
1606         while (1) {
1607                 struct fscrypt_str tmp_name;
1608
1609                 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
1610                                 ino, gen, idx);
1611                 ASSERT(len < sizeof(tmp));
1612                 tmp_name.name = tmp;
1613                 tmp_name.len = strlen(tmp);
1614
1615                 di = btrfs_lookup_dir_item(NULL, sctx->send_root,
1616                                 path, BTRFS_FIRST_FREE_OBJECTID,
1617                                 &tmp_name, 0);
1618                 btrfs_release_path(path);
1619                 if (IS_ERR(di)) {
1620                         ret = PTR_ERR(di);
1621                         goto out;
1622                 }
1623                 if (di) {
1624                         /* not unique, try again */
1625                         idx++;
1626                         continue;
1627                 }
1628
1629                 if (!sctx->parent_root) {
1630                         /* unique */
1631                         ret = 0;
1632                         break;
1633                 }
1634
1635                 di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
1636                                 path, BTRFS_FIRST_FREE_OBJECTID,
1637                                 &tmp_name, 0);
1638                 btrfs_release_path(path);
1639                 if (IS_ERR(di)) {
1640                         ret = PTR_ERR(di);
1641                         goto out;
1642                 }
1643                 if (di) {
1644                         /* not unique, try again */
1645                         idx++;
1646                         continue;
1647                 }
1648                 /* unique */
1649                 break;
1650         }
1651
1652         ret = fs_path_add(dest, tmp, strlen(tmp));
1653
1654 out:
1655         btrfs_free_path(path);
1656         return ret;
1657 }
1658
/*
 * State of an inode (identified by number and generation) relative to the
 * progress of the send operation, as computed by get_cur_inode_state().
 *
 * "create" states are reported when the inode matches the generation found in
 * the send root only, "delete" states when it matches the one found in the
 * parent root only. The "did" variants are used when the inode number is below
 * sctx->send_progress, the "will" variants otherwise.
 */
enum inode_state {
        /* Same generation in both the send root and the parent root. */
        inode_state_no_change,
        inode_state_will_create,
        inode_state_did_create,
        inode_state_will_delete,
        inode_state_did_delete,
};
1666
1667 static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
1668 {
1669         int ret;
1670         int left_ret;
1671         int right_ret;
1672         u64 left_gen;
1673         u64 right_gen = 0;
1674         struct btrfs_inode_info info;
1675
1676         ret = get_inode_info(sctx->send_root, ino, &info);
1677         if (ret < 0 && ret != -ENOENT)
1678                 goto out;
1679         left_ret = (info.nlink == 0) ? -ENOENT : ret;
1680         left_gen = info.gen;
1681
1682         if (!sctx->parent_root) {
1683                 right_ret = -ENOENT;
1684         } else {
1685                 ret = get_inode_info(sctx->parent_root, ino, &info);
1686                 if (ret < 0 && ret != -ENOENT)
1687                         goto out;
1688                 right_ret = (info.nlink == 0) ? -ENOENT : ret;
1689                 right_gen = info.gen;
1690         }
1691
1692         if (!left_ret && !right_ret) {
1693                 if (left_gen == gen && right_gen == gen) {
1694                         ret = inode_state_no_change;
1695                 } else if (left_gen == gen) {
1696                         if (ino < sctx->send_progress)
1697                                 ret = inode_state_did_create;
1698                         else
1699                                 ret = inode_state_will_create;
1700                 } else if (right_gen == gen) {
1701                         if (ino < sctx->send_progress)
1702                                 ret = inode_state_did_delete;
1703                         else
1704                                 ret = inode_state_will_delete;
1705                 } else  {
1706                         ret = -ENOENT;
1707                 }
1708         } else if (!left_ret) {
1709                 if (left_gen == gen) {
1710                         if (ino < sctx->send_progress)
1711                                 ret = inode_state_did_create;
1712                         else
1713                                 ret = inode_state_will_create;
1714                 } else {
1715                         ret = -ENOENT;
1716                 }
1717         } else if (!right_ret) {
1718                 if (right_gen == gen) {
1719                         if (ino < sctx->send_progress)
1720                                 ret = inode_state_did_delete;
1721                         else
1722                                 ret = inode_state_will_delete;
1723                 } else {
1724                         ret = -ENOENT;
1725                 }
1726         } else {
1727                 ret = -ENOENT;
1728         }
1729
1730 out:
1731         return ret;
1732 }
1733
1734 static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
1735 {
1736         int ret;
1737
1738         if (ino == BTRFS_FIRST_FREE_OBJECTID)
1739                 return 1;
1740
1741         ret = get_cur_inode_state(sctx, ino, gen);
1742         if (ret < 0)
1743                 goto out;
1744
1745         if (ret == inode_state_no_change ||
1746             ret == inode_state_did_create ||
1747             ret == inode_state_will_delete)
1748                 ret = 1;
1749         else
1750                 ret = 0;
1751
1752 out:
1753         return ret;
1754 }
1755
1756 /*
1757  * Helper function to lookup a dir item in a dir.
1758  */
1759 static int lookup_dir_item_inode(struct btrfs_root *root,
1760                                  u64 dir, const char *name, int name_len,
1761                                  u64 *found_inode)
1762 {
1763         int ret = 0;
1764         struct btrfs_dir_item *di;
1765         struct btrfs_key key;
1766         struct btrfs_path *path;
1767         struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
1768
1769         path = alloc_path_for_send();
1770         if (!path)
1771                 return -ENOMEM;
1772
1773         di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0);
1774         if (IS_ERR_OR_NULL(di)) {
1775                 ret = di ? PTR_ERR(di) : -ENOENT;
1776                 goto out;
1777         }
1778         btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
1779         if (key.type == BTRFS_ROOT_ITEM_KEY) {
1780                 ret = -ENOENT;
1781                 goto out;
1782         }
1783         *found_inode = key.objectid;
1784
1785 out:
1786         btrfs_free_path(path);
1787         return ret;
1788 }
1789
1790 /*
1791  * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
1792  * generation of the parent dir and the name of the dir entry.
1793  */
1794 static int get_first_ref(struct btrfs_root *root, u64 ino,
1795                          u64 *dir, u64 *dir_gen, struct fs_path *name)
1796 {
1797         int ret;
1798         struct btrfs_key key;
1799         struct btrfs_key found_key;
1800         struct btrfs_path *path;
1801         int len;
1802         u64 parent_dir;
1803
1804         path = alloc_path_for_send();
1805         if (!path)
1806                 return -ENOMEM;
1807
1808         key.objectid = ino;
1809         key.type = BTRFS_INODE_REF_KEY;
1810         key.offset = 0;
1811
1812         ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
1813         if (ret < 0)
1814                 goto out;
1815         if (!ret)
1816                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1817                                 path->slots[0]);
1818         if (ret || found_key.objectid != ino ||
1819             (found_key.type != BTRFS_INODE_REF_KEY &&
1820              found_key.type != BTRFS_INODE_EXTREF_KEY)) {
1821                 ret = -ENOENT;
1822                 goto out;
1823         }
1824
1825         if (found_key.type == BTRFS_INODE_REF_KEY) {
1826                 struct btrfs_inode_ref *iref;
1827                 iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
1828                                       struct btrfs_inode_ref);
1829                 len = btrfs_inode_ref_name_len(path->nodes[0], iref);
1830                 ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
1831                                                      (unsigned long)(iref + 1),
1832                                                      len);
1833                 parent_dir = found_key.offset;
1834         } else {
1835                 struct btrfs_inode_extref *extref;
1836                 extref = btrfs_item_ptr(path->nodes[0], path->slots[0],
1837                                         struct btrfs_inode_extref);
1838                 len = btrfs_inode_extref_name_len(path->nodes[0], extref);
1839                 ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
1840                                         (unsigned long)&extref->name, len);
1841                 parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
1842         }
1843         if (ret < 0)
1844                 goto out;
1845         btrfs_release_path(path);
1846
1847         if (dir_gen) {
1848                 ret = get_inode_gen(root, parent_dir, dir_gen);
1849                 if (ret < 0)
1850                         goto out;
1851         }
1852
1853         *dir = parent_dir;
1854
1855 out:
1856         btrfs_free_path(path);
1857         return ret;
1858 }
1859
1860 static int is_first_ref(struct btrfs_root *root,
1861                         u64 ino, u64 dir,
1862                         const char *name, int name_len)
1863 {
1864         int ret;
1865         struct fs_path *tmp_name;
1866         u64 tmp_dir;
1867
1868         tmp_name = fs_path_alloc();
1869         if (!tmp_name)
1870                 return -ENOMEM;
1871
1872         ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
1873         if (ret < 0)
1874                 goto out;
1875
1876         if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
1877                 ret = 0;
1878                 goto out;
1879         }
1880
1881         ret = !memcmp(tmp_name->start, name, name_len);
1882
1883 out:
1884         fs_path_free(tmp_name);
1885         return ret;
1886 }
1887
1888 /*
1889  * Used by process_recorded_refs to determine if a new ref would overwrite an
1890  * already existing ref. In case it detects an overwrite, it returns the
1891  * inode/gen in who_ino/who_gen.
1892  * When an overwrite is detected, process_recorded_refs does proper orphanizing
1893  * to make sure later references to the overwritten inode are possible.
1894  * Orphanizing is however only required for the first ref of an inode.
1895  * process_recorded_refs does an additional is_first_ref check to see if
1896  * orphanizing is really required.
1897  */
1898 static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
1899                               const char *name, int name_len,
1900                               u64 *who_ino, u64 *who_gen, u64 *who_mode)
1901 {
1902         int ret = 0;
1903         u64 gen;
1904         u64 other_inode = 0;
1905         struct btrfs_inode_info info;
1906
1907         if (!sctx->parent_root)
1908                 goto out;
1909
1910         ret = is_inode_existent(sctx, dir, dir_gen);
1911         if (ret <= 0)
1912                 goto out;
1913
1914         /*
1915          * If we have a parent root we need to verify that the parent dir was
1916          * not deleted and then re-created, if it was then we have no overwrite
1917          * and we can just unlink this entry.
1918          */
1919         if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
1920                 ret = get_inode_gen(sctx->parent_root, dir, &gen);
1921                 if (ret < 0 && ret != -ENOENT)
1922                         goto out;
1923                 if (ret) {
1924                         ret = 0;
1925                         goto out;
1926                 }
1927                 if (gen != dir_gen)
1928                         goto out;
1929         }
1930
1931         ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
1932                                     &other_inode);
1933         if (ret < 0 && ret != -ENOENT)
1934                 goto out;
1935         if (ret) {
1936                 ret = 0;
1937                 goto out;
1938         }
1939
1940         /*
1941          * Check if the overwritten ref was already processed. If yes, the ref
1942          * was already unlinked/moved, so we can safely assume that we will not
1943          * overwrite anything at this point in time.
1944          */
1945         if (other_inode > sctx->send_progress ||
1946             is_waiting_for_move(sctx, other_inode)) {
1947                 ret = get_inode_info(sctx->parent_root, other_inode, &info);
1948                 if (ret < 0)
1949                         goto out;
1950
1951                 ret = 1;
1952                 *who_ino = other_inode;
1953                 *who_gen = info.gen;
1954                 *who_mode = info.mode;
1955         } else {
1956                 ret = 0;
1957         }
1958
1959 out:
1960         return ret;
1961 }
1962
1963 /*
1964  * Checks if the ref was overwritten by an already processed inode. This is
1965  * used by __get_cur_name_and_parent to find out if the ref was orphanized and
1966  * thus the orphan name needs be used.
1967  * process_recorded_refs also uses it to avoid unlinking of refs that were
1968  * overwritten.
1969  */
1970 static int did_overwrite_ref(struct send_ctx *sctx,
1971                             u64 dir, u64 dir_gen,
1972                             u64 ino, u64 ino_gen,
1973                             const char *name, int name_len)
1974 {
1975         int ret = 0;
1976         u64 gen;
1977         u64 ow_inode;
1978
1979         if (!sctx->parent_root)
1980                 goto out;
1981
1982         ret = is_inode_existent(sctx, dir, dir_gen);
1983         if (ret <= 0)
1984                 goto out;
1985
1986         if (dir != BTRFS_FIRST_FREE_OBJECTID) {
1987                 ret = get_inode_gen(sctx->send_root, dir, &gen);
1988                 if (ret < 0 && ret != -ENOENT)
1989                         goto out;
1990                 if (ret) {
1991                         ret = 0;
1992                         goto out;
1993                 }
1994                 if (gen != dir_gen)
1995                         goto out;
1996         }
1997
1998         /* check if the ref was overwritten by another ref */
1999         ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
2000                                     &ow_inode);
2001         if (ret < 0 && ret != -ENOENT)
2002                 goto out;
2003         if (ret) {
2004                 /* was never and will never be overwritten */
2005                 ret = 0;
2006                 goto out;
2007         }
2008
2009         ret = get_inode_gen(sctx->send_root, ow_inode, &gen);
2010         if (ret < 0)
2011                 goto out;
2012
2013         if (ow_inode == ino && gen == ino_gen) {
2014                 ret = 0;
2015                 goto out;
2016         }
2017
2018         /*
2019          * We know that it is or will be overwritten. Check this now.
2020          * The current inode being processed might have been the one that caused
2021          * inode 'ino' to be orphanized, therefore check if ow_inode matches
2022          * the current inode being processed.
2023          */
2024         if ((ow_inode < sctx->send_progress) ||
2025             (ino != sctx->cur_ino && ow_inode == sctx->cur_ino &&
2026              gen == sctx->cur_inode_gen))
2027                 ret = 1;
2028         else
2029                 ret = 0;
2030
2031 out:
2032         return ret;
2033 }
2034
2035 /*
2036  * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
2037  * that got overwritten. This is used by process_recorded_refs to determine
2038  * if it has to use the path as returned by get_cur_path or the orphan name.
2039  */
2040 static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
2041 {
2042         int ret = 0;
2043         struct fs_path *name = NULL;
2044         u64 dir;
2045         u64 dir_gen;
2046
2047         if (!sctx->parent_root)
2048                 goto out;
2049
2050         name = fs_path_alloc();
2051         if (!name)
2052                 return -ENOMEM;
2053
2054         ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
2055         if (ret < 0)
2056                 goto out;
2057
2058         ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
2059                         name->start, fs_path_len(name));
2060
2061 out:
2062         fs_path_free(name);
2063         return ret;
2064 }
2065
2066 /*
2067  * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
2068  * so we need to do some special handling in case we have clashes. This function
2069  * takes care of this with the help of name_cache_entry::radix_list.
2070  * In case of error, nce is kfreed.
2071  */
2072 static int name_cache_insert(struct send_ctx *sctx,
2073                              struct name_cache_entry *nce)
2074 {
2075         int ret = 0;
2076         struct list_head *nce_head;
2077
2078         nce_head = radix_tree_lookup(&sctx->name_cache,
2079                         (unsigned long)nce->ino);
2080         if (!nce_head) {
2081                 nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
2082                 if (!nce_head) {
2083                         kfree(nce);
2084                         return -ENOMEM;
2085                 }
2086                 INIT_LIST_HEAD(nce_head);
2087
2088                 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
2089                 if (ret < 0) {
2090                         kfree(nce_head);
2091                         kfree(nce);
2092                         return ret;
2093                 }
2094         }
2095         list_add_tail(&nce->radix_list, nce_head);
2096         list_add_tail(&nce->list, &sctx->name_cache_list);
2097         sctx->name_cache_size++;
2098
2099         return ret;
2100 }
2101
2102 static void name_cache_delete(struct send_ctx *sctx,
2103                               struct name_cache_entry *nce)
2104 {
2105         struct list_head *nce_head;
2106
2107         nce_head = radix_tree_lookup(&sctx->name_cache,
2108                         (unsigned long)nce->ino);
2109         if (!nce_head) {
2110                 btrfs_err(sctx->send_root->fs_info,
2111               "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
2112                         nce->ino, sctx->name_cache_size);
2113         }
2114
2115         list_del(&nce->radix_list);
2116         list_del(&nce->list);
2117         sctx->name_cache_size--;
2118
2119         /*
2120          * We may not get to the final release of nce_head if the lookup fails
2121          */
2122         if (nce_head && list_empty(nce_head)) {
2123                 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
2124                 kfree(nce_head);
2125         }
2126 }
2127
2128 static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
2129                                                     u64 ino, u64 gen)
2130 {
2131         struct list_head *nce_head;
2132         struct name_cache_entry *cur;
2133
2134         nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
2135         if (!nce_head)
2136                 return NULL;
2137
2138         list_for_each_entry(cur, nce_head, radix_list) {
2139                 if (cur->ino == ino && cur->gen == gen)
2140                         return cur;
2141         }
2142         return NULL;
2143 }
2144
2145 /*
2146  * Remove some entries from the beginning of name_cache_list.
2147  */
2148 static void name_cache_clean_unused(struct send_ctx *sctx)
2149 {
2150         struct name_cache_entry *nce;
2151
2152         if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
2153                 return;
2154
2155         while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
2156                 nce = list_entry(sctx->name_cache_list.next,
2157                                 struct name_cache_entry, list);
2158                 name_cache_delete(sctx, nce);
2159                 kfree(nce);
2160         }
2161 }
2162
2163 static void name_cache_free(struct send_ctx *sctx)
2164 {
2165         struct name_cache_entry *nce;
2166
2167         while (!list_empty(&sctx->name_cache_list)) {
2168                 nce = list_entry(sctx->name_cache_list.next,
2169                                 struct name_cache_entry, list);
2170                 name_cache_delete(sctx, nce);
2171                 kfree(nce);
2172         }
2173 }
2174
2175 /*
2176  * Used by get_cur_path for each ref up to the root.
2177  * Returns 0 if it succeeded.
2178  * Returns 1 if the inode is not existent or got overwritten. In that case, the
2179  * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
2180  * is returned, parent_ino/parent_gen are not guaranteed to be valid.
2181  * Returns <0 in case of error.
2182  */
2183 static int __get_cur_name_and_parent(struct send_ctx *sctx,
2184                                      u64 ino, u64 gen,
2185                                      u64 *parent_ino,
2186                                      u64 *parent_gen,
2187                                      struct fs_path *dest)
2188 {
2189         int ret;
2190         int nce_ret;
2191         struct name_cache_entry *nce = NULL;
2192
2193         /*
2194          * First check if we already did a call to this function with the same
2195          * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
2196          * return the cached result.
2197          */
2198         nce = name_cache_search(sctx, ino, gen);
2199         if (nce) {
2200                 if (ino < sctx->send_progress && nce->need_later_update) {
2201                         name_cache_delete(sctx, nce);
2202                         kfree(nce);
2203                         nce = NULL;
2204                 } else {
2205                         /*
2206                          * Removes the entry from the list and adds it back to
2207                          * the end.  This marks the entry as recently used so
2208                          * that name_cache_clean_unused does not remove it.
2209                          */
2210                         list_move_tail(&nce->list, &sctx->name_cache_list);
2211
2212                         *parent_ino = nce->parent_ino;
2213                         *parent_gen = nce->parent_gen;
2214                         ret = fs_path_add(dest, nce->name, nce->name_len);
2215                         if (ret < 0)
2216                                 goto out;
2217                         ret = nce->ret;
2218                         goto out;
2219                 }
2220         }
2221
2222         /*
2223          * If the inode is not existent yet, add the orphan name and return 1.
2224          * This should only happen for the parent dir that we determine in
2225          * record_new_ref_if_needed().
2226          */
2227         ret = is_inode_existent(sctx, ino, gen);
2228         if (ret < 0)
2229                 goto out;
2230
2231         if (!ret) {
2232                 ret = gen_unique_name(sctx, ino, gen, dest);
2233                 if (ret < 0)
2234                         goto out;
2235                 ret = 1;
2236                 goto out_cache;
2237         }
2238
2239         /*
2240          * Depending on whether the inode was already processed or not, use
2241          * send_root or parent_root for ref lookup.
2242          */
2243         if (ino < sctx->send_progress)
2244                 ret = get_first_ref(sctx->send_root, ino,
2245                                     parent_ino, parent_gen, dest);
2246         else
2247                 ret = get_first_ref(sctx->parent_root, ino,
2248                                     parent_ino, parent_gen, dest);
2249         if (ret < 0)
2250                 goto out;
2251
2252         /*
2253          * Check if the ref was overwritten by an inode's ref that was processed
2254          * earlier. If yes, treat as orphan and return 1.
2255          */
2256         ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
2257                         dest->start, dest->end - dest->start);
2258         if (ret < 0)
2259                 goto out;
2260         if (ret) {
2261                 fs_path_reset(dest);
2262                 ret = gen_unique_name(sctx, ino, gen, dest);
2263                 if (ret < 0)
2264                         goto out;
2265                 ret = 1;
2266         }
2267
2268 out_cache:
2269         /*
2270          * Store the result of the lookup in the name cache.
2271          */
2272         nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
2273         if (!nce) {
2274                 ret = -ENOMEM;
2275                 goto out;
2276         }
2277
2278         nce->ino = ino;
2279         nce->gen = gen;
2280         nce->parent_ino = *parent_ino;
2281         nce->parent_gen = *parent_gen;
2282         nce->name_len = fs_path_len(dest);
2283         nce->ret = ret;
2284         strcpy(nce->name, dest->start);
2285
2286         if (ino < sctx->send_progress)
2287                 nce->need_later_update = 0;
2288         else
2289                 nce->need_later_update = 1;
2290
2291         nce_ret = name_cache_insert(sctx, nce);
2292         if (nce_ret < 0)
2293                 ret = nce_ret;
2294         name_cache_clean_unused(sctx);
2295
2296 out:
2297         return ret;
2298 }
2299
2300 /*
2301  * Magic happens here. This function returns the first ref to an inode as it
2302  * would look like while receiving the stream at this point in time.
2303  * We walk the path up to the root. For every inode in between, we check if it
2304  * was already processed/sent. If yes, we continue with the parent as found
2305  * in send_root. If not, we continue with the parent as found in parent_root.
2306  * If we encounter an inode that was deleted at this point in time, we use the
2307  * inodes "orphan" name instead of the real name and stop. Same with new inodes
2308  * that were not created yet and overwritten inodes/refs.
2309  *
2310  * When do we have orphan inodes:
2311  * 1. When an inode is freshly created and thus no valid refs are available yet
2312  * 2. When a directory lost all it's refs (deleted) but still has dir items
2313  *    inside which were not processed yet (pending for move/delete). If anyone
2314  *    tried to get the path to the dir items, it would get a path inside that
2315  *    orphan directory.
2316  * 3. When an inode is moved around or gets new links, it may overwrite the ref
2317  *    of an unprocessed inode. If in that case the first ref would be
2318  *    overwritten, the overwritten inode gets "orphanized". Later when we
2319  *    process this overwritten inode, it is restored at a new place by moving
2320  *    the orphan inode.
2321  *
2322  * sctx->send_progress tells this function at which point in time receiving
2323  * would be.
2324  */
2325 static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2326                         struct fs_path *dest)
2327 {
2328         int ret = 0;
2329         struct fs_path *name = NULL;
2330         u64 parent_inode = 0;
2331         u64 parent_gen = 0;
2332         int stop = 0;
2333
2334         name = fs_path_alloc();
2335         if (!name) {
2336                 ret = -ENOMEM;
2337                 goto out;
2338         }
2339
2340         dest->reversed = 1;
2341         fs_path_reset(dest);
2342
2343         while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2344                 struct waiting_dir_move *wdm;
2345
2346                 fs_path_reset(name);
2347
2348                 if (is_waiting_for_rm(sctx, ino, gen)) {
2349                         ret = gen_unique_name(sctx, ino, gen, name);
2350                         if (ret < 0)
2351                                 goto out;
2352                         ret = fs_path_add_path(dest, name);
2353                         break;
2354                 }
2355
2356                 wdm = get_waiting_dir_move(sctx, ino);
2357                 if (wdm && wdm->orphanized) {
2358                         ret = gen_unique_name(sctx, ino, gen, name);
2359                         stop = 1;
2360                 } else if (wdm) {
2361                         ret = get_first_ref(sctx->parent_root, ino,
2362                                             &parent_inode, &parent_gen, name);
2363                 } else {
2364                         ret = __get_cur_name_and_parent(sctx, ino, gen,
2365                                                         &parent_inode,
2366                                                         &parent_gen, name);
2367                         if (ret)
2368                                 stop = 1;
2369                 }
2370
2371                 if (ret < 0)
2372                         goto out;
2373
2374                 ret = fs_path_add_path(dest, name);
2375                 if (ret < 0)
2376                         goto out;
2377
2378                 ino = parent_inode;
2379                 gen = parent_gen;
2380         }
2381
2382 out:
2383         fs_path_free(name);
2384         if (!ret)
2385                 fs_path_unreverse(dest);
2386         return ret;
2387 }
2388
2389 /*
2390  * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
2391  */
2392 static int send_subvol_begin(struct send_ctx *sctx)
2393 {
2394         int ret;
2395         struct btrfs_root *send_root = sctx->send_root;
2396         struct btrfs_root *parent_root = sctx->parent_root;
2397         struct btrfs_path *path;
2398         struct btrfs_key key;
2399         struct btrfs_root_ref *ref;
2400         struct extent_buffer *leaf;
2401         char *name = NULL;
2402         int namelen;
2403
2404         path = btrfs_alloc_path();
2405         if (!path)
2406                 return -ENOMEM;
2407
2408         name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
2409         if (!name) {
2410                 btrfs_free_path(path);
2411                 return -ENOMEM;
2412         }
2413
2414         key.objectid = send_root->root_key.objectid;
2415         key.type = BTRFS_ROOT_BACKREF_KEY;
2416         key.offset = 0;
2417
2418         ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
2419                                 &key, path, 1, 0);
2420         if (ret < 0)
2421                 goto out;
2422         if (ret) {
2423                 ret = -ENOENT;
2424                 goto out;
2425         }
2426
2427         leaf = path->nodes[0];
2428         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2429         if (key.type != BTRFS_ROOT_BACKREF_KEY ||
2430             key.objectid != send_root->root_key.objectid) {
2431                 ret = -ENOENT;
2432                 goto out;
2433         }
2434         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
2435         namelen = btrfs_root_ref_name_len(leaf, ref);
2436         read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
2437         btrfs_release_path(path);
2438
2439         if (parent_root) {
2440                 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
2441                 if (ret < 0)
2442                         goto out;
2443         } else {
2444                 ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
2445                 if (ret < 0)
2446                         goto out;
2447         }
2448
2449         TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
2450
2451         if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
2452                 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2453                             sctx->send_root->root_item.received_uuid);
2454         else
2455                 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2456                             sctx->send_root->root_item.uuid);
2457
2458         TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
2459                     btrfs_root_ctransid(&sctx->send_root->root_item));
2460         if (parent_root) {
2461                 if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
2462                         TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2463                                      parent_root->root_item.received_uuid);
2464                 else
2465                         TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2466                                      parent_root->root_item.uuid);
2467                 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
2468                             btrfs_root_ctransid(&sctx->parent_root->root_item));
2469         }
2470
2471         ret = send_cmd(sctx);
2472
2473 tlv_put_failure:
2474 out:
2475         btrfs_free_path(path);
2476         kfree(name);
2477         return ret;
2478 }
2479
2480 static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2481 {
2482         struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2483         int ret = 0;
2484         struct fs_path *p;
2485
2486         btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);
2487
2488         p = fs_path_alloc();
2489         if (!p)
2490                 return -ENOMEM;
2491
2492         ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
2493         if (ret < 0)
2494                 goto out;
2495
2496         ret = get_cur_path(sctx, ino, gen, p);
2497         if (ret < 0)
2498                 goto out;
2499         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2500         TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
2501
2502         ret = send_cmd(sctx);
2503
2504 tlv_put_failure:
2505 out:
2506         fs_path_free(p);
2507         return ret;
2508 }
2509
2510 static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2511 {
2512         struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2513         int ret = 0;
2514         struct fs_path *p;
2515
2516         btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);
2517
2518         p = fs_path_alloc();
2519         if (!p)
2520                 return -ENOMEM;
2521
2522         ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
2523         if (ret < 0)
2524                 goto out;
2525
2526         ret = get_cur_path(sctx, ino, gen, p);
2527         if (ret < 0)
2528                 goto out;
2529         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2530         TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
2531
2532         ret = send_cmd(sctx);
2533
2534 tlv_put_failure:
2535 out:
2536         fs_path_free(p);
2537         return ret;
2538 }
2539
2540 static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
2541 {
2542         struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2543         int ret = 0;
2544         struct fs_path *p;
2545
2546         if (sctx->proto < 2)
2547                 return 0;
2548
2549         btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);
2550
2551         p = fs_path_alloc();
2552         if (!p)
2553                 return -ENOMEM;
2554
2555         ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR);
2556         if (ret < 0)
2557                 goto out;
2558
2559         ret = get_cur_path(sctx, ino, gen, p);
2560         if (ret < 0)
2561                 goto out;
2562         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2563         TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);
2564
2565         ret = send_cmd(sctx);
2566
2567 tlv_put_failure:
2568 out:
2569         fs_path_free(p);
2570         return ret;
2571 }
2572
2573 static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2574 {
2575         struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2576         int ret = 0;
2577         struct fs_path *p;
2578
2579         btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
2580                     ino, uid, gid);
2581
2582         p = fs_path_alloc();
2583         if (!p)
2584                 return -ENOMEM;
2585
2586         ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
2587         if (ret < 0)
2588                 goto out;
2589
2590         ret = get_cur_path(sctx, ino, gen, p);
2591         if (ret < 0)
2592                 goto out;
2593         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2594         TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
2595         TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
2596
2597         ret = send_cmd(sctx);
2598
2599 tlv_put_failure:
2600 out:
2601         fs_path_free(p);
2602         return ret;
2603 }
2604
2605 static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2606 {
2607         struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2608         int ret = 0;
2609         struct fs_path *p = NULL;
2610         struct btrfs_inode_item *ii;
2611         struct btrfs_path *path = NULL;
2612         struct extent_buffer *eb;
2613         struct btrfs_key key;
2614         int slot;
2615
2616         btrfs_debug(fs_info, "send_utimes %llu", ino);
2617
2618         p = fs_path_alloc();
2619         if (!p)
2620                 return -ENOMEM;
2621
2622         path = alloc_path_for_send();
2623         if (!path) {
2624                 ret = -ENOMEM;
2625                 goto out;
2626         }
2627
2628         key.objectid = ino;
2629         key.type = BTRFS_INODE_ITEM_KEY;
2630         key.offset = 0;
2631         ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2632         if (ret > 0)
2633                 ret = -ENOENT;
2634         if (ret < 0)
2635                 goto out;
2636
2637         eb = path->nodes[0];
2638         slot = path->slots[0];
2639         ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
2640
2641         ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
2642         if (ret < 0)
2643                 goto out;
2644
2645         ret = get_cur_path(sctx, ino, gen, p);
2646         if (ret < 0)
2647                 goto out;
2648         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2649         TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
2650         TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
2651         TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
2652         if (sctx->proto >= 2)
2653                 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime);
2654
2655         ret = send_cmd(sctx);
2656
2657 tlv_put_failure:
2658 out:
2659         fs_path_free(p);
2660         btrfs_free_path(path);
2661         return ret;
2662 }
2663
2664 /*
2665  * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
2666  * a valid path yet because we did not process the refs yet. So, the inode
2667  * is created as orphan.
2668  */
2669 static int send_create_inode(struct send_ctx *sctx, u64 ino)
2670 {
2671         struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2672         int ret = 0;
2673         struct fs_path *p;
2674         int cmd;
2675         struct btrfs_inode_info info;
2676         u64 gen;
2677         u64 mode;
2678         u64 rdev;
2679
2680         btrfs_debug(fs_info, "send_create_inode %llu", ino);
2681
2682         p = fs_path_alloc();
2683         if (!p)
2684                 return -ENOMEM;
2685
2686         if (ino != sctx->cur_ino) {
2687                 ret = get_inode_info(sctx->send_root, ino, &info);
2688                 if (ret < 0)
2689                         goto out;
2690                 gen = info.gen;
2691                 mode = info.mode;
2692                 rdev = info.rdev;
2693         } else {
2694                 gen = sctx->cur_inode_gen;
2695                 mode = sctx->cur_inode_mode;
2696                 rdev = sctx->cur_inode_rdev;
2697         }
2698
2699         if (S_ISREG(mode)) {
2700                 cmd = BTRFS_SEND_C_MKFILE;
2701         } else if (S_ISDIR(mode)) {
2702                 cmd = BTRFS_SEND_C_MKDIR;
2703         } else if (S_ISLNK(mode)) {
2704                 cmd = BTRFS_SEND_C_SYMLINK;
2705         } else if (S_ISCHR(mode) || S_ISBLK(mode)) {
2706                 cmd = BTRFS_SEND_C_MKNOD;
2707         } else if (S_ISFIFO(mode)) {
2708                 cmd = BTRFS_SEND_C_MKFIFO;
2709         } else if (S_ISSOCK(mode)) {
2710                 cmd = BTRFS_SEND_C_MKSOCK;
2711         } else {
2712                 btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
2713                                 (int)(mode & S_IFMT));
2714                 ret = -EOPNOTSUPP;
2715                 goto out;
2716         }
2717
2718         ret = begin_cmd(sctx, cmd);
2719         if (ret < 0)
2720                 goto out;
2721
2722         ret = gen_unique_name(sctx, ino, gen, p);
2723         if (ret < 0)
2724                 goto out;
2725
2726         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2727         TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
2728
2729         if (S_ISLNK(mode)) {
2730                 fs_path_reset(p);
2731                 ret = read_symlink(sctx->send_root, ino, p);
2732                 if (ret < 0)
2733                         goto out;
2734                 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
2735         } else if (S_ISCHR(mode) || S_ISBLK(mode) ||
2736                    S_ISFIFO(mode) || S_ISSOCK(mode)) {
2737                 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
2738                 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
2739         }
2740
2741         ret = send_cmd(sctx);
2742         if (ret < 0)
2743                 goto out;
2744
2745
2746 tlv_put_failure:
2747 out:
2748         fs_path_free(p);
2749         return ret;
2750 }
2751
2752 /*
2753  * We need some special handling for inodes that get processed before the parent
2754  * directory got created. See process_recorded_refs for details.
2755  * This function does the check if we already created the dir out of order.
2756  */
2757 static int did_create_dir(struct send_ctx *sctx, u64 dir)
2758 {
2759         int ret = 0;
2760         int iter_ret = 0;
2761         struct btrfs_path *path = NULL;
2762         struct btrfs_key key;
2763         struct btrfs_key found_key;
2764         struct btrfs_key di_key;
2765         struct btrfs_dir_item *di;
2766
2767         path = alloc_path_for_send();
2768         if (!path)
2769                 return -ENOMEM;
2770
2771         key.objectid = dir;
2772         key.type = BTRFS_DIR_INDEX_KEY;
2773         key.offset = 0;
2774
2775         btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) {
2776                 struct extent_buffer *eb = path->nodes[0];
2777
2778                 if (found_key.objectid != key.objectid ||
2779                     found_key.type != key.type) {
2780                         ret = 0;
2781                         break;
2782                 }
2783
2784                 di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item);
2785                 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2786
2787                 if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
2788                     di_key.objectid < sctx->send_progress) {
2789                         ret = 1;
2790                         break;
2791                 }
2792         }
2793         /* Catch error found during iteration */
2794         if (iter_ret < 0)
2795                 ret = iter_ret;
2796
2797         btrfs_free_path(path);
2798         return ret;
2799 }
2800
2801 /*
2802  * Only creates the inode if it is:
2803  * 1. Not a directory
2804  * 2. Or a directory which was not created already due to out of order
2805  *    directories. See did_create_dir and process_recorded_refs for details.
2806  */
2807 static int send_create_inode_if_needed(struct send_ctx *sctx)
2808 {
2809         int ret;
2810
2811         if (S_ISDIR(sctx->cur_inode_mode)) {
2812                 ret = did_create_dir(sctx, sctx->cur_ino);
2813                 if (ret < 0)
2814                         return ret;
2815                 else if (ret > 0)
2816                         return 0;
2817         }
2818
2819         return send_create_inode(sctx, sctx->cur_ino);
2820 }
2821
2822 struct recorded_ref {
2823         struct list_head list;
2824         char *name;
2825         struct fs_path *full_path;
2826         u64 dir;
2827         u64 dir_gen;
2828         int name_len;
2829         struct rb_node node;
2830         struct rb_root *root;
2831 };
2832
2833 static struct recorded_ref *recorded_ref_alloc(void)
2834 {
2835         struct recorded_ref *ref;
2836
2837         ref = kzalloc(sizeof(*ref), GFP_KERNEL);
2838         if (!ref)
2839                 return NULL;
2840         RB_CLEAR_NODE(&ref->node);
2841         INIT_LIST_HEAD(&ref->list);
2842         return ref;
2843 }
2844
2845 static void recorded_ref_free(struct recorded_ref *ref)
2846 {
2847         if (!ref)
2848                 return;
2849         if (!RB_EMPTY_NODE(&ref->node))
2850                 rb_erase(&ref->node, ref->root);
2851         list_del(&ref->list);
2852         fs_path_free(ref->full_path);
2853         kfree(ref);
2854 }
2855
2856 static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
2857 {
2858         ref->full_path = path;
2859         ref->name = (char *)kbasename(ref->full_path->start);
2860         ref->name_len = ref->full_path->end - ref->name;
2861 }
2862
2863 static int dup_ref(struct recorded_ref *ref, struct list_head *list)
2864 {
2865         struct recorded_ref *new;
2866
2867         new = recorded_ref_alloc();
2868         if (!new)
2869                 return -ENOMEM;
2870
2871         new->dir = ref->dir;
2872         new->dir_gen = ref->dir_gen;
2873         list_add_tail(&new->list, list);
2874         return 0;
2875 }
2876
2877 static void __free_recorded_refs(struct list_head *head)
2878 {
2879         struct recorded_ref *cur;
2880
2881         while (!list_empty(head)) {
2882                 cur = list_entry(head->next, struct recorded_ref, list);
2883                 recorded_ref_free(cur);
2884         }
2885 }
2886
2887 static void free_recorded_refs(struct send_ctx *sctx)
2888 {
2889         __free_recorded_refs(&sctx->new_refs);
2890         __free_recorded_refs(&sctx->deleted_refs);
2891 }
2892
2893 /*
2894  * Renames/moves a file/dir to its orphan name. Used when the first
2895  * ref of an unprocessed inode gets overwritten and for all non empty
2896  * directories.
2897  */
2898 static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2899                           struct fs_path *path)
2900 {
2901         int ret;
2902         struct fs_path *orphan;
2903
2904         orphan = fs_path_alloc();
2905         if (!orphan)
2906                 return -ENOMEM;
2907
2908         ret = gen_unique_name(sctx, ino, gen, orphan);
2909         if (ret < 0)
2910                 goto out;
2911
2912         ret = send_rename(sctx, path, orphan);
2913
2914 out:
2915         fs_path_free(orphan);
2916         return ret;
2917 }
2918
2919 static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx,
2920                                                    u64 dir_ino, u64 dir_gen)
2921 {
2922         struct rb_node **p = &sctx->orphan_dirs.rb_node;
2923         struct rb_node *parent = NULL;
2924         struct orphan_dir_info *entry, *odi;
2925
2926         while (*p) {
2927                 parent = *p;
2928                 entry = rb_entry(parent, struct orphan_dir_info, node);
2929                 if (dir_ino < entry->ino)
2930                         p = &(*p)->rb_left;
2931                 else if (dir_ino > entry->ino)
2932                         p = &(*p)->rb_right;
2933                 else if (dir_gen < entry->gen)
2934                         p = &(*p)->rb_left;
2935                 else if (dir_gen > entry->gen)
2936                         p = &(*p)->rb_right;
2937                 else
2938                         return entry;
2939         }
2940
2941         odi = kmalloc(sizeof(*odi), GFP_KERNEL);
2942         if (!odi)
2943                 return ERR_PTR(-ENOMEM);
2944         odi->ino = dir_ino;
2945         odi->gen = dir_gen;
2946         odi->last_dir_index_offset = 0;
2947
2948         rb_link_node(&odi->node, parent, p);
2949         rb_insert_color(&odi->node, &sctx->orphan_dirs);
2950         return odi;
2951 }
2952
2953 static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx,
2954                                                    u64 dir_ino, u64 gen)
2955 {
2956         struct rb_node *n = sctx->orphan_dirs.rb_node;
2957         struct orphan_dir_info *entry;
2958
2959         while (n) {
2960                 entry = rb_entry(n, struct orphan_dir_info, node);
2961                 if (dir_ino < entry->ino)
2962                         n = n->rb_left;
2963                 else if (dir_ino > entry->ino)
2964                         n = n->rb_right;
2965                 else if (gen < entry->gen)
2966                         n = n->rb_left;
2967                 else if (gen > entry->gen)
2968                         n = n->rb_right;
2969                 else
2970                         return entry;
2971         }
2972         return NULL;
2973 }
2974
2975 static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen)
2976 {
2977         struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen);
2978
2979         return odi != NULL;
2980 }
2981
2982 static void free_orphan_dir_info(struct send_ctx *sctx,
2983                                  struct orphan_dir_info *odi)
2984 {
2985         if (!odi)
2986                 return;
2987         rb_erase(&odi->node, &sctx->orphan_dirs);
2988         kfree(odi);
2989 }
2990
2991 /*
2992  * Returns 1 if a directory can be removed at this point in time.
2993  * We check this by iterating all dir items and checking if the inode behind
2994  * the dir item was already processed.
2995  */
2996 static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
2997                      u64 send_progress)
2998 {
2999         int ret = 0;
3000         int iter_ret = 0;
3001         struct btrfs_root *root = sctx->parent_root;
3002         struct btrfs_path *path;
3003         struct btrfs_key key;
3004         struct btrfs_key found_key;
3005         struct btrfs_key loc;
3006         struct btrfs_dir_item *di;
3007         struct orphan_dir_info *odi = NULL;
3008
3009         /*
3010          * Don't try to rmdir the top/root subvolume dir.
3011          */
3012         if (dir == BTRFS_FIRST_FREE_OBJECTID)
3013                 return 0;
3014
3015         path = alloc_path_for_send();
3016         if (!path)
3017                 return -ENOMEM;
3018
3019         key.objectid = dir;
3020         key.type = BTRFS_DIR_INDEX_KEY;
3021         key.offset = 0;
3022
3023         odi = get_orphan_dir_info(sctx, dir, dir_gen);
3024         if (odi)
3025                 key.offset = odi->last_dir_index_offset;
3026
3027         btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
3028                 struct waiting_dir_move *dm;
3029
3030                 if (found_key.objectid != key.objectid ||
3031                     found_key.type != key.type)
3032                         break;
3033
3034                 di = btrfs_item_ptr(path->nodes[0], path->slots[0],
3035                                 struct btrfs_dir_item);
3036                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
3037
3038                 dm = get_waiting_dir_move(sctx, loc.objectid);
3039                 if (dm) {
3040                         odi = add_orphan_dir_info(sctx, dir, dir_gen);
3041                         if (IS_ERR(odi)) {
3042                                 ret = PTR_ERR(odi);
3043                                 goto out;
3044                         }
3045                         odi->gen = dir_gen;
3046                         odi->last_dir_index_offset = found_key.offset;
3047                         dm->rmdir_ino = dir;
3048                         dm->rmdir_gen = dir_gen;
3049                         ret = 0;
3050                         goto out;
3051                 }
3052
3053                 if (loc.objectid > send_progress) {
3054                         odi = add_orphan_dir_info(sctx, dir, dir_gen);
3055                         if (IS_ERR(odi)) {
3056                                 ret = PTR_ERR(odi);
3057                                 goto out;
3058                         }
3059                         odi->gen = dir_gen;
3060                         odi->last_dir_index_offset = found_key.offset;
3061                         ret = 0;
3062                         goto out;
3063                 }
3064         }
3065         if (iter_ret < 0) {
3066                 ret = iter_ret;
3067                 goto out;
3068         }
3069         free_orphan_dir_info(sctx, odi);
3070
3071         ret = 1;
3072
3073 out:
3074         btrfs_free_path(path);
3075         return ret;
3076 }
3077
3078 static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
3079 {
3080         struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
3081
3082         return entry != NULL;
3083 }
3084
3085 static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
3086 {
3087         struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
3088         struct rb_node *parent = NULL;
3089         struct waiting_dir_move *entry, *dm;
3090
3091         dm = kmalloc(sizeof(*dm), GFP_KERNEL);
3092         if (!dm)
3093                 return -ENOMEM;
3094         dm->ino = ino;
3095         dm->rmdir_ino = 0;
3096         dm->rmdir_gen = 0;
3097         dm->orphanized = orphanized;
3098
3099         while (*p) {
3100                 parent = *p;
3101                 entry = rb_entry(parent, struct waiting_dir_move, node);
3102                 if (ino < entry->ino) {
3103                         p = &(*p)->rb_left;
3104                 } else if (ino > entry->ino) {
3105                         p = &(*p)->rb_right;
3106                 } else {
3107                         kfree(dm);
3108                         return -EEXIST;
3109                 }
3110         }
3111
3112         rb_link_node(&dm->node, parent, p);
3113         rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
3114         return 0;
3115 }
3116
3117 static struct waiting_dir_move *
3118 get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
3119 {
3120         struct rb_node *n = sctx->waiting_dir_moves.rb_node;
3121         struct waiting_dir_move *entry;
3122
3123         while (n) {
3124                 entry = rb_entry(n, struct waiting_dir_move, node);
3125                 if (ino < entry->ino)
3126                         n = n->rb_left;
3127                 else if (ino > entry->ino)
3128                         n = n->rb_right;
3129                 else
3130                         return entry;
3131         }
3132         return NULL;
3133 }
3134
3135 static void free_waiting_dir_move(struct send_ctx *sctx,
3136                                   struct waiting_dir_move *dm)
3137 {
3138         if (!dm)
3139                 return;
3140         rb_erase(&dm->node, &sctx->waiting_dir_moves);
3141         kfree(dm);
3142 }
3143
3144 static int add_pending_dir_move(struct send_ctx *sctx,
3145                                 u64 ino,
3146                                 u64 ino_gen,
3147                                 u64 parent_ino,
3148                                 struct list_head *new_refs,
3149                                 struct list_head *deleted_refs,
3150                                 const bool is_orphan)
3151 {
3152         struct rb_node **p = &sctx->pending_dir_moves.rb_node;
3153         struct rb_node *parent = NULL;
3154         struct pending_dir_move *entry = NULL, *pm;
3155         struct recorded_ref *cur;
3156         int exists = 0;
3157         int ret;
3158
3159         pm = kmalloc(sizeof(*pm), GFP_KERNEL);
3160         if (!pm)
3161                 return -ENOMEM;
3162         pm->parent_ino = parent_ino;
3163         pm->ino = ino;
3164         pm->gen = ino_gen;
3165         INIT_LIST_HEAD(&pm->list);
3166         INIT_LIST_HEAD(&pm->update_refs);
3167         RB_CLEAR_NODE(&pm->node);
3168
3169         while (*p) {
3170                 parent = *p;
3171                 entry = rb_entry(parent, struct pending_dir_move, node);
3172                 if (parent_ino < entry->parent_ino) {
3173                         p = &(*p)->rb_left;
3174                 } else if (parent_ino > entry->parent_ino) {
3175                         p = &(*p)->rb_right;
3176                 } else {
3177                         exists = 1;
3178                         break;
3179                 }
3180         }
3181
3182         list_for_each_entry(cur, deleted_refs, list) {
3183                 ret = dup_ref(cur, &pm->update_refs);
3184                 if (ret < 0)
3185                         goto out;
3186         }
3187         list_for_each_entry(cur, new_refs, list) {
3188                 ret = dup_ref(cur, &pm->update_refs);
3189                 if (ret < 0)
3190                         goto out;
3191         }
3192
3193         ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
3194         if (ret)
3195                 goto out;
3196
3197         if (exists) {
3198                 list_add_tail(&pm->list, &entry->list);
3199         } else {
3200                 rb_link_node(&pm->node, parent, p);
3201                 rb_insert_color(&pm->node, &sctx->pending_dir_moves);
3202         }
3203         ret = 0;
3204 out:
3205         if (ret) {
3206                 __free_recorded_refs(&pm->update_refs);
3207                 kfree(pm);
3208         }
3209         return ret;
3210 }
3211
3212 static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
3213                                                       u64 parent_ino)
3214 {
3215         struct rb_node *n = sctx->pending_dir_moves.rb_node;
3216         struct pending_dir_move *entry;
3217
3218         while (n) {
3219                 entry = rb_entry(n, struct pending_dir_move, node);
3220                 if (parent_ino < entry->parent_ino)
3221                         n = n->rb_left;
3222                 else if (parent_ino > entry->parent_ino)
3223                         n = n->rb_right;
3224                 else
3225                         return entry;
3226         }
3227         return NULL;
3228 }
3229
3230 static int path_loop(struct send_ctx *sctx, struct fs_path *name,
3231                      u64 ino, u64 gen, u64 *ancestor_ino)
3232 {
3233         int ret = 0;
3234         u64 parent_inode = 0;
3235         u64 parent_gen = 0;
3236         u64 start_ino = ino;
3237
3238         *ancestor_ino = 0;
3239         while (ino != BTRFS_FIRST_FREE_OBJECTID) {
3240                 fs_path_reset(name);
3241
3242                 if (is_waiting_for_rm(sctx, ino, gen))
3243                         break;
3244                 if (is_waiting_for_move(sctx, ino)) {
3245                         if (*ancestor_ino == 0)
3246                                 *ancestor_ino = ino;
3247                         ret = get_first_ref(sctx->parent_root, ino,
3248                                             &parent_inode, &parent_gen, name);
3249                 } else {
3250                         ret = __get_cur_name_and_parent(sctx, ino, gen,
3251                                                         &parent_inode,
3252                                                         &parent_gen, name);
3253                         if (ret > 0) {
3254                                 ret = 0;
3255                                 break;
3256                         }
3257                 }
3258                 if (ret < 0)
3259                         break;
3260                 if (parent_inode == start_ino) {
3261                         ret = 1;
3262                         if (*ancestor_ino == 0)
3263                                 *ancestor_ino = ino;
3264                         break;
3265                 }
3266                 ino = parent_inode;
3267                 gen = parent_gen;
3268         }
3269         return ret;
3270 }
3271
3272 static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3273 {
3274         struct fs_path *from_path = NULL;
3275         struct fs_path *to_path = NULL;
3276         struct fs_path *name = NULL;
3277         u64 orig_progress = sctx->send_progress;
3278         struct recorded_ref *cur;
3279         u64 parent_ino, parent_gen;
3280         struct waiting_dir_move *dm = NULL;
3281         u64 rmdir_ino = 0;
3282         u64 rmdir_gen;
3283         u64 ancestor;
3284         bool is_orphan;
3285         int ret;
3286
3287         name = fs_path_alloc();
3288         from_path = fs_path_alloc();
3289         if (!name || !from_path) {
3290                 ret = -ENOMEM;
3291                 goto out;
3292         }
3293
3294         dm = get_waiting_dir_move(sctx, pm->ino);
3295         ASSERT(dm);
3296         rmdir_ino = dm->rmdir_ino;
3297         rmdir_gen = dm->rmdir_gen;
3298         is_orphan = dm->orphanized;
3299         free_waiting_dir_move(sctx, dm);
3300
3301         if (is_orphan) {
3302                 ret = gen_unique_name(sctx, pm->ino,
3303                                       pm->gen, from_path);
3304         } else {
3305                 ret = get_first_ref(sctx->parent_root, pm->ino,
3306                                     &parent_ino, &parent_gen, name);
3307                 if (ret < 0)
3308                         goto out;
3309                 ret = get_cur_path(sctx, parent_ino, parent_gen,
3310                                    from_path);
3311                 if (ret < 0)
3312                         goto out;
3313                 ret = fs_path_add_path(from_path, name);
3314         }
3315         if (ret < 0)
3316                 goto out;
3317
3318         sctx->send_progress = sctx->cur_ino + 1;
3319         ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
3320         if (ret < 0)
3321                 goto out;
3322         if (ret) {
3323                 LIST_HEAD(deleted_refs);
3324                 ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
3325                 ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
3326                                            &pm->update_refs, &deleted_refs,
3327                                            is_orphan);
3328                 if (ret < 0)
3329                         goto out;
3330                 if (rmdir_ino) {
3331                         dm = get_waiting_dir_move(sctx, pm->ino);
3332                         ASSERT(dm);
3333                         dm->rmdir_ino = rmdir_ino;
3334                         dm->rmdir_gen = rmdir_gen;
3335                 }
3336                 goto out;
3337         }
3338         fs_path_reset(name);
3339         to_path = name;
3340         name = NULL;
3341         ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
3342         if (ret < 0)
3343                 goto out;
3344
3345         ret = send_rename(sctx, from_path, to_path);
3346         if (ret < 0)
3347                 goto out;
3348
3349         if (rmdir_ino) {
3350                 struct orphan_dir_info *odi;
3351                 u64 gen;
3352
3353                 odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen);
3354                 if (!odi) {
3355                         /* already deleted */
3356                         goto finish;
3357                 }
3358                 gen = odi->gen;
3359
3360                 ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino);
3361                 if (ret < 0)
3362                         goto out;
3363                 if (!ret)
3364                         goto finish;
3365
3366                 name = fs_path_alloc();
3367                 if (!name) {
3368                         ret = -ENOMEM;
3369                         goto out;
3370                 }
3371                 ret = get_cur_path(sctx, rmdir_ino, gen, name);
3372                 if (ret < 0)
3373                         goto out;
3374                 ret = send_rmdir(sctx, name);
3375                 if (ret < 0)
3376                         goto out;
3377         }
3378
3379 finish:
3380         ret = send_utimes(sctx, pm->ino, pm->gen);
3381         if (ret < 0)
3382                 goto out;
3383
3384         /*
3385          * After rename/move, need to update the utimes of both new parent(s)
3386          * and old parent(s).
3387          */
3388         list_for_each_entry(cur, &pm->update_refs, list) {
3389                 /*
3390                  * The parent inode might have been deleted in the send snapshot
3391                  */
3392                 ret = get_inode_info(sctx->send_root, cur->dir, NULL);
3393                 if (ret == -ENOENT) {
3394                         ret = 0;
3395                         continue;
3396                 }
3397                 if (ret < 0)
3398                         goto out;
3399
3400                 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
3401                 if (ret < 0)
3402                         goto out;
3403         }
3404
3405 out:
3406         fs_path_free(name);
3407         fs_path_free(from_path);
3408         fs_path_free(to_path);
3409         sctx->send_progress = orig_progress;
3410
3411         return ret;
3412 }
3413
3414 static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
3415 {
3416         if (!list_empty(&m->list))
3417                 list_del(&m->list);
3418         if (!RB_EMPTY_NODE(&m->node))
3419                 rb_erase(&m->node, &sctx->pending_dir_moves);
3420         __free_recorded_refs(&m->update_refs);
3421         kfree(m);
3422 }
3423
3424 static void tail_append_pending_moves(struct send_ctx *sctx,
3425                                       struct pending_dir_move *moves,
3426                                       struct list_head *stack)
3427 {
3428         if (list_empty(&moves->list)) {
3429                 list_add_tail(&moves->list, stack);
3430         } else {
3431                 LIST_HEAD(list);
3432                 list_splice_init(&moves->list, &list);
3433                 list_add_tail(&moves->list, stack);
3434                 list_splice_tail(&list, stack);
3435         }
3436         if (!RB_EMPTY_NODE(&moves->node)) {
3437                 rb_erase(&moves->node, &sctx->pending_dir_moves);
3438                 RB_CLEAR_NODE(&moves->node);
3439         }
3440 }
3441
3442 static int apply_children_dir_moves(struct send_ctx *sctx)
3443 {
3444         struct pending_dir_move *pm;
3445         struct list_head stack;
3446         u64 parent_ino = sctx->cur_ino;
3447         int ret = 0;
3448
3449         pm = get_pending_dir_moves(sctx, parent_ino);
3450         if (!pm)
3451                 return 0;
3452
3453         INIT_LIST_HEAD(&stack);
3454         tail_append_pending_moves(sctx, pm, &stack);
3455
3456         while (!list_empty(&stack)) {
3457                 pm = list_first_entry(&stack, struct pending_dir_move, list);
3458                 parent_ino = pm->ino;
3459                 ret = apply_dir_move(sctx, pm);
3460                 free_pending_move(sctx, pm);
3461                 if (ret)
3462                         goto out;
3463                 pm = get_pending_dir_moves(sctx, parent_ino);
3464                 if (pm)
3465                         tail_append_pending_moves(sctx, pm, &stack);
3466         }
3467         return 0;
3468
3469 out:
3470         while (!list_empty(&stack)) {
3471                 pm = list_first_entry(&stack, struct pending_dir_move, list);
3472                 free_pending_move(sctx, pm);
3473         }
3474         return ret;
3475 }
3476
3477 /*
3478  * We might need to delay a directory rename even when no ancestor directory
3479  * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
3480  * renamed. This happens when we rename a directory to the old name (the name
3481  * in the parent root) of some other unrelated directory that got its rename
3482  * delayed due to some ancestor with higher number that got renamed.
3483  *
3484  * Example:
3485  *
3486  * Parent snapshot:
3487  * .                                       (ino 256)
3488  * |---- a/                                (ino 257)
3489  * |     |---- file                        (ino 260)
3490  * |
3491  * |---- b/                                (ino 258)
3492  * |---- c/                                (ino 259)
3493  *
3494  * Send snapshot:
3495  * .                                       (ino 256)
3496  * |---- a/                                (ino 258)
3497  * |---- x/                                (ino 259)
3498  *       |---- y/                          (ino 257)
3499  *             |----- file                 (ino 260)
3500  *
3501  * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
3502  * from 'a' to 'x/y' happening first, which in turn depends on the rename of
3503  * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
3504  * must issue is:
3505  *
3506  * 1 - rename 259 from 'c' to 'x'
3507  * 2 - rename 257 from 'a' to 'x/y'
3508  * 3 - rename 258 from 'b' to 'a'
3509  *
3510  * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
3511  * be done right away and < 0 on error.
3512  */
3513 static int wait_for_dest_dir_move(struct send_ctx *sctx,
3514                                   struct recorded_ref *parent_ref,
3515                                   const bool is_orphan)
3516 {
3517         struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
3518         struct btrfs_path *path;
3519         struct btrfs_key key;
3520         struct btrfs_key di_key;
3521         struct btrfs_dir_item *di;
3522         u64 left_gen;
3523         u64 right_gen;
3524         int ret = 0;
3525         struct waiting_dir_move *wdm;
3526
3527         if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
3528                 return 0;
3529
3530         path = alloc_path_for_send();
3531         if (!path)
3532                 return -ENOMEM;
3533
3534         key.objectid = parent_ref->dir;
3535         key.type = BTRFS_DIR_ITEM_KEY;
3536         key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
3537
3538         ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
3539         if (ret < 0) {
3540                 goto out;
3541         } else if (ret > 0) {
3542                 ret = 0;
3543                 goto out;
3544         }
3545
3546         di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name,
3547                                        parent_ref->name_len);
3548         if (!di) {
3549                 ret = 0;
3550                 goto out;
3551         }
3552         /*
3553          * di_key.objectid has the number of the inode that has a dentry in the
3554          * parent directory with the same name that sctx->cur_ino is being
3555          * renamed to. We need to check if that inode is in the send root as
3556          * well and if it is currently marked as an inode with a pending rename,
3557          * if it is, we need to delay the rename of sctx->cur_ino as well, so
3558          * that it happens after that other inode is renamed.
3559          */
3560         btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
3561         if (di_key.type != BTRFS_INODE_ITEM_KEY) {
3562                 ret = 0;
3563                 goto out;
3564         }
3565
3566         ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen);
3567         if (ret < 0)
3568                 goto out;
3569         ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen);
3570         if (ret < 0) {
3571                 if (ret == -ENOENT)
3572                         ret = 0;
3573                 goto out;
3574         }
3575
3576         /* Different inode, no need to delay the rename of sctx->cur_ino */
3577         if (right_gen != left_gen) {
3578                 ret = 0;
3579                 goto out;
3580         }
3581
3582         wdm = get_waiting_dir_move(sctx, di_key.objectid);
3583         if (wdm && !wdm->orphanized) {
3584                 ret = add_pending_dir_move(sctx,
3585                                            sctx->cur_ino,
3586                                            sctx->cur_inode_gen,
3587                                            di_key.objectid,
3588                                            &sctx->new_refs,
3589                                            &sctx->deleted_refs,
3590                                            is_orphan);
3591                 if (!ret)
3592                         ret = 1;
3593         }
3594 out:
3595         btrfs_free_path(path);
3596         return ret;
3597 }
3598
3599 /*
3600  * Check if inode ino2, or any of its ancestors, is inode ino1.
3601  * Return 1 if true, 0 if false and < 0 on error.
3602  */
3603 static int check_ino_in_path(struct btrfs_root *root,
3604                              const u64 ino1,
3605                              const u64 ino1_gen,
3606                              const u64 ino2,
3607                              const u64 ino2_gen,
3608                              struct fs_path *fs_path)
3609 {
3610         u64 ino = ino2;
3611
3612         if (ino1 == ino2)
3613                 return ino1_gen == ino2_gen;
3614
3615         while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3616                 u64 parent;
3617                 u64 parent_gen;
3618                 int ret;
3619
3620                 fs_path_reset(fs_path);
3621                 ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
3622                 if (ret < 0)
3623                         return ret;
3624                 if (parent == ino1)
3625                         return parent_gen == ino1_gen;
3626                 ino = parent;
3627         }
3628         return 0;
3629 }
3630
3631 /*
3632  * Check if inode ino1 is an ancestor of inode ino2 in the given root for any
3633  * possible path (in case ino2 is not a directory and has multiple hard links).
3634  * Return 1 if true, 0 if false and < 0 on error.
3635  */
3636 static int is_ancestor(struct btrfs_root *root,
3637                        const u64 ino1,
3638                        const u64 ino1_gen,
3639                        const u64 ino2,
3640                        struct fs_path *fs_path)
3641 {
3642         bool free_fs_path = false;
3643         int ret = 0;
3644         int iter_ret = 0;
3645         struct btrfs_path *path = NULL;
3646         struct btrfs_key key;
3647
3648         if (!fs_path) {
3649                 fs_path = fs_path_alloc();
3650                 if (!fs_path)
3651                         return -ENOMEM;
3652                 free_fs_path = true;
3653         }
3654
3655         path = alloc_path_for_send();
3656         if (!path) {
3657                 ret = -ENOMEM;
3658                 goto out;
3659         }
3660
3661         key.objectid = ino2;
3662         key.type = BTRFS_INODE_REF_KEY;
3663         key.offset = 0;
3664
3665         btrfs_for_each_slot(root, &key, &key, path, iter_ret) {
3666                 struct extent_buffer *leaf = path->nodes[0];
3667                 int slot = path->slots[0];
3668                 u32 cur_offset = 0;
3669                 u32 item_size;
3670
3671                 if (key.objectid != ino2)
3672                         break;
3673                 if (key.type != BTRFS_INODE_REF_KEY &&
3674                     key.type != BTRFS_INODE_EXTREF_KEY)
3675                         break;
3676
3677                 item_size = btrfs_item_size(leaf, slot);
3678                 while (cur_offset < item_size) {
3679                         u64 parent;
3680                         u64 parent_gen;
3681
3682                         if (key.type == BTRFS_INODE_EXTREF_KEY) {
3683                                 unsigned long ptr;
3684                                 struct btrfs_inode_extref *extref;
3685
3686                                 ptr = btrfs_item_ptr_offset(leaf, slot);
3687                                 extref = (struct btrfs_inode_extref *)
3688                                         (ptr + cur_offset);
3689                                 parent = btrfs_inode_extref_parent(leaf,
3690                                                                    extref);
3691                                 cur_offset += sizeof(*extref);
3692                                 cur_offset += btrfs_inode_extref_name_len(leaf,
3693                                                                   extref);
3694                         } else {
3695                                 parent = key.offset;
3696                                 cur_offset = item_size;
3697                         }
3698
3699                         ret = get_inode_gen(root, parent, &parent_gen);
3700                         if (ret < 0)
3701                                 goto out;
3702                         ret = check_ino_in_path(root, ino1, ino1_gen,
3703                                                 parent, parent_gen, fs_path);
3704                         if (ret)
3705                                 goto out;
3706                 }
3707         }
3708         ret = 0;
3709         if (iter_ret < 0)
3710                 ret = iter_ret;
3711
3712 out:
3713         btrfs_free_path(path);
3714         if (free_fs_path)
3715                 fs_path_free(fs_path);
3716         return ret;
3717 }
3718
3719 static int wait_for_parent_move(struct send_ctx *sctx,
3720                                 struct recorded_ref *parent_ref,
3721                                 const bool is_orphan)
3722 {
3723         int ret = 0;
3724         u64 ino = parent_ref->dir;
3725         u64 ino_gen = parent_ref->dir_gen;
3726         u64 parent_ino_before, parent_ino_after;
3727         struct fs_path *path_before = NULL;
3728         struct fs_path *path_after = NULL;
3729         int len1, len2;
3730
3731         path_after = fs_path_alloc();
3732         path_before = fs_path_alloc();
3733         if (!path_after || !path_before) {
3734                 ret = -ENOMEM;
3735                 goto out;
3736         }
3737
3738         /*
3739          * Our current directory inode may not yet be renamed/moved because some
3740          * ancestor (immediate or not) has to be renamed/moved first. So find if
3741          * such ancestor exists and make sure our own rename/move happens after
3742          * that ancestor is processed to avoid path build infinite loops (done
3743          * at get_cur_path()).
3744          */
3745         while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3746                 u64 parent_ino_after_gen;
3747
3748                 if (is_waiting_for_move(sctx, ino)) {
3749                         /*
3750                          * If the current inode is an ancestor of ino in the
3751                          * parent root, we need to delay the rename of the
3752                          * current inode, otherwise don't delayed the rename
3753                          * because we can end up with a circular dependency
3754                          * of renames, resulting in some directories never
3755                          * getting the respective rename operations issued in
3756                          * the send stream or getting into infinite path build
3757                          * loops.
3758                          */
3759                         ret = is_ancestor(sctx->parent_root,
3760                                           sctx->cur_ino, sctx->cur_inode_gen,
3761                                           ino, path_before);
3762                         if (ret)
3763                                 break;
3764                 }
3765
3766                 fs_path_reset(path_before);
3767                 fs_path_reset(path_after);
3768
3769                 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3770                                     &parent_ino_after_gen, path_after);
3771                 if (ret < 0)
3772                         goto out;
3773                 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3774                                     NULL, path_before);
3775                 if (ret < 0 && ret != -ENOENT) {
3776                         goto out;
3777                 } else if (ret == -ENOENT) {
3778                         ret = 0;
3779                         break;
3780                 }
3781
3782                 len1 = fs_path_len(path_before);
3783                 len2 = fs_path_len(path_after);
3784                 if (ino > sctx->cur_ino &&
3785                     (parent_ino_before != parent_ino_after || len1 != len2 ||
3786                      memcmp(path_before->start, path_after->start, len1))) {
3787                         u64 parent_ino_gen;
3788
3789                         ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen);
3790                         if (ret < 0)
3791                                 goto out;
3792                         if (ino_gen == parent_ino_gen) {
3793                                 ret = 1;
3794                                 break;
3795                         }
3796                 }
3797                 ino = parent_ino_after;
3798                 ino_gen = parent_ino_after_gen;
3799         }
3800
3801 out:
3802         fs_path_free(path_before);
3803         fs_path_free(path_after);
3804
3805         if (ret == 1) {
3806                 ret = add_pending_dir_move(sctx,
3807                                            sctx->cur_ino,
3808                                            sctx->cur_inode_gen,
3809                                            ino,
3810                                            &sctx->new_refs,
3811                                            &sctx->deleted_refs,
3812                                            is_orphan);
3813                 if (!ret)
3814                         ret = 1;
3815         }
3816
3817         return ret;
3818 }
3819
3820 static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
3821 {
3822         int ret;
3823         struct fs_path *new_path;
3824
3825         /*
3826          * Our reference's name member points to its full_path member string, so
3827          * we use here a new path.
3828          */
3829         new_path = fs_path_alloc();
3830         if (!new_path)
3831                 return -ENOMEM;
3832
3833         ret = get_cur_path(sctx, ref->dir, ref->dir_gen, new_path);
3834         if (ret < 0) {
3835                 fs_path_free(new_path);
3836                 return ret;
3837         }
3838         ret = fs_path_add(new_path, ref->name, ref->name_len);
3839         if (ret < 0) {
3840                 fs_path_free(new_path);
3841                 return ret;
3842         }
3843
3844         fs_path_free(ref->full_path);
3845         set_ref_path(ref, new_path);
3846
3847         return 0;
3848 }
3849
3850 /*
3851  * When processing the new references for an inode we may orphanize an existing
3852  * directory inode because its old name conflicts with one of the new references
3853  * of the current inode. Later, when processing another new reference of our
3854  * inode, we might need to orphanize another inode, but the path we have in the
3855  * reference reflects the pre-orphanization name of the directory we previously
3856  * orphanized. For example:
3857  *
3858  * parent snapshot looks like:
3859  *
3860  * .                                     (ino 256)
3861  * |----- f1                             (ino 257)
3862  * |----- f2                             (ino 258)
3863  * |----- d1/                            (ino 259)
3864  *        |----- d2/                     (ino 260)
3865  *
3866  * send snapshot looks like:
3867  *
3868  * .                                     (ino 256)
3869  * |----- d1                             (ino 258)
3870  * |----- f2/                            (ino 259)
3871  *        |----- f2_link/                (ino 260)
3872  *        |       |----- f1              (ino 257)
3873  *        |
3874  *        |----- d2                      (ino 258)
3875  *
3876  * When processing inode 257 we compute the name for inode 259 as "d1", and we
3877  * cache it in the name cache. Later when we start processing inode 258, when
3878  * collecting all its new references we set a full path of "d1/d2" for its new
3879  * reference with name "d2". When we start processing the new references we
3880  * start by processing the new reference with name "d1", and this results in
3881  * orphanizing inode 259, since its old reference causes a conflict. Then we
 * move on to the next new reference, with name "d2", and we find out we must
3883  * orphanize inode 260, as its old reference conflicts with ours - but for the
3884  * orphanization we use a source path corresponding to the path we stored in the
3885  * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
3886  * receiver fail since the path component "d1/" no longer exists, it was renamed
3887  * to "o259-6-0/" when processing the previous new reference. So in this case we
3888  * must recompute the path in the new reference and use it for the new
3889  * orphanization operation.
3890  */
3891 static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
3892 {
3893         char *name;
3894         int ret;
3895
3896         name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
3897         if (!name)
3898                 return -ENOMEM;
3899
3900         fs_path_reset(ref->full_path);
3901         ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
3902         if (ret < 0)
3903                 goto out;
3904
3905         ret = fs_path_add(ref->full_path, name, ref->name_len);
3906         if (ret < 0)
3907                 goto out;
3908
3909         /* Update the reference's base name pointer. */
3910         set_ref_path(ref, ref->full_path);
3911 out:
3912         kfree(name);
3913         return ret;
3914 }
3915
3916 /*
3917  * This does all the move/link/unlink/rmdir magic.
3918  */
3919 static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
3920 {
3921         struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
3922         int ret = 0;
3923         struct recorded_ref *cur;
3924         struct recorded_ref *cur2;
3925         struct list_head check_dirs;
3926         struct fs_path *valid_path = NULL;
3927         u64 ow_inode = 0;
3928         u64 ow_gen;
3929         u64 ow_mode;
3930         int did_overwrite = 0;
3931         int is_orphan = 0;
3932         u64 last_dir_ino_rm = 0;
3933         bool can_rename = true;
3934         bool orphanized_dir = false;
3935         bool orphanized_ancestor = false;
3936
3937         btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
3938
3939         /*
3940          * This should never happen as the root dir always has the same ref
3941          * which is always '..'
3942          */
3943         BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
3944         INIT_LIST_HEAD(&check_dirs);
3945
3946         valid_path = fs_path_alloc();
3947         if (!valid_path) {
3948                 ret = -ENOMEM;
3949                 goto out;
3950         }
3951
3952         /*
3953          * First, check if the first ref of the current inode was overwritten
3954          * before. If yes, we know that the current inode was already orphanized
3955          * and thus use the orphan name. If not, we can use get_cur_path to
3956          * get the path of the first ref as it would like while receiving at
3957          * this point in time.
3958          * New inodes are always orphan at the beginning, so force to use the
3959          * orphan name in this case.
3960          * The first ref is stored in valid_path and will be updated if it
3961          * gets moved around.
3962          */
3963         if (!sctx->cur_inode_new) {
3964                 ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
3965                                 sctx->cur_inode_gen);
3966                 if (ret < 0)
3967                         goto out;
3968                 if (ret)
3969                         did_overwrite = 1;
3970         }
3971         if (sctx->cur_inode_new || did_overwrite) {
3972                 ret = gen_unique_name(sctx, sctx->cur_ino,
3973                                 sctx->cur_inode_gen, valid_path);
3974                 if (ret < 0)
3975                         goto out;
3976                 is_orphan = 1;
3977         } else {
3978                 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3979                                 valid_path);
3980                 if (ret < 0)
3981                         goto out;
3982         }
3983
3984         /*
3985          * Before doing any rename and link operations, do a first pass on the
3986          * new references to orphanize any unprocessed inodes that may have a
3987          * reference that conflicts with one of the new references of the current
3988          * inode. This needs to happen first because a new reference may conflict
3989          * with the old reference of a parent directory, so we must make sure
3990          * that the path used for link and rename commands don't use an
3991          * orphanized name when an ancestor was not yet orphanized.
3992          *
3993          * Example:
3994          *
3995          * Parent snapshot:
3996          *
3997          * .                                                      (ino 256)
3998          * |----- testdir/                                        (ino 259)
3999          * |          |----- a                                    (ino 257)
4000          * |
4001          * |----- b                                               (ino 258)
4002          *
4003          * Send snapshot:
4004          *
4005          * .                                                      (ino 256)
4006          * |----- testdir_2/                                      (ino 259)
4007          * |          |----- a                                    (ino 260)
4008          * |
4009          * |----- testdir                                         (ino 257)
4010          * |----- b                                               (ino 257)
4011          * |----- b2                                              (ino 258)
4012          *
4013          * Processing the new reference for inode 257 with name "b" may happen
4014          * before processing the new reference with name "testdir". If so, we
4015          * must make sure that by the time we send a link command to create the
4016          * hard link "b", inode 259 was already orphanized, since the generated
4017          * path in "valid_path" already contains the orphanized name for 259.
4018          * We are processing inode 257, so only later when processing 259 we do
4019          * the rename operation to change its temporary (orphanized) name to
4020          * "testdir_2".
4021          */
4022         list_for_each_entry(cur, &sctx->new_refs, list) {
4023                 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
4024                 if (ret < 0)
4025                         goto out;
4026                 if (ret == inode_state_will_create)
4027                         continue;
4028
4029                 /*
4030                  * Check if this new ref would overwrite the first ref of another
4031                  * unprocessed inode. If yes, orphanize the overwritten inode.
4032                  * If we find an overwritten ref that is not the first ref,
4033                  * simply unlink it.
4034                  */
4035                 ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
4036                                 cur->name, cur->name_len,
4037                                 &ow_inode, &ow_gen, &ow_mode);
4038                 if (ret < 0)
4039                         goto out;
4040                 if (ret) {
4041                         ret = is_first_ref(sctx->parent_root,
4042                                            ow_inode, cur->dir, cur->name,
4043                                            cur->name_len);
4044                         if (ret < 0)
4045                                 goto out;
4046                         if (ret) {
4047                                 struct name_cache_entry *nce;
4048                                 struct waiting_dir_move *wdm;
4049
4050                                 if (orphanized_dir) {
4051                                         ret = refresh_ref_path(sctx, cur);
4052                                         if (ret < 0)
4053                                                 goto out;
4054                                 }
4055
4056                                 ret = orphanize_inode(sctx, ow_inode, ow_gen,
4057                                                 cur->full_path);
4058                                 if (ret < 0)
4059                                         goto out;
4060                                 if (S_ISDIR(ow_mode))
4061                                         orphanized_dir = true;
4062
4063                                 /*
4064                                  * If ow_inode has its rename operation delayed
4065                                  * make sure that its orphanized name is used in
4066                                  * the source path when performing its rename
4067                                  * operation.
4068                                  */
4069                                 if (is_waiting_for_move(sctx, ow_inode)) {
4070                                         wdm = get_waiting_dir_move(sctx,
4071                                                                    ow_inode);
4072                                         ASSERT(wdm);
4073                                         wdm->orphanized = true;
4074                                 }
4075
4076                                 /*
4077                                  * Make sure we clear our orphanized inode's
4078                                  * name from the name cache. This is because the
4079                                  * inode ow_inode might be an ancestor of some
4080                                  * other inode that will be orphanized as well
4081                                  * later and has an inode number greater than
4082                                  * sctx->send_progress. We need to prevent
4083                                  * future name lookups from using the old name
4084                                  * and get instead the orphan name.
4085                                  */
4086                                 nce = name_cache_search(sctx, ow_inode, ow_gen);
4087                                 if (nce) {
4088                                         name_cache_delete(sctx, nce);
4089                                         kfree(nce);
4090                                 }
4091
4092                                 /*
4093                                  * ow_inode might currently be an ancestor of
4094                                  * cur_ino, therefore compute valid_path (the
4095                                  * current path of cur_ino) again because it
4096                                  * might contain the pre-orphanization name of
4097                                  * ow_inode, which is no longer valid.
4098                                  */
4099                                 ret = is_ancestor(sctx->parent_root,
4100                                                   ow_inode, ow_gen,
4101                                                   sctx->cur_ino, NULL);
4102                                 if (ret > 0) {
4103                                         orphanized_ancestor = true;
4104                                         fs_path_reset(valid_path);
4105                                         ret = get_cur_path(sctx, sctx->cur_ino,
4106                                                            sctx->cur_inode_gen,
4107                                                            valid_path);
4108                                 }
4109                                 if (ret < 0)
4110                                         goto out;
4111                         } else {
4112                                 /*
4113                                  * If we previously orphanized a directory that
4114                                  * collided with a new reference that we already
4115                                  * processed, recompute the current path because
4116                                  * that directory may be part of the path.
4117                                  */
4118                                 if (orphanized_dir) {
4119                                         ret = refresh_ref_path(sctx, cur);
4120                                         if (ret < 0)
4121                                                 goto out;
4122                                 }
4123                                 ret = send_unlink(sctx, cur->full_path);
4124                                 if (ret < 0)
4125                                         goto out;
4126                         }
4127                 }
4128
4129         }
4130
4131         list_for_each_entry(cur, &sctx->new_refs, list) {
4132                 /*
4133                  * We may have refs where the parent directory does not exist
4134                  * yet. This happens if the parent directories inum is higher
4135                  * than the current inum. To handle this case, we create the
4136                  * parent directory out of order. But we need to check if this
4137                  * did already happen before due to other refs in the same dir.
4138                  */
4139                 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
4140                 if (ret < 0)
4141                         goto out;
4142                 if (ret == inode_state_will_create) {
4143                         ret = 0;
4144                         /*
4145                          * First check if any of the current inodes refs did
4146                          * already create the dir.
4147                          */
4148                         list_for_each_entry(cur2, &sctx->new_refs, list) {
4149                                 if (cur == cur2)
4150                                         break;
4151                                 if (cur2->dir == cur->dir) {
4152                                         ret = 1;
4153                                         break;
4154                                 }
4155                         }
4156
4157                         /*
4158                          * If that did not happen, check if a previous inode
4159                          * did already create the dir.
4160                          */
4161                         if (!ret)
4162                                 ret = did_create_dir(sctx, cur->dir);
4163                         if (ret < 0)
4164                                 goto out;
4165                         if (!ret) {
4166                                 ret = send_create_inode(sctx, cur->dir);
4167                                 if (ret < 0)
4168                                         goto out;
4169                         }
4170                 }
4171
4172                 if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
4173                         ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
4174                         if (ret < 0)
4175                                 goto out;
4176                         if (ret == 1) {
4177                                 can_rename = false;
4178                                 *pending_move = 1;
4179                         }
4180                 }
4181
4182                 if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
4183                     can_rename) {
4184                         ret = wait_for_parent_move(sctx, cur, is_orphan);
4185                         if (ret < 0)
4186                                 goto out;
4187                         if (ret == 1) {
4188                                 can_rename = false;
4189                                 *pending_move = 1;
4190                         }
4191                 }
4192
4193                 /*
4194                  * link/move the ref to the new place. If we have an orphan
4195                  * inode, move it and update valid_path. If not, link or move
4196                  * it depending on the inode mode.
4197                  */
4198                 if (is_orphan && can_rename) {
4199                         ret = send_rename(sctx, valid_path, cur->full_path);
4200                         if (ret < 0)
4201                                 goto out;
4202                         is_orphan = 0;
4203                         ret = fs_path_copy(valid_path, cur->full_path);
4204                         if (ret < 0)
4205                                 goto out;
4206                 } else if (can_rename) {
4207                         if (S_ISDIR(sctx->cur_inode_mode)) {
4208                                 /*
4209                                  * Dirs can't be linked, so move it. For moved
4210                                  * dirs, we always have one new and one deleted
4211                                  * ref. The deleted ref is ignored later.
4212                                  */
4213                                 ret = send_rename(sctx, valid_path,
4214                                                   cur->full_path);
4215                                 if (!ret)
4216                                         ret = fs_path_copy(valid_path,
4217                                                            cur->full_path);
4218                                 if (ret < 0)
4219                                         goto out;
4220                         } else {
4221                                 /*
4222                                  * We might have previously orphanized an inode
4223                                  * which is an ancestor of our current inode,
4224                                  * so our reference's full path, which was
4225                                  * computed before any such orphanizations, must
4226                                  * be updated.
4227                                  */
4228                                 if (orphanized_dir) {
4229                                         ret = update_ref_path(sctx, cur);
4230                                         if (ret < 0)
4231                                                 goto out;
4232                                 }
4233                                 ret = send_link(sctx, cur->full_path,
4234                                                 valid_path);
4235                                 if (ret < 0)
4236                                         goto out;
4237                         }
4238                 }
4239                 ret = dup_ref(cur, &check_dirs);
4240                 if (ret < 0)
4241                         goto out;
4242         }
4243
4244         if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
4245                 /*
4246                  * Check if we can already rmdir the directory. If not,
4247                  * orphanize it. For every dir item inside that gets deleted
4248                  * later, we do this check again and rmdir it then if possible.
4249                  * See the use of check_dirs for more details.
4250                  */
4251                 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
4252                                 sctx->cur_ino);
4253                 if (ret < 0)
4254                         goto out;
4255                 if (ret) {
4256                         ret = send_rmdir(sctx, valid_path);
4257                         if (ret < 0)
4258                                 goto out;
4259                 } else if (!is_orphan) {
4260                         ret = orphanize_inode(sctx, sctx->cur_ino,
4261                                         sctx->cur_inode_gen, valid_path);
4262                         if (ret < 0)
4263                                 goto out;
4264                         is_orphan = 1;
4265                 }
4266
4267                 list_for_each_entry(cur, &sctx->deleted_refs, list) {
4268                         ret = dup_ref(cur, &check_dirs);
4269                         if (ret < 0)
4270                                 goto out;
4271                 }
4272         } else if (S_ISDIR(sctx->cur_inode_mode) &&
4273                    !list_empty(&sctx->deleted_refs)) {
4274                 /*
4275                  * We have a moved dir. Add the old parent to check_dirs
4276                  */
4277                 cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
4278                                 list);
4279                 ret = dup_ref(cur, &check_dirs);
4280                 if (ret < 0)
4281                         goto out;
4282         } else if (!S_ISDIR(sctx->cur_inode_mode)) {
4283                 /*
4284                  * We have a non dir inode. Go through all deleted refs and
4285                  * unlink them if they were not already overwritten by other
4286                  * inodes.
4287                  */
4288                 list_for_each_entry(cur, &sctx->deleted_refs, list) {
4289                         ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
4290                                         sctx->cur_ino, sctx->cur_inode_gen,
4291                                         cur->name, cur->name_len);
4292                         if (ret < 0)
4293                                 goto out;
4294                         if (!ret) {
4295                                 /*
4296                                  * If we orphanized any ancestor before, we need
4297                                  * to recompute the full path for deleted names,
4298                                  * since any such path was computed before we
4299                                  * processed any references and orphanized any
4300                                  * ancestor inode.
4301                                  */
4302                                 if (orphanized_ancestor) {
4303                                         ret = update_ref_path(sctx, cur);
4304                                         if (ret < 0)
4305                                                 goto out;
4306                                 }
4307                                 ret = send_unlink(sctx, cur->full_path);
4308                                 if (ret < 0)
4309                                         goto out;
4310                         }
4311                         ret = dup_ref(cur, &check_dirs);
4312                         if (ret < 0)
4313                                 goto out;
4314                 }
4315                 /*
4316                  * If the inode is still orphan, unlink the orphan. This may
4317                  * happen when a previous inode did overwrite the first ref
4318                  * of this inode and no new refs were added for the current
4319                  * inode. Unlinking does not mean that the inode is deleted in
4320                  * all cases. There may still be links to this inode in other
4321                  * places.
4322                  */
4323                 if (is_orphan) {
4324                         ret = send_unlink(sctx, valid_path);
4325                         if (ret < 0)
4326                                 goto out;
4327                 }
4328         }
4329
4330         /*
4331          * We did collect all parent dirs where cur_inode was once located. We
4332          * now go through all these dirs and check if they are pending for
4333          * deletion and if it's finally possible to perform the rmdir now.
4334          * We also update the inode stats of the parent dirs here.
4335          */
4336         list_for_each_entry(cur, &check_dirs, list) {
4337                 /*
4338                  * In case we had refs into dirs that were not processed yet,
4339                  * we don't need to do the utime and rmdir logic for these dirs.
4340                  * The dir will be processed later.
4341                  */
4342                 if (cur->dir > sctx->cur_ino)
4343                         continue;
4344
4345                 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
4346                 if (ret < 0)
4347                         goto out;
4348
4349                 if (ret == inode_state_did_create ||
4350                     ret == inode_state_no_change) {
4351                         /* TODO delayed utimes */
4352                         ret = send_utimes(sctx, cur->dir, cur->dir_gen);
4353                         if (ret < 0)
4354                                 goto out;
4355                 } else if (ret == inode_state_did_delete &&
4356                            cur->dir != last_dir_ino_rm) {
4357                         ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
4358                                         sctx->cur_ino);
4359                         if (ret < 0)
4360                                 goto out;
4361                         if (ret) {
4362                                 ret = get_cur_path(sctx, cur->dir,
4363                                                    cur->dir_gen, valid_path);
4364                                 if (ret < 0)
4365                                         goto out;
4366                                 ret = send_rmdir(sctx, valid_path);
4367                                 if (ret < 0)
4368                                         goto out;
4369                                 last_dir_ino_rm = cur->dir;
4370                         }
4371                 }
4372         }
4373
4374         ret = 0;
4375
4376 out:
4377         __free_recorded_refs(&check_dirs);
4378         free_recorded_refs(sctx);
4379         fs_path_free(valid_path);
4380         return ret;
4381 }
4382
4383 static int rbtree_ref_comp(const void *k, const struct rb_node *node)
4384 {
4385         const struct recorded_ref *data = k;
4386         const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
4387         int result;
4388
4389         if (data->dir > ref->dir)
4390                 return 1;
4391         if (data->dir < ref->dir)
4392                 return -1;
4393         if (data->dir_gen > ref->dir_gen)
4394                 return 1;
4395         if (data->dir_gen < ref->dir_gen)
4396                 return -1;
4397         if (data->name_len > ref->name_len)
4398                 return 1;
4399         if (data->name_len < ref->name_len)
4400                 return -1;
4401         result = strcmp(data->name, ref->name);
4402         if (result > 0)
4403                 return 1;
4404         if (result < 0)
4405                 return -1;
4406         return 0;
4407 }
4408
4409 static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent)
4410 {
4411         const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node);
4412
4413         return rbtree_ref_comp(entry, parent) < 0;
4414 }
4415
4416 static int record_ref_in_tree(struct rb_root *root, struct list_head *refs,
4417                               struct fs_path *name, u64 dir, u64 dir_gen,
4418                               struct send_ctx *sctx)
4419 {
4420         int ret = 0;
4421         struct fs_path *path = NULL;
4422         struct recorded_ref *ref = NULL;
4423
4424         path = fs_path_alloc();
4425         if (!path) {
4426                 ret = -ENOMEM;
4427                 goto out;
4428         }
4429
4430         ref = recorded_ref_alloc();
4431         if (!ref) {
4432                 ret = -ENOMEM;
4433                 goto out;
4434         }
4435
4436         ret = get_cur_path(sctx, dir, dir_gen, path);
4437         if (ret < 0)
4438                 goto out;
4439         ret = fs_path_add_path(path, name);
4440         if (ret < 0)
4441                 goto out;
4442
4443         ref->dir = dir;
4444         ref->dir_gen = dir_gen;
4445         set_ref_path(ref, path);
4446         list_add_tail(&ref->list, refs);
4447         rb_add(&ref->node, root, rbtree_ref_less);
4448         ref->root = root;
4449 out:
4450         if (ret) {
4451                 if (path && (!ref || !ref->full_path))
4452                         fs_path_free(path);
4453                 recorded_ref_free(ref);
4454         }
4455         return ret;
4456 }
4457
4458 static int record_new_ref_if_needed(int num, u64 dir, int index,
4459                                     struct fs_path *name, void *ctx)
4460 {
4461         int ret = 0;
4462         struct send_ctx *sctx = ctx;
4463         struct rb_node *node = NULL;
4464         struct recorded_ref data;
4465         struct recorded_ref *ref;
4466         u64 dir_gen;
4467
4468         ret = get_inode_gen(sctx->send_root, dir, &dir_gen);
4469         if (ret < 0)
4470                 goto out;
4471
4472         data.dir = dir;
4473         data.dir_gen = dir_gen;
4474         set_ref_path(&data, name);
4475         node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp);
4476         if (node) {
4477                 ref = rb_entry(node, struct recorded_ref, node);
4478                 recorded_ref_free(ref);
4479         } else {
4480                 ret = record_ref_in_tree(&sctx->rbtree_new_refs,
4481                                          &sctx->new_refs, name, dir, dir_gen,
4482                                          sctx);
4483         }
4484 out:
4485         return ret;
4486 }
4487
4488 static int record_deleted_ref_if_needed(int num, u64 dir, int index,
4489                                         struct fs_path *name, void *ctx)
4490 {
4491         int ret = 0;
4492         struct send_ctx *sctx = ctx;
4493         struct rb_node *node = NULL;
4494         struct recorded_ref data;
4495         struct recorded_ref *ref;
4496         u64 dir_gen;
4497
4498         ret = get_inode_gen(sctx->parent_root, dir, &dir_gen);
4499         if (ret < 0)
4500                 goto out;
4501
4502         data.dir = dir;
4503         data.dir_gen = dir_gen;
4504         set_ref_path(&data, name);
4505         node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp);
4506         if (node) {
4507                 ref = rb_entry(node, struct recorded_ref, node);
4508                 recorded_ref_free(ref);
4509         } else {
4510                 ret = record_ref_in_tree(&sctx->rbtree_deleted_refs,
4511                                          &sctx->deleted_refs, name, dir,
4512                                          dir_gen, sctx);
4513         }
4514 out:
4515         return ret;
4516 }
4517
4518 static int record_new_ref(struct send_ctx *sctx)
4519 {
4520         int ret;
4521
4522         ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
4523                                 sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
4524         if (ret < 0)
4525                 goto out;
4526         ret = 0;
4527
4528 out:
4529         return ret;
4530 }
4531
4532 static int record_deleted_ref(struct send_ctx *sctx)
4533 {
4534         int ret;
4535
4536         ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
4537                                 sctx->cmp_key, 0, record_deleted_ref_if_needed,
4538                                 sctx);
4539         if (ret < 0)
4540                 goto out;
4541         ret = 0;
4542
4543 out:
4544         return ret;
4545 }
4546
4547 static int record_changed_ref(struct send_ctx *sctx)
4548 {
4549         int ret = 0;
4550
4551         ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
4552                         sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
4553         if (ret < 0)
4554                 goto out;
4555         ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
4556                         sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx);
4557         if (ret < 0)
4558                 goto out;
4559         ret = 0;
4560
4561 out:
4562         return ret;
4563 }
4564
4565 /*
4566  * Record and process all refs at once. Needed when an inode changes the
4567  * generation number, which means that it was deleted and recreated.
4568  */
4569 static int process_all_refs(struct send_ctx *sctx,
4570                             enum btrfs_compare_tree_result cmd)
4571 {
4572         int ret = 0;
4573         int iter_ret = 0;
4574         struct btrfs_root *root;
4575         struct btrfs_path *path;
4576         struct btrfs_key key;
4577         struct btrfs_key found_key;
4578         iterate_inode_ref_t cb;
4579         int pending_move = 0;
4580
4581         path = alloc_path_for_send();
4582         if (!path)
4583                 return -ENOMEM;
4584
4585         if (cmd == BTRFS_COMPARE_TREE_NEW) {
4586                 root = sctx->send_root;
4587                 cb = record_new_ref_if_needed;
4588         } else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
4589                 root = sctx->parent_root;
4590                 cb = record_deleted_ref_if_needed;
4591         } else {
4592                 btrfs_err(sctx->send_root->fs_info,
4593                                 "Wrong command %d in process_all_refs", cmd);
4594                 ret = -EINVAL;
4595                 goto out;
4596         }
4597
4598         key.objectid = sctx->cmp_key->objectid;
4599         key.type = BTRFS_INODE_REF_KEY;
4600         key.offset = 0;
4601         btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
4602                 if (found_key.objectid != key.objectid ||
4603                     (found_key.type != BTRFS_INODE_REF_KEY &&
4604                      found_key.type != BTRFS_INODE_EXTREF_KEY))
4605                         break;
4606
4607                 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
4608                 if (ret < 0)
4609                         goto out;
4610         }
4611         /* Catch error found during iteration */
4612         if (iter_ret < 0) {
4613                 ret = iter_ret;
4614                 goto out;
4615         }
4616         btrfs_release_path(path);
4617
4618         /*
4619          * We don't actually care about pending_move as we are simply
4620          * re-creating this inode and will be rename'ing it into place once we
4621          * rename the parent directory.
4622          */
4623         ret = process_recorded_refs(sctx, &pending_move);
4624 out:
4625         btrfs_free_path(path);
4626         return ret;
4627 }
4628
4629 static int send_set_xattr(struct send_ctx *sctx,
4630                           struct fs_path *path,
4631                           const char *name, int name_len,
4632                           const char *data, int data_len)
4633 {
4634         int ret = 0;
4635
4636         ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
4637         if (ret < 0)
4638                 goto out;
4639
4640         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4641         TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4642         TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
4643
4644         ret = send_cmd(sctx);
4645
4646 tlv_put_failure:
4647 out:
4648         return ret;
4649 }
4650
4651 static int send_remove_xattr(struct send_ctx *sctx,
4652                           struct fs_path *path,
4653                           const char *name, int name_len)
4654 {
4655         int ret = 0;
4656
4657         ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
4658         if (ret < 0)
4659                 goto out;
4660
4661         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4662         TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4663
4664         ret = send_cmd(sctx);
4665
4666 tlv_put_failure:
4667 out:
4668         return ret;
4669 }
4670
4671 static int __process_new_xattr(int num, struct btrfs_key *di_key,
4672                                const char *name, int name_len, const char *data,
4673                                int data_len, void *ctx)
4674 {
4675         int ret;
4676         struct send_ctx *sctx = ctx;
4677         struct fs_path *p;
4678         struct posix_acl_xattr_header dummy_acl;
4679
4680         /* Capabilities are emitted by finish_inode_if_needed */
4681         if (!strncmp(name, XATTR_NAME_CAPS, name_len))
4682                 return 0;
4683
4684         p = fs_path_alloc();
4685         if (!p)
4686                 return -ENOMEM;
4687
4688         /*
4689          * This hack is needed because empty acls are stored as zero byte
4690          * data in xattrs. Problem with that is, that receiving these zero byte
4691          * acls will fail later. To fix this, we send a dummy acl list that
4692          * only contains the version number and no entries.
4693          */
4694         if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
4695             !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
4696                 if (data_len == 0) {
4697                         dummy_acl.a_version =
4698                                         cpu_to_le32(POSIX_ACL_XATTR_VERSION);
4699                         data = (char *)&dummy_acl;
4700                         data_len = sizeof(dummy_acl);
4701                 }
4702         }
4703
4704         ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4705         if (ret < 0)
4706                 goto out;
4707
4708         ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
4709
4710 out:
4711         fs_path_free(p);
4712         return ret;
4713 }
4714
4715 static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
4716                                    const char *name, int name_len,
4717                                    const char *data, int data_len, void *ctx)
4718 {
4719         int ret;
4720         struct send_ctx *sctx = ctx;
4721         struct fs_path *p;
4722
4723         p = fs_path_alloc();
4724         if (!p)
4725                 return -ENOMEM;
4726
4727         ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4728         if (ret < 0)
4729                 goto out;
4730
4731         ret = send_remove_xattr(sctx, p, name, name_len);
4732
4733 out:
4734         fs_path_free(p);
4735         return ret;
4736 }
4737
4738 static int process_new_xattr(struct send_ctx *sctx)
4739 {
4740         int ret = 0;
4741
4742         ret = iterate_dir_item(sctx->send_root, sctx->left_path,
4743                                __process_new_xattr, sctx);
4744
4745         return ret;
4746 }
4747
4748 static int process_deleted_xattr(struct send_ctx *sctx)
4749 {
4750         return iterate_dir_item(sctx->parent_root, sctx->right_path,
4751                                 __process_deleted_xattr, sctx);
4752 }
4753
/*
 * Search state shared between find_xattr() and its __find_xattr() callback.
 */
struct find_xattr_ctx {
	/* Name to look for and its length; set by find_xattr(). */
	const char *name;
	int name_len;
	/*
	 * Item number passed to the callback for the matching entry.
	 * find_xattr() initializes it to -1, meaning "not found".
	 */
	int found_idx;
	/* kmemdup() copy of the matching entry's data, and its length. */
	char *found_data;
	int found_data_len;
};
4761
4762 static int __find_xattr(int num, struct btrfs_key *di_key, const char *name,
4763                         int name_len, const char *data, int data_len, void *vctx)
4764 {
4765         struct find_xattr_ctx *ctx = vctx;
4766
4767         if (name_len == ctx->name_len &&
4768             strncmp(name, ctx->name, name_len) == 0) {
4769                 ctx->found_idx = num;
4770                 ctx->found_data_len = data_len;
4771                 ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
4772                 if (!ctx->found_data)
4773                         return -ENOMEM;
4774                 return 1;
4775         }
4776         return 0;
4777 }
4778
4779 static int find_xattr(struct btrfs_root *root,
4780                       struct btrfs_path *path,
4781                       struct btrfs_key *key,
4782                       const char *name, int name_len,
4783                       char **data, int *data_len)
4784 {
4785         int ret;
4786         struct find_xattr_ctx ctx;
4787
4788         ctx.name = name;
4789         ctx.name_len = name_len;
4790         ctx.found_idx = -1;
4791         ctx.found_data = NULL;
4792         ctx.found_data_len = 0;
4793
4794         ret = iterate_dir_item(root, path, __find_xattr, &ctx);
4795         if (ret < 0)
4796                 return ret;
4797
4798         if (ctx.found_idx == -1)
4799                 return -ENOENT;
4800         if (data) {
4801                 *data = ctx.found_data;
4802                 *data_len = ctx.found_data_len;
4803         } else {
4804                 kfree(ctx.found_data);
4805         }
4806         return ctx.found_idx;
4807 }
4808
4809
4810 static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
4811                                        const char *name, int name_len,
4812                                        const char *data, int data_len,
4813                                        void *ctx)
4814 {
4815         int ret;
4816         struct send_ctx *sctx = ctx;
4817         char *found_data = NULL;
4818         int found_data_len  = 0;
4819
4820         ret = find_xattr(sctx->parent_root, sctx->right_path,
4821                          sctx->cmp_key, name, name_len, &found_data,
4822                          &found_data_len);
4823         if (ret == -ENOENT) {
4824                 ret = __process_new_xattr(num, di_key, name, name_len, data,
4825                                           data_len, ctx);
4826         } else if (ret >= 0) {
4827                 if (data_len != found_data_len ||
4828                     memcmp(data, found_data, data_len)) {
4829                         ret = __process_new_xattr(num, di_key, name, name_len,
4830                                                   data, data_len, ctx);
4831                 } else {
4832                         ret = 0;
4833                 }
4834         }
4835
4836         kfree(found_data);
4837         return ret;
4838 }
4839
4840 static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
4841                                            const char *name, int name_len,
4842                                            const char *data, int data_len,
4843                                            void *ctx)
4844 {
4845         int ret;
4846         struct send_ctx *sctx = ctx;
4847
4848         ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
4849                          name, name_len, NULL, NULL);
4850         if (ret == -ENOENT)
4851                 ret = __process_deleted_xattr(num, di_key, name, name_len, data,
4852                                               data_len, ctx);
4853         else if (ret >= 0)
4854                 ret = 0;
4855
4856         return ret;
4857 }
4858
4859 static int process_changed_xattr(struct send_ctx *sctx)
4860 {
4861         int ret = 0;
4862
4863         ret = iterate_dir_item(sctx->send_root, sctx->left_path,
4864                         __process_changed_new_xattr, sctx);
4865         if (ret < 0)
4866                 goto out;
4867         ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
4868                         __process_changed_deleted_xattr, sctx);
4869
4870 out:
4871         return ret;
4872 }
4873
4874 static int process_all_new_xattrs(struct send_ctx *sctx)
4875 {
4876         int ret = 0;
4877         int iter_ret = 0;
4878         struct btrfs_root *root;
4879         struct btrfs_path *path;
4880         struct btrfs_key key;
4881         struct btrfs_key found_key;
4882
4883         path = alloc_path_for_send();
4884         if (!path)
4885                 return -ENOMEM;
4886
4887         root = sctx->send_root;
4888
4889         key.objectid = sctx->cmp_key->objectid;
4890         key.type = BTRFS_XATTR_ITEM_KEY;
4891         key.offset = 0;
4892         btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
4893                 if (found_key.objectid != key.objectid ||
4894                     found_key.type != key.type) {
4895                         ret = 0;
4896                         break;
4897                 }
4898
4899                 ret = iterate_dir_item(root, path, __process_new_xattr, sctx);
4900                 if (ret < 0)
4901                         break;
4902         }
4903         /* Catch error found during iteration */
4904         if (iter_ret < 0)
4905                 ret = iter_ret;
4906
4907         btrfs_free_path(path);
4908         return ret;
4909 }
4910
4911 static int send_verity(struct send_ctx *sctx, struct fs_path *path,
4912                        struct fsverity_descriptor *desc)
4913 {
4914         int ret;
4915
4916         ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY);
4917         if (ret < 0)
4918                 goto out;
4919
4920         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4921         TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
4922                         le8_to_cpu(desc->hash_algorithm));
4923         TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE,
4924                         1U << le8_to_cpu(desc->log_blocksize));
4925         TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt,
4926                         le8_to_cpu(desc->salt_size));
4927         TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature,
4928                         le32_to_cpu(desc->sig_size));
4929
4930         ret = send_cmd(sctx);
4931
4932 tlv_put_failure:
4933 out:
4934         return ret;
4935 }
4936
4937 static int process_verity(struct send_ctx *sctx)
4938 {
4939         int ret = 0;
4940         struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
4941         struct inode *inode;
4942         struct fs_path *p;
4943
4944         inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root);
4945         if (IS_ERR(inode))
4946                 return PTR_ERR(inode);
4947
4948         ret = btrfs_get_verity_descriptor(inode, NULL, 0);
4949         if (ret < 0)
4950                 goto iput;
4951
4952         if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
4953                 ret = -EMSGSIZE;
4954                 goto iput;
4955         }
4956         if (!sctx->verity_descriptor) {
4957                 sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE,
4958                                                    GFP_KERNEL);
4959                 if (!sctx->verity_descriptor) {
4960                         ret = -ENOMEM;
4961                         goto iput;
4962                 }
4963         }
4964
4965         ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret);
4966         if (ret < 0)
4967                 goto iput;
4968
4969         p = fs_path_alloc();
4970         if (!p) {
4971                 ret = -ENOMEM;
4972                 goto iput;
4973         }
4974         ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4975         if (ret < 0)
4976                 goto free_path;
4977
4978         ret = send_verity(sctx, p, sctx->verity_descriptor);
4979         if (ret < 0)
4980                 goto free_path;
4981
4982 free_path:
4983         fs_path_free(p);
4984 iput:
4985         iput(inode);
4986         return ret;
4987 }
4988
4989 static inline u64 max_send_read_size(const struct send_ctx *sctx)
4990 {
4991         return sctx->send_max_size - SZ_16K;
4992 }
4993
4994 static int put_data_header(struct send_ctx *sctx, u32 len)
4995 {
4996         if (WARN_ON_ONCE(sctx->put_data))
4997                 return -EINVAL;
4998         sctx->put_data = true;
4999         if (sctx->proto >= 2) {
5000                 /*
5001                  * Since v2, the data attribute header doesn't include a length,
5002                  * it is implicitly to the end of the command.
5003                  */
5004                 if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)
5005                         return -EOVERFLOW;
5006                 put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size);
5007                 sctx->send_size += sizeof(__le16);
5008         } else {
5009                 struct btrfs_tlv_header *hdr;
5010
5011                 if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
5012                         return -EOVERFLOW;
5013                 hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
5014                 put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
5015                 put_unaligned_le16(len, &hdr->tlv_len);
5016                 sctx->send_size += sizeof(*hdr);
5017         }
5018         return 0;
5019 }
5020
5021 static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
5022 {
5023         struct btrfs_root *root = sctx->send_root;
5024         struct btrfs_fs_info *fs_info = root->fs_info;
5025         struct page *page;
5026         pgoff_t index = offset >> PAGE_SHIFT;
5027         pgoff_t last_index;
5028         unsigned pg_offset = offset_in_page(offset);
5029         int ret;
5030
5031         ret = put_data_header(sctx, len);
5032         if (ret)
5033                 return ret;
5034
5035         last_index = (offset + len - 1) >> PAGE_SHIFT;
5036
5037         while (index <= last_index) {
5038                 unsigned cur_len = min_t(unsigned, len,
5039                                          PAGE_SIZE - pg_offset);
5040
5041                 page = find_lock_page(sctx->cur_inode->i_mapping, index);
5042                 if (!page) {
5043                         page_cache_sync_readahead(sctx->cur_inode->i_mapping,
5044                                                   &sctx->ra, NULL, index,
5045                                                   last_index + 1 - index);
5046
5047                         page = find_or_create_page(sctx->cur_inode->i_mapping,
5048                                                    index, GFP_KERNEL);
5049                         if (!page) {
5050                                 ret = -ENOMEM;
5051                                 break;
5052                         }
5053                 }
5054
5055                 if (PageReadahead(page))
5056                         page_cache_async_readahead(sctx->cur_inode->i_mapping,
5057                                                    &sctx->ra, NULL, page_folio(page),
5058                                                    index, last_index + 1 - index);
5059
5060                 if (!PageUptodate(page)) {
5061                         btrfs_read_folio(NULL, page_folio(page));
5062                         lock_page(page);
5063                         if (!PageUptodate(page)) {
5064                                 unlock_page(page);
5065                                 btrfs_err(fs_info,
5066                         "send: IO error at offset %llu for inode %llu root %llu",
5067                                         page_offset(page), sctx->cur_ino,
5068                                         sctx->send_root->root_key.objectid);
5069                                 put_page(page);
5070                                 ret = -EIO;
5071                                 break;
5072                         }
5073                 }
5074
5075                 memcpy_from_page(sctx->send_buf + sctx->send_size, page,
5076                                  pg_offset, cur_len);
5077                 unlock_page(page);
5078                 put_page(page);
5079                 index++;
5080                 pg_offset = 0;
5081                 len -= cur_len;
5082                 sctx->send_size += cur_len;
5083         }
5084
5085         return ret;
5086 }
5087
5088 /*
5089  * Read some bytes from the current inode/file and send a write command to
5090  * user space.
5091  */
5092 static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
5093 {
5094         struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
5095         int ret = 0;
5096         struct fs_path *p;
5097
5098         p = fs_path_alloc();
5099         if (!p)
5100                 return -ENOMEM;
5101
5102         btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
5103
5104         ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
5105         if (ret < 0)
5106                 goto out;
5107
5108         ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5109         if (ret < 0)
5110                 goto out;
5111
5112         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5113         TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5114         ret = put_file_data(sctx, offset, len);
5115         if (ret < 0)
5116                 goto out;
5117
5118         ret = send_cmd(sctx);
5119
5120 tlv_put_failure:
5121 out:
5122         fs_path_free(p);
5123         return ret;
5124 }
5125
5126 /*
5127  * Send a clone command to user space.
5128  */
5129 static int send_clone(struct send_ctx *sctx,
5130                       u64 offset, u32 len,
5131                       struct clone_root *clone_root)
5132 {
5133         int ret = 0;
5134         struct fs_path *p;
5135         u64 gen;
5136
5137         btrfs_debug(sctx->send_root->fs_info,
5138                     "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
5139                     offset, len, clone_root->root->root_key.objectid,
5140                     clone_root->ino, clone_root->offset);
5141
5142         p = fs_path_alloc();
5143         if (!p)
5144                 return -ENOMEM;
5145
5146         ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
5147         if (ret < 0)
5148                 goto out;
5149
5150         ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5151         if (ret < 0)
5152                 goto out;
5153
5154         TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5155         TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
5156         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5157
5158         if (clone_root->root == sctx->send_root) {
5159                 ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
5160                 if (ret < 0)
5161                         goto out;
5162                 ret = get_cur_path(sctx, clone_root->ino, gen, p);
5163         } else {
5164                 ret = get_inode_path(clone_root->root, clone_root->ino, p);
5165         }
5166         if (ret < 0)
5167                 goto out;
5168
5169         /*
5170          * If the parent we're using has a received_uuid set then use that as
5171          * our clone source as that is what we will look for when doing a
5172          * receive.
5173          *
5174          * This covers the case that we create a snapshot off of a received
5175          * subvolume and then use that as the parent and try to receive on a
5176          * different host.
5177          */
5178         if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
5179                 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
5180                              clone_root->root->root_item.received_uuid);
5181         else
5182                 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
5183                              clone_root->root->root_item.uuid);
5184         TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
5185                     btrfs_root_ctransid(&clone_root->root->root_item));
5186         TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
5187         TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
5188                         clone_root->offset);
5189
5190         ret = send_cmd(sctx);
5191
5192 tlv_put_failure:
5193 out:
5194         fs_path_free(p);
5195         return ret;
5196 }
5197
5198 /*
5199  * Send an update extent command to user space.
5200  */
5201 static int send_update_extent(struct send_ctx *sctx,
5202                               u64 offset, u32 len)
5203 {
5204         int ret = 0;
5205         struct fs_path *p;
5206
5207         p = fs_path_alloc();
5208         if (!p)
5209                 return -ENOMEM;
5210
5211         ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
5212         if (ret < 0)
5213                 goto out;
5214
5215         ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5216         if (ret < 0)
5217                 goto out;
5218
5219         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5220         TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5221         TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
5222
5223         ret = send_cmd(sctx);
5224
5225 tlv_put_failure:
5226 out:
5227         fs_path_free(p);
5228         return ret;
5229 }
5230
5231 static int send_hole(struct send_ctx *sctx, u64 end)
5232 {
5233         struct fs_path *p = NULL;
5234         u64 read_size = max_send_read_size(sctx);
5235         u64 offset = sctx->cur_inode_last_extent;
5236         int ret = 0;
5237
5238         /*
5239          * A hole that starts at EOF or beyond it. Since we do not yet support
5240          * fallocate (for extent preallocation and hole punching), sending a
5241          * write of zeroes starting at EOF or beyond would later require issuing
5242          * a truncate operation which would undo the write and achieve nothing.
5243          */
5244         if (offset >= sctx->cur_inode_size)
5245                 return 0;
5246
5247         /*
5248          * Don't go beyond the inode's i_size due to prealloc extents that start
5249          * after the i_size.
5250          */
5251         end = min_t(u64, end, sctx->cur_inode_size);
5252
5253         if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
5254                 return send_update_extent(sctx, offset, end - offset);
5255
5256         p = fs_path_alloc();
5257         if (!p)
5258                 return -ENOMEM;
5259         ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5260         if (ret < 0)
5261                 goto tlv_put_failure;
5262         while (offset < end) {
5263                 u64 len = min(end - offset, read_size);
5264
5265                 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
5266                 if (ret < 0)
5267                         break;
5268                 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5269                 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5270                 ret = put_data_header(sctx, len);
5271                 if (ret < 0)
5272                         break;
5273                 memset(sctx->send_buf + sctx->send_size, 0, len);
5274                 sctx->send_size += len;
5275                 ret = send_cmd(sctx);
5276                 if (ret < 0)
5277                         break;
5278                 offset += len;
5279         }
5280         sctx->cur_inode_next_write_offset = offset;
5281 tlv_put_failure:
5282         fs_path_free(p);
5283         return ret;
5284 }
5285
5286 static int send_encoded_inline_extent(struct send_ctx *sctx,
5287                                       struct btrfs_path *path, u64 offset,
5288                                       u64 len)
5289 {
5290         struct btrfs_root *root = sctx->send_root;
5291         struct btrfs_fs_info *fs_info = root->fs_info;
5292         struct inode *inode;
5293         struct fs_path *fspath;
5294         struct extent_buffer *leaf = path->nodes[0];
5295         struct btrfs_key key;
5296         struct btrfs_file_extent_item *ei;
5297         u64 ram_bytes;
5298         size_t inline_size;
5299         int ret;
5300
5301         inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
5302         if (IS_ERR(inode))
5303                 return PTR_ERR(inode);
5304
5305         fspath = fs_path_alloc();
5306         if (!fspath) {
5307                 ret = -ENOMEM;
5308                 goto out;
5309         }
5310
5311         ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
5312         if (ret < 0)
5313                 goto out;
5314
5315         ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
5316         if (ret < 0)
5317                 goto out;
5318
5319         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5320         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
5321         ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei);
5322         inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
5323
5324         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
5325         TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5326         TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
5327                     min(key.offset + ram_bytes - offset, len));
5328         TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes);
5329         TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset);
5330         ret = btrfs_encoded_io_compression_from_extent(fs_info,
5331                                 btrfs_file_extent_compression(leaf, ei));
5332         if (ret < 0)
5333                 goto out;
5334         TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
5335
5336         ret = put_data_header(sctx, inline_size);
5337         if (ret < 0)
5338                 goto out;
5339         read_extent_buffer(leaf, sctx->send_buf + sctx->send_size,
5340                            btrfs_file_extent_inline_start(ei), inline_size);
5341         sctx->send_size += inline_size;
5342
5343         ret = send_cmd(sctx);
5344
5345 tlv_put_failure:
5346 out:
5347         fs_path_free(fspath);
5348         iput(inode);
5349         return ret;
5350 }
5351
5352 static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
5353                                u64 offset, u64 len)
5354 {
5355         struct btrfs_root *root = sctx->send_root;
5356         struct btrfs_fs_info *fs_info = root->fs_info;
5357         struct inode *inode;
5358         struct fs_path *fspath;
5359         struct extent_buffer *leaf = path->nodes[0];
5360         struct btrfs_key key;
5361         struct btrfs_file_extent_item *ei;
5362         u64 disk_bytenr, disk_num_bytes;
5363         u32 data_offset;
5364         struct btrfs_cmd_header *hdr;
5365         u32 crc;
5366         int ret;
5367
5368         inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
5369         if (IS_ERR(inode))
5370                 return PTR_ERR(inode);
5371
5372         fspath = fs_path_alloc();
5373         if (!fspath) {
5374                 ret = -ENOMEM;
5375                 goto out;
5376         }
5377
5378         ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
5379         if (ret < 0)
5380                 goto out;
5381
5382         ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
5383         if (ret < 0)
5384                 goto out;
5385
5386         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5387         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
5388         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
5389         disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei);
5390
5391         TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
5392         TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5393         TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
5394                     min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset,
5395                         len));
5396         TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN,
5397                     btrfs_file_extent_ram_bytes(leaf, ei));
5398         TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET,
5399                     offset - key.offset + btrfs_file_extent_offset(leaf, ei));
5400         ret = btrfs_encoded_io_compression_from_extent(fs_info,
5401                                 btrfs_file_extent_compression(leaf, ei));
5402         if (ret < 0)
5403                 goto out;
5404         TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
5405         TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0);
5406
5407         ret = put_data_header(sctx, disk_num_bytes);
5408         if (ret < 0)
5409                 goto out;
5410
5411         /*
5412          * We want to do I/O directly into the send buffer, so get the next page
5413          * boundary in the send buffer. This means that there may be a gap
5414          * between the beginning of the command and the file data.
5415          */
5416         data_offset = ALIGN(sctx->send_size, PAGE_SIZE);
5417         if (data_offset > sctx->send_max_size ||
5418             sctx->send_max_size - data_offset < disk_num_bytes) {
5419                 ret = -EOVERFLOW;
5420                 goto out;
5421         }
5422
5423         /*
5424          * Note that send_buf is a mapping of send_buf_pages, so this is really
5425          * reading into send_buf.
5426          */
5427         ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset,
5428                                                     disk_bytenr, disk_num_bytes,
5429                                                     sctx->send_buf_pages +
5430                                                     (data_offset >> PAGE_SHIFT));
5431         if (ret)
5432                 goto out;
5433
5434         hdr = (struct btrfs_cmd_header *)sctx->send_buf;
5435         hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
5436         hdr->crc = 0;
5437         crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size);
5438         crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
5439         hdr->crc = cpu_to_le32(crc);
5440
5441         ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
5442                         &sctx->send_off);
5443         if (!ret) {
5444                 ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset,
5445                                 disk_num_bytes, &sctx->send_off);
5446         }
5447         sctx->send_size = 0;
5448         sctx->put_data = false;
5449
5450 tlv_put_failure:
5451 out:
5452         fs_path_free(fspath);
5453         iput(inode);
5454         return ret;
5455 }
5456
5457 static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
5458                             const u64 offset, const u64 len)
5459 {
5460         const u64 end = offset + len;
5461         struct extent_buffer *leaf = path->nodes[0];
5462         struct btrfs_file_extent_item *ei;
5463         u64 read_size = max_send_read_size(sctx);
5464         u64 sent = 0;
5465
5466         if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
5467                 return send_update_extent(sctx, offset, len);
5468
5469         ei = btrfs_item_ptr(leaf, path->slots[0],
5470                             struct btrfs_file_extent_item);
5471         if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
5472             btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
5473                 bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
5474                                   BTRFS_FILE_EXTENT_INLINE);
5475
5476                 /*
5477                  * Send the compressed extent unless the compressed data is
5478                  * larger than the decompressed data. This can happen if we're
5479                  * not sending the entire extent, either because it has been
5480                  * partially overwritten/truncated or because this is a part of
5481                  * the extent that we couldn't clone in clone_range().
5482                  */
5483                 if (is_inline &&
5484                     btrfs_file_extent_inline_item_len(leaf,
5485                                                       path->slots[0]) <= len) {
5486                         return send_encoded_inline_extent(sctx, path, offset,
5487                                                           len);
5488                 } else if (!is_inline &&
5489                            btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) {
5490                         return send_encoded_extent(sctx, path, offset, len);
5491                 }
5492         }
5493
5494         if (sctx->cur_inode == NULL) {
5495                 struct btrfs_root *root = sctx->send_root;
5496
5497                 sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
5498                 if (IS_ERR(sctx->cur_inode)) {
5499                         int err = PTR_ERR(sctx->cur_inode);
5500
5501                         sctx->cur_inode = NULL;
5502                         return err;
5503                 }
5504                 memset(&sctx->ra, 0, sizeof(struct file_ra_state));
5505                 file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
5506
5507                 /*
5508                  * It's very likely there are no pages from this inode in the page
5509                  * cache, so after reading extents and sending their data, we clean
5510                  * the page cache to avoid trashing the page cache (adding pressure
5511                  * to the page cache and forcing eviction of other data more useful
5512                  * for applications).
5513                  *
5514                  * We decide if we should clean the page cache simply by checking
5515                  * if the inode's mapping nrpages is 0 when we first open it, and
5516                  * not by using something like filemap_range_has_page() before
5517                  * reading an extent because when we ask the readahead code to
5518                  * read a given file range, it may (and almost always does) read
5519                  * pages from beyond that range (see the documentation for
5520                  * page_cache_sync_readahead()), so it would not be reliable,
5521                  * because after reading the first extent future calls to
5522                  * filemap_range_has_page() would return true because the readahead
5523                  * on the previous extent resulted in reading pages of the current
5524                  * extent as well.
5525                  */
5526                 sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
5527                 sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
5528         }
5529
5530         while (sent < len) {
5531                 u64 size = min(len - sent, read_size);
5532                 int ret;
5533
5534                 ret = send_write(sctx, offset + sent, size);
5535                 if (ret < 0)
5536                         return ret;
5537                 sent += size;
5538         }
5539
5540         if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) {
5541                 /*
5542                  * Always operate only on ranges that are a multiple of the page
5543                  * size. This is not only to prevent zeroing parts of a page in
5544                  * the case of subpage sector size, but also to guarantee we evict
5545                  * pages, as passing a range that is smaller than page size does
5546                  * not evict the respective page (only zeroes part of its content).
5547                  *
5548                  * Always start from the end offset of the last range cleared.
5549                  * This is because the readahead code may (and very often does)
5550                  * reads pages beyond the range we request for readahead. So if
5551                  * we have an extent layout like this:
5552                  *
5553                  *            [ extent A ] [ extent B ] [ extent C ]
5554                  *
5555                  * When we ask page_cache_sync_readahead() to read extent A, it
5556                  * may also trigger reads for pages of extent B. If we are doing
5557                  * an incremental send and extent B has not changed between the
5558                  * parent and send snapshots, some or all of its pages may end
5559                  * up being read and placed in the page cache. So when truncating
5560                  * the page cache we always start from the end offset of the
5561                  * previously processed extent up to the end of the current
5562                  * extent.
5563                  */
5564                 truncate_inode_pages_range(&sctx->cur_inode->i_data,
5565                                            sctx->page_cache_clear_start,
5566                                            end - 1);
5567                 sctx->page_cache_clear_start = end;
5568         }
5569
5570         return 0;
5571 }
5572
5573 /*
5574  * Search for a capability xattr related to sctx->cur_ino. If the capability is
5575  * found, call send_set_xattr function to emit it.
5576  *
5577  * Return 0 if there isn't a capability, or when the capability was emitted
5578  * successfully, or < 0 if an error occurred.
5579  */
5580 static int send_capabilities(struct send_ctx *sctx)
5581 {
5582         struct fs_path *fspath = NULL;
5583         struct btrfs_path *path;
5584         struct btrfs_dir_item *di;
5585         struct extent_buffer *leaf;
5586         unsigned long data_ptr;
5587         char *buf = NULL;
5588         int buf_len;
5589         int ret = 0;
5590
5591         path = alloc_path_for_send();
5592         if (!path)
5593                 return -ENOMEM;
5594
5595         di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino,
5596                                 XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0);
5597         if (!di) {
5598                 /* There is no xattr for this inode */
5599                 goto out;
5600         } else if (IS_ERR(di)) {
5601                 ret = PTR_ERR(di);
5602                 goto out;
5603         }
5604
5605         leaf = path->nodes[0];
5606         buf_len = btrfs_dir_data_len(leaf, di);
5607
5608         fspath = fs_path_alloc();
5609         buf = kmalloc(buf_len, GFP_KERNEL);
5610         if (!fspath || !buf) {
5611                 ret = -ENOMEM;
5612                 goto out;
5613         }
5614
5615         ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
5616         if (ret < 0)
5617                 goto out;
5618
5619         data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
5620         read_extent_buffer(leaf, buf, data_ptr, buf_len);
5621
5622         ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
5623                         strlen(XATTR_NAME_CAPS), buf, buf_len);
5624 out:
5625         kfree(buf);
5626         fs_path_free(fspath);
5627         btrfs_free_path(path);
5628         return ret;
5629 }
5630
5631 static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
5632                        struct clone_root *clone_root, const u64 disk_byte,
5633                        u64 data_offset, u64 offset, u64 len)
5634 {
5635         struct btrfs_path *path;
5636         struct btrfs_key key;
5637         int ret;
5638         struct btrfs_inode_info info;
5639         u64 clone_src_i_size = 0;
5640
5641         /*
5642          * Prevent cloning from a zero offset with a length matching the sector
5643          * size because in some scenarios this will make the receiver fail.
5644          *
5645          * For example, if in the source filesystem the extent at offset 0
5646          * has a length of sectorsize and it was written using direct IO, then
5647          * it can never be an inline extent (even if compression is enabled).
5648          * Then this extent can be cloned in the original filesystem to a non
5649          * zero file offset, but it may not be possible to clone in the
5650          * destination filesystem because it can be inlined due to compression
5651          * on the destination filesystem (as the receiver's write operations are
5652          * always done using buffered IO). The same happens when the original
5653          * filesystem does not have compression enabled but the destination
5654          * filesystem has.
5655          */
5656         if (clone_root->offset == 0 &&
5657             len == sctx->send_root->fs_info->sectorsize)
5658                 return send_extent_data(sctx, dst_path, offset, len);
5659
5660         path = alloc_path_for_send();
5661         if (!path)
5662                 return -ENOMEM;
5663
5664         /*
5665          * There are inodes that have extents that lie behind its i_size. Don't
5666          * accept clones from these extents.
5667          */
5668         ret = get_inode_info(clone_root->root, clone_root->ino, &info);
5669         btrfs_release_path(path);
5670         if (ret < 0)
5671                 goto out;
5672         clone_src_i_size = info.size;
5673
5674         /*
5675          * We can't send a clone operation for the entire range if we find
5676          * extent items in the respective range in the source file that
5677          * refer to different extents or if we find holes.
5678          * So check for that and do a mix of clone and regular write/copy
5679          * operations if needed.
5680          *
5681          * Example:
5682          *
5683          * mkfs.btrfs -f /dev/sda
5684          * mount /dev/sda /mnt
5685          * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
5686          * cp --reflink=always /mnt/foo /mnt/bar
5687          * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
5688          * btrfs subvolume snapshot -r /mnt /mnt/snap
5689          *
5690          * If when we send the snapshot and we are processing file bar (which
5691          * has a higher inode number than foo) we blindly send a clone operation
5692          * for the [0, 100K[ range from foo to bar, the receiver ends up getting
5693          * a file bar that matches the content of file foo - iow, doesn't match
5694          * the content from bar in the original filesystem.
5695          */
5696         key.objectid = clone_root->ino;
5697         key.type = BTRFS_EXTENT_DATA_KEY;
5698         key.offset = clone_root->offset;
5699         ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
5700         if (ret < 0)
5701                 goto out;
5702         if (ret > 0 && path->slots[0] > 0) {
5703                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
5704                 if (key.objectid == clone_root->ino &&
5705                     key.type == BTRFS_EXTENT_DATA_KEY)
5706                         path->slots[0]--;
5707         }
5708
5709         while (true) {
5710                 struct extent_buffer *leaf = path->nodes[0];
5711                 int slot = path->slots[0];
5712                 struct btrfs_file_extent_item *ei;
5713                 u8 type;
5714                 u64 ext_len;
5715                 u64 clone_len;
5716                 u64 clone_data_offset;
5717                 bool crossed_src_i_size = false;
5718
5719                 if (slot >= btrfs_header_nritems(leaf)) {
5720                         ret = btrfs_next_leaf(clone_root->root, path);
5721                         if (ret < 0)
5722                                 goto out;
5723                         else if (ret > 0)
5724                                 break;
5725                         continue;
5726                 }
5727
5728                 btrfs_item_key_to_cpu(leaf, &key, slot);
5729
5730                 /*
5731                  * We might have an implicit trailing hole (NO_HOLES feature
5732                  * enabled). We deal with it after leaving this loop.
5733                  */
5734                 if (key.objectid != clone_root->ino ||
5735                     key.type != BTRFS_EXTENT_DATA_KEY)
5736                         break;
5737
5738                 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5739                 type = btrfs_file_extent_type(leaf, ei);
5740                 if (type == BTRFS_FILE_EXTENT_INLINE) {
5741                         ext_len = btrfs_file_extent_ram_bytes(leaf, ei);
5742                         ext_len = PAGE_ALIGN(ext_len);
5743                 } else {
5744                         ext_len = btrfs_file_extent_num_bytes(leaf, ei);
5745                 }
5746
5747                 if (key.offset + ext_len <= clone_root->offset)
5748                         goto next;
5749
5750                 if (key.offset > clone_root->offset) {
5751                         /* Implicit hole, NO_HOLES feature enabled. */
5752                         u64 hole_len = key.offset - clone_root->offset;
5753
5754                         if (hole_len > len)
5755                                 hole_len = len;
5756                         ret = send_extent_data(sctx, dst_path, offset,
5757                                                hole_len);
5758                         if (ret < 0)
5759                                 goto out;
5760
5761                         len -= hole_len;
5762                         if (len == 0)
5763                                 break;
5764                         offset += hole_len;
5765                         clone_root->offset += hole_len;
5766                         data_offset += hole_len;
5767                 }
5768
5769                 if (key.offset >= clone_root->offset + len)
5770                         break;
5771
5772                 if (key.offset >= clone_src_i_size)
5773                         break;
5774
5775                 if (key.offset + ext_len > clone_src_i_size) {
5776                         ext_len = clone_src_i_size - key.offset;
5777                         crossed_src_i_size = true;
5778                 }
5779
5780                 clone_data_offset = btrfs_file_extent_offset(leaf, ei);
5781                 if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
5782                         clone_root->offset = key.offset;
5783                         if (clone_data_offset < data_offset &&
5784                                 clone_data_offset + ext_len > data_offset) {
5785                                 u64 extent_offset;
5786
5787                                 extent_offset = data_offset - clone_data_offset;
5788                                 ext_len -= extent_offset;
5789                                 clone_data_offset += extent_offset;
5790                                 clone_root->offset += extent_offset;
5791                         }
5792                 }
5793
5794                 clone_len = min_t(u64, ext_len, len);
5795
5796                 if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
5797                     clone_data_offset == data_offset) {
5798                         const u64 src_end = clone_root->offset + clone_len;
5799                         const u64 sectorsize = SZ_64K;
5800
5801                         /*
5802                          * We can't clone the last block, when its size is not
5803                          * sector size aligned, into the middle of a file. If we
5804                          * do so, the receiver will get a failure (-EINVAL) when
5805                          * trying to clone or will silently corrupt the data in
5806                          * the destination file if it's on a kernel without the
5807                          * fix introduced by commit ac765f83f1397646
5808                          * ("Btrfs: fix data corruption due to cloning of eof
5809                          * block).
5810                          *
5811                          * So issue a clone of the aligned down range plus a
5812                          * regular write for the eof block, if we hit that case.
5813                          *
5814                          * Also, we use the maximum possible sector size, 64K,
5815                          * because we don't know what's the sector size of the
5816                          * filesystem that receives the stream, so we have to
5817                          * assume the largest possible sector size.
5818                          */
5819                         if (src_end == clone_src_i_size &&
5820                             !IS_ALIGNED(src_end, sectorsize) &&
5821                             offset + clone_len < sctx->cur_inode_size) {
5822                                 u64 slen;
5823
5824                                 slen = ALIGN_DOWN(src_end - clone_root->offset,
5825                                                   sectorsize);
5826                                 if (slen > 0) {
5827                                         ret = send_clone(sctx, offset, slen,
5828                                                          clone_root);
5829                                         if (ret < 0)
5830                                                 goto out;
5831                                 }
5832                                 ret = send_extent_data(sctx, dst_path,
5833                                                        offset + slen,
5834                                                        clone_len - slen);
5835                         } else {
5836                                 ret = send_clone(sctx, offset, clone_len,
5837                                                  clone_root);
5838                         }
5839                 } else if (crossed_src_i_size && clone_len < len) {
5840                         /*
5841                          * If we are at i_size of the clone source inode and we
5842                          * can not clone from it, terminate the loop. This is
5843                          * to avoid sending two write operations, one with a
5844                          * length matching clone_len and the final one after
5845                          * this loop with a length of len - clone_len.
5846                          *
5847                          * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
5848                          * was passed to the send ioctl), this helps avoid
5849                          * sending an encoded write for an offset that is not
5850                          * sector size aligned, in case the i_size of the source
5851                          * inode is not sector size aligned. That will make the
5852                          * receiver fallback to decompression of the data and
5853                          * writing it using regular buffered IO, therefore while
5854                          * not incorrect, it's not optimal due decompression and
5855                          * possible re-compression at the receiver.
5856                          */
5857                         break;
5858                 } else {
5859                         ret = send_extent_data(sctx, dst_path, offset,
5860                                                clone_len);
5861                 }
5862
5863                 if (ret < 0)
5864                         goto out;
5865
5866                 len -= clone_len;
5867                 if (len == 0)
5868                         break;
5869                 offset += clone_len;
5870                 clone_root->offset += clone_len;
5871
5872                 /*
5873                  * If we are cloning from the file we are currently processing,
5874                  * and using the send root as the clone root, we must stop once
5875                  * the current clone offset reaches the current eof of the file
5876                  * at the receiver, otherwise we would issue an invalid clone
5877                  * operation (source range going beyond eof) and cause the
5878                  * receiver to fail. So if we reach the current eof, bail out
5879                  * and fallback to a regular write.
5880                  */
5881                 if (clone_root->root == sctx->send_root &&
5882                     clone_root->ino == sctx->cur_ino &&
5883                     clone_root->offset >= sctx->cur_inode_next_write_offset)
5884                         break;
5885
5886                 data_offset += clone_len;
5887 next:
5888                 path->slots[0]++;
5889         }
5890
5891         if (len > 0)
5892                 ret = send_extent_data(sctx, dst_path, offset, len);
5893         else
5894                 ret = 0;
5895 out:
5896         btrfs_free_path(path);
5897         return ret;
5898 }
5899
5900 static int send_write_or_clone(struct send_ctx *sctx,
5901                                struct btrfs_path *path,
5902                                struct btrfs_key *key,
5903                                struct clone_root *clone_root)
5904 {
5905         int ret = 0;
5906         u64 offset = key->offset;
5907         u64 end;
5908         u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
5909
5910         end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
5911         if (offset >= end)
5912                 return 0;
5913
5914         if (clone_root && IS_ALIGNED(end, bs)) {
5915                 struct btrfs_file_extent_item *ei;
5916                 u64 disk_byte;
5917                 u64 data_offset;
5918
5919                 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
5920                                     struct btrfs_file_extent_item);
5921                 disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
5922                 data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
5923                 ret = clone_range(sctx, path, clone_root, disk_byte,
5924                                   data_offset, offset, end - offset);
5925         } else {
5926                 ret = send_extent_data(sctx, path, offset, end - offset);
5927         }
5928         sctx->cur_inode_next_write_offset = end;
5929         return ret;
5930 }
5931
5932 static int is_extent_unchanged(struct send_ctx *sctx,
5933                                struct btrfs_path *left_path,
5934                                struct btrfs_key *ekey)
5935 {
5936         int ret = 0;
5937         struct btrfs_key key;
5938         struct btrfs_path *path = NULL;
5939         struct extent_buffer *eb;
5940         int slot;
5941         struct btrfs_key found_key;
5942         struct btrfs_file_extent_item *ei;
5943         u64 left_disknr;
5944         u64 right_disknr;
5945         u64 left_offset;
5946         u64 right_offset;
5947         u64 left_offset_fixed;
5948         u64 left_len;
5949         u64 right_len;
5950         u64 left_gen;
5951         u64 right_gen;
5952         u8 left_type;
5953         u8 right_type;
5954
5955         path = alloc_path_for_send();
5956         if (!path)
5957                 return -ENOMEM;
5958
5959         eb = left_path->nodes[0];
5960         slot = left_path->slots[0];
5961         ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
5962         left_type = btrfs_file_extent_type(eb, ei);
5963
5964         if (left_type != BTRFS_FILE_EXTENT_REG) {
5965                 ret = 0;
5966                 goto out;
5967         }
5968         left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
5969         left_len = btrfs_file_extent_num_bytes(eb, ei);
5970         left_offset = btrfs_file_extent_offset(eb, ei);
5971         left_gen = btrfs_file_extent_generation(eb, ei);
5972
5973         /*
5974          * Following comments will refer to these graphics. L is the left
5975          * extents which we are checking at the moment. 1-8 are the right
5976          * extents that we iterate.
5977          *
5978          *       |-----L-----|
5979          * |-1-|-2a-|-3-|-4-|-5-|-6-|
5980          *
5981          *       |-----L-----|
5982          * |--1--|-2b-|...(same as above)
5983          *
5984          * Alternative situation. Happens on files where extents got split.
5985          *       |-----L-----|
5986          * |-----------7-----------|-6-|
5987          *
5988          * Alternative situation. Happens on files which got larger.
5989          *       |-----L-----|
5990          * |-8-|
5991          * Nothing follows after 8.
5992          */
5993
5994         key.objectid = ekey->objectid;
5995         key.type = BTRFS_EXTENT_DATA_KEY;
5996         key.offset = ekey->offset;
5997         ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
5998         if (ret < 0)
5999                 goto out;
6000         if (ret) {
6001                 ret = 0;
6002                 goto out;
6003         }
6004
6005         /*
6006          * Handle special case where the right side has no extents at all.
6007          */
6008         eb = path->nodes[0];
6009         slot = path->slots[0];
6010         btrfs_item_key_to_cpu(eb, &found_key, slot);
6011         if (found_key.objectid != key.objectid ||
6012             found_key.type != key.type) {
6013                 /* If we're a hole then just pretend nothing changed */
6014                 ret = (left_disknr) ? 0 : 1;
6015                 goto out;
6016         }
6017
6018         /*
6019          * We're now on 2a, 2b or 7.
6020          */
6021         key = found_key;
6022         while (key.offset < ekey->offset + left_len) {
6023                 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
6024                 right_type = btrfs_file_extent_type(eb, ei);
6025                 if (right_type != BTRFS_FILE_EXTENT_REG &&
6026                     right_type != BTRFS_FILE_EXTENT_INLINE) {
6027                         ret = 0;
6028                         goto out;
6029                 }
6030
6031                 if (right_type == BTRFS_FILE_EXTENT_INLINE) {
6032                         right_len = btrfs_file_extent_ram_bytes(eb, ei);
6033                         right_len = PAGE_ALIGN(right_len);
6034                 } else {
6035                         right_len = btrfs_file_extent_num_bytes(eb, ei);
6036                 }
6037
6038                 /*
6039                  * Are we at extent 8? If yes, we know the extent is changed.
6040                  * This may only happen on the first iteration.
6041                  */
6042                 if (found_key.offset + right_len <= ekey->offset) {
6043                         /* If we're a hole just pretend nothing changed */
6044                         ret = (left_disknr) ? 0 : 1;
6045                         goto out;
6046                 }
6047
6048                 /*
6049                  * We just wanted to see if when we have an inline extent, what
6050                  * follows it is a regular extent (wanted to check the above
6051                  * condition for inline extents too). This should normally not
6052                  * happen but it's possible for example when we have an inline
6053                  * compressed extent representing data with a size matching
6054                  * the page size (currently the same as sector size).
6055                  */
6056                 if (right_type == BTRFS_FILE_EXTENT_INLINE) {
6057                         ret = 0;
6058                         goto out;
6059                 }
6060
6061                 right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
6062                 right_offset = btrfs_file_extent_offset(eb, ei);
6063                 right_gen = btrfs_file_extent_generation(eb, ei);
6064
6065                 left_offset_fixed = left_offset;
6066                 if (key.offset < ekey->offset) {
6067                         /* Fix the right offset for 2a and 7. */
6068                         right_offset += ekey->offset - key.offset;
6069                 } else {
6070                         /* Fix the left offset for all behind 2a and 2b */
6071                         left_offset_fixed += key.offset - ekey->offset;
6072                 }
6073
6074                 /*
6075                  * Check if we have the same extent.
6076                  */
6077                 if (left_disknr != right_disknr ||
6078                     left_offset_fixed != right_offset ||
6079                     left_gen != right_gen) {
6080                         ret = 0;
6081                         goto out;
6082                 }
6083
6084                 /*
6085                  * Go to the next extent.
6086                  */
6087                 ret = btrfs_next_item(sctx->parent_root, path);
6088                 if (ret < 0)
6089                         goto out;
6090                 if (!ret) {
6091                         eb = path->nodes[0];
6092                         slot = path->slots[0];
6093                         btrfs_item_key_to_cpu(eb, &found_key, slot);
6094                 }
6095                 if (ret || found_key.objectid != key.objectid ||
6096                     found_key.type != key.type) {
6097                         key.offset += right_len;
6098                         break;
6099                 }
6100                 if (found_key.offset != key.offset + right_len) {
6101                         ret = 0;
6102                         goto out;
6103                 }
6104                 key = found_key;
6105         }
6106
6107         /*
6108          * We're now behind the left extent (treat as unchanged) or at the end
6109          * of the right side (treat as changed).
6110          */
6111         if (key.offset >= ekey->offset + left_len)
6112                 ret = 1;
6113         else
6114                 ret = 0;
6115
6116
6117 out:
6118         btrfs_free_path(path);
6119         return ret;
6120 }
6121
6122 static int get_last_extent(struct send_ctx *sctx, u64 offset)
6123 {
6124         struct btrfs_path *path;
6125         struct btrfs_root *root = sctx->send_root;
6126         struct btrfs_key key;
6127         int ret;
6128
6129         path = alloc_path_for_send();
6130         if (!path)
6131                 return -ENOMEM;
6132
6133         sctx->cur_inode_last_extent = 0;
6134
6135         key.objectid = sctx->cur_ino;
6136         key.type = BTRFS_EXTENT_DATA_KEY;
6137         key.offset = offset;
6138         ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
6139         if (ret < 0)
6140                 goto out;
6141         ret = 0;
6142         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
6143         if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
6144                 goto out;
6145
6146         sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
6147 out:
6148         btrfs_free_path(path);
6149         return ret;
6150 }
6151
6152 static int range_is_hole_in_parent(struct send_ctx *sctx,
6153                                    const u64 start,
6154                                    const u64 end)
6155 {
6156         struct btrfs_path *path;
6157         struct btrfs_key key;
6158         struct btrfs_root *root = sctx->parent_root;
6159         u64 search_start = start;
6160         int ret;
6161
6162         path = alloc_path_for_send();
6163         if (!path)
6164                 return -ENOMEM;
6165
6166         key.objectid = sctx->cur_ino;
6167         key.type = BTRFS_EXTENT_DATA_KEY;
6168         key.offset = search_start;
6169         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6170         if (ret < 0)
6171                 goto out;
6172         if (ret > 0 && path->slots[0] > 0)
6173                 path->slots[0]--;
6174
6175         while (search_start < end) {
6176                 struct extent_buffer *leaf = path->nodes[0];
6177                 int slot = path->slots[0];
6178                 struct btrfs_file_extent_item *fi;
6179                 u64 extent_end;
6180
6181                 if (slot >= btrfs_header_nritems(leaf)) {
6182                         ret = btrfs_next_leaf(root, path);
6183                         if (ret < 0)
6184                                 goto out;
6185                         else if (ret > 0)
6186                                 break;
6187                         continue;
6188                 }
6189
6190                 btrfs_item_key_to_cpu(leaf, &key, slot);
6191                 if (key.objectid < sctx->cur_ino ||
6192                     key.type < BTRFS_EXTENT_DATA_KEY)
6193                         goto next;
6194                 if (key.objectid > sctx->cur_ino ||
6195                     key.type > BTRFS_EXTENT_DATA_KEY ||
6196                     key.offset >= end)
6197                         break;
6198
6199                 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
6200                 extent_end = btrfs_file_extent_end(path);
6201                 if (extent_end <= start)
6202                         goto next;
6203                 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
6204                         search_start = extent_end;
6205                         goto next;
6206                 }
6207                 ret = 0;
6208                 goto out;
6209 next:
6210                 path->slots[0]++;
6211         }
6212         ret = 1;
6213 out:
6214         btrfs_free_path(path);
6215         return ret;
6216 }
6217
6218 static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
6219                            struct btrfs_key *key)
6220 {
6221         int ret = 0;
6222
6223         if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
6224                 return 0;
6225
6226         if (sctx->cur_inode_last_extent == (u64)-1) {
6227                 ret = get_last_extent(sctx, key->offset - 1);
6228                 if (ret)
6229                         return ret;
6230         }
6231
6232         if (path->slots[0] == 0 &&
6233             sctx->cur_inode_last_extent < key->offset) {
6234                 /*
6235                  * We might have skipped entire leafs that contained only
6236                  * file extent items for our current inode. These leafs have
6237                  * a generation number smaller (older) than the one in the
6238                  * current leaf and the leaf our last extent came from, and
6239                  * are located between these 2 leafs.
6240                  */
6241                 ret = get_last_extent(sctx, key->offset - 1);
6242                 if (ret)
6243                         return ret;
6244         }
6245
6246         if (sctx->cur_inode_last_extent < key->offset) {
6247                 ret = range_is_hole_in_parent(sctx,
6248                                               sctx->cur_inode_last_extent,
6249                                               key->offset);
6250                 if (ret < 0)
6251                         return ret;
6252                 else if (ret == 0)
6253                         ret = send_hole(sctx, key->offset);
6254                 else
6255                         ret = 0;
6256         }
6257         sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
6258         return ret;
6259 }
6260
6261 static int process_extent(struct send_ctx *sctx,
6262                           struct btrfs_path *path,
6263                           struct btrfs_key *key)
6264 {
6265         struct clone_root *found_clone = NULL;
6266         int ret = 0;
6267
6268         if (S_ISLNK(sctx->cur_inode_mode))
6269                 return 0;
6270
6271         if (sctx->parent_root && !sctx->cur_inode_new) {
6272                 ret = is_extent_unchanged(sctx, path, key);
6273                 if (ret < 0)
6274                         goto out;
6275                 if (ret) {
6276                         ret = 0;
6277                         goto out_hole;
6278                 }
6279         } else {
6280                 struct btrfs_file_extent_item *ei;
6281                 u8 type;
6282
6283                 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
6284                                     struct btrfs_file_extent_item);
6285                 type = btrfs_file_extent_type(path->nodes[0], ei);
6286                 if (type == BTRFS_FILE_EXTENT_PREALLOC ||
6287                     type == BTRFS_FILE_EXTENT_REG) {
6288                         /*
6289                          * The send spec does not have a prealloc command yet,
6290                          * so just leave a hole for prealloc'ed extents until
6291                          * we have enough commands queued up to justify rev'ing
6292                          * the send spec.
6293                          */
6294                         if (type == BTRFS_FILE_EXTENT_PREALLOC) {
6295                                 ret = 0;
6296                                 goto out;
6297                         }
6298
6299                         /* Have a hole, just skip it. */
6300                         if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
6301                                 ret = 0;
6302                                 goto out;
6303                         }
6304                 }
6305         }
6306
6307         ret = find_extent_clone(sctx, path, key->objectid, key->offset,
6308                         sctx->cur_inode_size, &found_clone);
6309         if (ret != -ENOENT && ret < 0)
6310                 goto out;
6311
6312         ret = send_write_or_clone(sctx, path, key, found_clone);
6313         if (ret)
6314                 goto out;
6315 out_hole:
6316         ret = maybe_send_hole(sctx, path, key);
6317 out:
6318         return ret;
6319 }
6320
6321 static int process_all_extents(struct send_ctx *sctx)
6322 {
6323         int ret = 0;
6324         int iter_ret = 0;
6325         struct btrfs_root *root;
6326         struct btrfs_path *path;
6327         struct btrfs_key key;
6328         struct btrfs_key found_key;
6329
6330         root = sctx->send_root;
6331         path = alloc_path_for_send();
6332         if (!path)
6333                 return -ENOMEM;
6334
6335         key.objectid = sctx->cmp_key->objectid;
6336         key.type = BTRFS_EXTENT_DATA_KEY;
6337         key.offset = 0;
6338         btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
6339                 if (found_key.objectid != key.objectid ||
6340                     found_key.type != key.type) {
6341                         ret = 0;
6342                         break;
6343                 }
6344
6345                 ret = process_extent(sctx, path, &found_key);
6346                 if (ret < 0)
6347                         break;
6348         }
6349         /* Catch error found during iteration */
6350         if (iter_ret < 0)
6351                 ret = iter_ret;
6352
6353         btrfs_free_path(path);
6354         return ret;
6355 }
6356
6357 static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
6358                                            int *pending_move,
6359                                            int *refs_processed)
6360 {
6361         int ret = 0;
6362
6363         if (sctx->cur_ino == 0)
6364                 goto out;
6365         if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
6366             sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
6367                 goto out;
6368         if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
6369                 goto out;
6370
6371         ret = process_recorded_refs(sctx, pending_move);
6372         if (ret < 0)
6373                 goto out;
6374
6375         *refs_processed = 1;
6376 out:
6377         return ret;
6378 }
6379
6380 static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
6381 {
6382         int ret = 0;
6383         struct btrfs_inode_info info;
6384         u64 left_mode;
6385         u64 left_uid;
6386         u64 left_gid;
6387         u64 left_fileattr;
6388         u64 right_mode;
6389         u64 right_uid;
6390         u64 right_gid;
6391         u64 right_fileattr;
6392         int need_chmod = 0;
6393         int need_chown = 0;
6394         bool need_fileattr = false;
6395         int need_truncate = 1;
6396         int pending_move = 0;
6397         int refs_processed = 0;
6398
6399         if (sctx->ignore_cur_inode)
6400                 return 0;
6401
6402         ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
6403                                               &refs_processed);
6404         if (ret < 0)
6405                 goto out;
6406
6407         /*
6408          * We have processed the refs and thus need to advance send_progress.
6409          * Now, calls to get_cur_xxx will take the updated refs of the current
6410          * inode into account.
6411          *
6412          * On the other hand, if our current inode is a directory and couldn't
6413          * be moved/renamed because its parent was renamed/moved too and it has
6414          * a higher inode number, we can only move/rename our current inode
6415          * after we moved/renamed its parent. Therefore in this case operate on
6416          * the old path (pre move/rename) of our current inode, and the
6417          * move/rename will be performed later.
6418          */
6419         if (refs_processed && !pending_move)
6420                 sctx->send_progress = sctx->cur_ino + 1;
6421
6422         if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
6423                 goto out;
6424         if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
6425                 goto out;
6426         ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info);
6427         if (ret < 0)
6428                 goto out;
6429         left_mode = info.mode;
6430         left_uid = info.uid;
6431         left_gid = info.gid;
6432         left_fileattr = info.fileattr;
6433
6434         if (!sctx->parent_root || sctx->cur_inode_new) {
6435                 need_chown = 1;
6436                 if (!S_ISLNK(sctx->cur_inode_mode))
6437                         need_chmod = 1;
6438                 if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
6439                         need_truncate = 0;
6440         } else {
6441                 u64 old_size;
6442
6443                 ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info);
6444                 if (ret < 0)
6445                         goto out;
6446                 old_size = info.size;
6447                 right_mode = info.mode;
6448                 right_uid = info.uid;
6449                 right_gid = info.gid;
6450                 right_fileattr = info.fileattr;
6451
6452                 if (left_uid != right_uid || left_gid != right_gid)
6453                         need_chown = 1;
6454                 if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
6455                         need_chmod = 1;
6456                 if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr)
6457                         need_fileattr = true;
6458                 if ((old_size == sctx->cur_inode_size) ||
6459                     (sctx->cur_inode_size > old_size &&
6460                      sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
6461                         need_truncate = 0;
6462         }
6463
6464         if (S_ISREG(sctx->cur_inode_mode)) {
6465                 if (need_send_hole(sctx)) {
6466                         if (sctx->cur_inode_last_extent == (u64)-1 ||
6467                             sctx->cur_inode_last_extent <
6468                             sctx->cur_inode_size) {
6469                                 ret = get_last_extent(sctx, (u64)-1);
6470                                 if (ret)
6471                                         goto out;
6472                         }
6473                         if (sctx->cur_inode_last_extent < sctx->cur_inode_size) {
6474                                 ret = range_is_hole_in_parent(sctx,
6475                                                       sctx->cur_inode_last_extent,
6476                                                       sctx->cur_inode_size);
6477                                 if (ret < 0) {
6478                                         goto out;
6479                                 } else if (ret == 0) {
6480                                         ret = send_hole(sctx, sctx->cur_inode_size);
6481                                         if (ret < 0)
6482                                                 goto out;
6483                                 } else {
6484                                         /* Range is already a hole, skip. */
6485                                         ret = 0;
6486                                 }
6487                         }
6488                 }
6489                 if (need_truncate) {
6490                         ret = send_truncate(sctx, sctx->cur_ino,
6491                                             sctx->cur_inode_gen,
6492                                             sctx->cur_inode_size);
6493                         if (ret < 0)
6494                                 goto out;
6495                 }
6496         }
6497
6498         if (need_chown) {
6499                 ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
6500                                 left_uid, left_gid);
6501                 if (ret < 0)
6502                         goto out;
6503         }
6504         if (need_chmod) {
6505                 ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
6506                                 left_mode);
6507                 if (ret < 0)
6508                         goto out;
6509         }
6510         if (need_fileattr) {
6511                 ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen,
6512                                     left_fileattr);
6513                 if (ret < 0)
6514                         goto out;
6515         }
6516
6517         if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY)
6518             && sctx->cur_inode_needs_verity) {
6519                 ret = process_verity(sctx);
6520                 if (ret < 0)
6521                         goto out;
6522         }
6523
6524         ret = send_capabilities(sctx);
6525         if (ret < 0)
6526                 goto out;
6527
6528         /*
6529          * If other directory inodes depended on our current directory
6530          * inode's move/rename, now do their move/rename operations.
6531          */
6532         if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
6533                 ret = apply_children_dir_moves(sctx);
6534                 if (ret)
6535                         goto out;
6536                 /*
6537                  * Need to send that every time, no matter if it actually
6538                  * changed between the two trees as we have done changes to
6539                  * the inode before. If our inode is a directory and it's
6540                  * waiting to be moved/renamed, we will send its utimes when
6541                  * it's moved/renamed, therefore we don't need to do it here.
6542                  */
6543                 sctx->send_progress = sctx->cur_ino + 1;
6544                 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
6545                 if (ret < 0)
6546                         goto out;
6547         }
6548
6549 out:
6550         return ret;
6551 }
6552
6553 static void close_current_inode(struct send_ctx *sctx)
6554 {
6555         u64 i_size;
6556
6557         if (sctx->cur_inode == NULL)
6558                 return;
6559
6560         i_size = i_size_read(sctx->cur_inode);
6561
6562         /*
6563          * If we are doing an incremental send, we may have extents between the
6564          * last processed extent and the i_size that have not been processed
6565          * because they haven't changed but we may have read some of their pages
6566          * through readahead, see the comments at send_extent_data().
6567          */
6568         if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
6569                 truncate_inode_pages_range(&sctx->cur_inode->i_data,
6570                                            sctx->page_cache_clear_start,
6571                                            round_up(i_size, PAGE_SIZE) - 1);
6572
6573         iput(sctx->cur_inode);
6574         sctx->cur_inode = NULL;
6575 }
6576
6577 static int changed_inode(struct send_ctx *sctx,
6578                          enum btrfs_compare_tree_result result)
6579 {
6580         int ret = 0;
6581         struct btrfs_key *key = sctx->cmp_key;
6582         struct btrfs_inode_item *left_ii = NULL;
6583         struct btrfs_inode_item *right_ii = NULL;
6584         u64 left_gen = 0;
6585         u64 right_gen = 0;
6586
6587         close_current_inode(sctx);
6588
6589         sctx->cur_ino = key->objectid;
6590         sctx->cur_inode_new_gen = false;
6591         sctx->cur_inode_last_extent = (u64)-1;
6592         sctx->cur_inode_next_write_offset = 0;
6593         sctx->ignore_cur_inode = false;
6594
6595         /*
6596          * Set send_progress to current inode. This will tell all get_cur_xxx
6597          * functions that the current inode's refs are not updated yet. Later,
6598          * when process_recorded_refs is finished, it is set to cur_ino + 1.
6599          */
6600         sctx->send_progress = sctx->cur_ino;
6601
6602         if (result == BTRFS_COMPARE_TREE_NEW ||
6603             result == BTRFS_COMPARE_TREE_CHANGED) {
6604                 left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
6605                                 sctx->left_path->slots[0],
6606                                 struct btrfs_inode_item);
6607                 left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
6608                                 left_ii);
6609         } else {
6610                 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
6611                                 sctx->right_path->slots[0],
6612                                 struct btrfs_inode_item);
6613                 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
6614                                 right_ii);
6615         }
6616         if (result == BTRFS_COMPARE_TREE_CHANGED) {
6617                 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
6618                                 sctx->right_path->slots[0],
6619                                 struct btrfs_inode_item);
6620
6621                 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
6622                                 right_ii);
6623
6624                 /*
6625                  * The cur_ino = root dir case is special here. We can't treat
6626                  * the inode as deleted+reused because it would generate a
6627                  * stream that tries to delete/mkdir the root dir.
6628                  */
6629                 if (left_gen != right_gen &&
6630                     sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
6631                         sctx->cur_inode_new_gen = true;
6632         }
6633
6634         /*
6635          * Normally we do not find inodes with a link count of zero (orphans)
6636          * because the most common case is to create a snapshot and use it
6637          * for a send operation. However other less common use cases involve
6638          * using a subvolume and send it after turning it to RO mode just
6639          * after deleting all hard links of a file while holding an open
6640          * file descriptor against it or turning a RO snapshot into RW mode,
6641          * keep an open file descriptor against a file, delete it and then
6642          * turn the snapshot back to RO mode before using it for a send
6643          * operation. The former is what the receiver operation does.
6644          * Therefore, if we want to send these snapshots soon after they're
6645          * received, we need to handle orphan inodes as well. Moreover, orphans
6646          * can appear not only in the send snapshot but also in the parent
6647          * snapshot. Here are several cases:
6648          *
6649          * Case 1: BTRFS_COMPARE_TREE_NEW
6650          *       |  send snapshot  | action
6651          * --------------------------------
6652          * nlink |        0        | ignore
6653          *
6654          * Case 2: BTRFS_COMPARE_TREE_DELETED
6655          *       | parent snapshot | action
6656          * ----------------------------------
6657          * nlink |        0        | as usual
6658          * Note: No unlinks will be sent because there're no paths for it.
6659          *
6660          * Case 3: BTRFS_COMPARE_TREE_CHANGED
6661          *           |       | parent snapshot | send snapshot | action
6662          * -----------------------------------------------------------------------
6663          * subcase 1 | nlink |        0        |       0       | ignore
6664          * subcase 2 | nlink |       >0        |       0       | new_gen(deletion)
6665          * subcase 3 | nlink |        0        |      >0       | new_gen(creation)
6666          *
6667          */
6668         if (result == BTRFS_COMPARE_TREE_NEW) {
6669                 if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) {
6670                         sctx->ignore_cur_inode = true;
6671                         goto out;
6672                 }
6673                 sctx->cur_inode_gen = left_gen;
6674                 sctx->cur_inode_new = true;
6675                 sctx->cur_inode_deleted = false;
6676                 sctx->cur_inode_size = btrfs_inode_size(
6677                                 sctx->left_path->nodes[0], left_ii);
6678                 sctx->cur_inode_mode = btrfs_inode_mode(
6679                                 sctx->left_path->nodes[0], left_ii);
6680                 sctx->cur_inode_rdev = btrfs_inode_rdev(
6681                                 sctx->left_path->nodes[0], left_ii);
6682                 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
6683                         ret = send_create_inode_if_needed(sctx);
6684         } else if (result == BTRFS_COMPARE_TREE_DELETED) {
6685                 sctx->cur_inode_gen = right_gen;
6686                 sctx->cur_inode_new = false;
6687                 sctx->cur_inode_deleted = true;
6688                 sctx->cur_inode_size = btrfs_inode_size(
6689                                 sctx->right_path->nodes[0], right_ii);
6690                 sctx->cur_inode_mode = btrfs_inode_mode(
6691                                 sctx->right_path->nodes[0], right_ii);
6692         } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
6693                 u32 new_nlinks, old_nlinks;
6694
6695                 new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
6696                 old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii);
6697                 if (new_nlinks == 0 && old_nlinks == 0) {
6698                         sctx->ignore_cur_inode = true;
6699                         goto out;
6700                 } else if (new_nlinks == 0 || old_nlinks == 0) {
6701                         sctx->cur_inode_new_gen = 1;
6702                 }
6703                 /*
6704                  * We need to do some special handling in case the inode was
6705                  * reported as changed with a changed generation number. This
6706                  * means that the original inode was deleted and new inode
6707                  * reused the same inum. So we have to treat the old inode as
6708                  * deleted and the new one as new.
6709                  */
6710                 if (sctx->cur_inode_new_gen) {
6711                         /*
6712                          * First, process the inode as if it was deleted.
6713                          */
6714                         if (old_nlinks > 0) {
6715                                 sctx->cur_inode_gen = right_gen;
6716                                 sctx->cur_inode_new = false;
6717                                 sctx->cur_inode_deleted = true;
6718                                 sctx->cur_inode_size = btrfs_inode_size(
6719                                                 sctx->right_path->nodes[0], right_ii);
6720                                 sctx->cur_inode_mode = btrfs_inode_mode(
6721                                                 sctx->right_path->nodes[0], right_ii);
6722                                 ret = process_all_refs(sctx,
6723                                                 BTRFS_COMPARE_TREE_DELETED);
6724                                 if (ret < 0)
6725                                         goto out;
6726                         }
6727
6728                         /*
6729                          * Now process the inode as if it was new.
6730                          */
6731                         if (new_nlinks > 0) {
6732                                 sctx->cur_inode_gen = left_gen;
6733                                 sctx->cur_inode_new = true;
6734                                 sctx->cur_inode_deleted = false;
6735                                 sctx->cur_inode_size = btrfs_inode_size(
6736                                                 sctx->left_path->nodes[0],
6737                                                 left_ii);
6738                                 sctx->cur_inode_mode = btrfs_inode_mode(
6739                                                 sctx->left_path->nodes[0],
6740                                                 left_ii);
6741                                 sctx->cur_inode_rdev = btrfs_inode_rdev(
6742                                                 sctx->left_path->nodes[0],
6743                                                 left_ii);
6744                                 ret = send_create_inode_if_needed(sctx);
6745                                 if (ret < 0)
6746                                         goto out;
6747
6748                                 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
6749                                 if (ret < 0)
6750                                         goto out;
6751                                 /*
6752                                  * Advance send_progress now as we did not get
6753                                  * into process_recorded_refs_if_needed in the
6754                                  * new_gen case.
6755                                  */
6756                                 sctx->send_progress = sctx->cur_ino + 1;
6757
6758                                 /*
6759                                  * Now process all extents and xattrs of the
6760                                  * inode as if they were all new.
6761                                  */
6762                                 ret = process_all_extents(sctx);
6763                                 if (ret < 0)
6764                                         goto out;
6765                                 ret = process_all_new_xattrs(sctx);
6766                                 if (ret < 0)
6767                                         goto out;
6768                         }
6769                 } else {
6770                         sctx->cur_inode_gen = left_gen;
6771                         sctx->cur_inode_new = false;
6772                         sctx->cur_inode_new_gen = false;
6773                         sctx->cur_inode_deleted = false;
6774                         sctx->cur_inode_size = btrfs_inode_size(
6775                                         sctx->left_path->nodes[0], left_ii);
6776                         sctx->cur_inode_mode = btrfs_inode_mode(
6777                                         sctx->left_path->nodes[0], left_ii);
6778                 }
6779         }
6780
6781 out:
6782         return ret;
6783 }
6784
6785 /*
6786  * We have to process new refs before deleted refs, but compare_trees gives us
6787  * the new and deleted refs mixed. To fix this, we record the new/deleted refs
6788  * first and later process them in process_recorded_refs.
6789  * For the cur_inode_new_gen case, we skip recording completely because
6790  * changed_inode did already initiate processing of refs. The reason for this is
6791  * that in this case, compare_tree actually compares the refs of 2 different
6792  * inodes. To fix this, process_all_refs is used in changed_inode to handle all
6793  * refs of the right tree as deleted and all refs of the left tree as new.
6794  */
6795 static int changed_ref(struct send_ctx *sctx,
6796                        enum btrfs_compare_tree_result result)
6797 {
6798         int ret = 0;
6799
6800         if (sctx->cur_ino != sctx->cmp_key->objectid) {
6801                 inconsistent_snapshot_error(sctx, result, "reference");
6802                 return -EIO;
6803         }
6804
6805         if (!sctx->cur_inode_new_gen &&
6806             sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
6807                 if (result == BTRFS_COMPARE_TREE_NEW)
6808                         ret = record_new_ref(sctx);
6809                 else if (result == BTRFS_COMPARE_TREE_DELETED)
6810                         ret = record_deleted_ref(sctx);
6811                 else if (result == BTRFS_COMPARE_TREE_CHANGED)
6812                         ret = record_changed_ref(sctx);
6813         }
6814
6815         return ret;
6816 }
6817
6818 /*
6819  * Process new/deleted/changed xattrs. We skip processing in the
6820  * cur_inode_new_gen case because changed_inode did already initiate processing
6821  * of xattrs. The reason is the same as in changed_ref
6822  */
6823 static int changed_xattr(struct send_ctx *sctx,
6824                          enum btrfs_compare_tree_result result)
6825 {
6826         int ret = 0;
6827
6828         if (sctx->cur_ino != sctx->cmp_key->objectid) {
6829                 inconsistent_snapshot_error(sctx, result, "xattr");
6830                 return -EIO;
6831         }
6832
6833         if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
6834                 if (result == BTRFS_COMPARE_TREE_NEW)
6835                         ret = process_new_xattr(sctx);
6836                 else if (result == BTRFS_COMPARE_TREE_DELETED)
6837                         ret = process_deleted_xattr(sctx);
6838                 else if (result == BTRFS_COMPARE_TREE_CHANGED)
6839                         ret = process_changed_xattr(sctx);
6840         }
6841
6842         return ret;
6843 }
6844
6845 /*
6846  * Process new/deleted/changed extents. We skip processing in the
6847  * cur_inode_new_gen case because changed_inode did already initiate processing
6848  * of extents. The reason is the same as in changed_ref
6849  */
6850 static int changed_extent(struct send_ctx *sctx,
6851                           enum btrfs_compare_tree_result result)
6852 {
6853         int ret = 0;
6854
6855         /*
6856          * We have found an extent item that changed without the inode item
6857          * having changed. This can happen either after relocation (where the
6858          * disk_bytenr of an extent item is replaced at
6859          * relocation.c:replace_file_extents()) or after deduplication into a
6860          * file in both the parent and send snapshots (where an extent item can
6861          * get modified or replaced with a new one). Note that deduplication
6862          * updates the inode item, but it only changes the iversion (sequence
6863          * field in the inode item) of the inode, so if a file is deduplicated
6864          * the same amount of times in both the parent and send snapshots, its
6865          * iversion becomes the same in both snapshots, whence the inode item is
6866          * the same on both snapshots.
6867          */
6868         if (sctx->cur_ino != sctx->cmp_key->objectid)
6869                 return 0;
6870
6871         if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
6872                 if (result != BTRFS_COMPARE_TREE_DELETED)
6873                         ret = process_extent(sctx, sctx->left_path,
6874                                         sctx->cmp_key);
6875         }
6876
6877         return ret;
6878 }
6879
6880 static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result)
6881 {
6882         int ret = 0;
6883
6884         if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
6885                 if (result == BTRFS_COMPARE_TREE_NEW)
6886                         sctx->cur_inode_needs_verity = true;
6887         }
6888         return ret;
6889 }
6890
6891 static int dir_changed(struct send_ctx *sctx, u64 dir)
6892 {
6893         u64 orig_gen, new_gen;
6894         int ret;
6895
6896         ret = get_inode_gen(sctx->send_root, dir, &new_gen);
6897         if (ret)
6898                 return ret;
6899
6900         ret = get_inode_gen(sctx->parent_root, dir, &orig_gen);
6901         if (ret)
6902                 return ret;
6903
6904         return (orig_gen != new_gen) ? 1 : 0;
6905 }
6906
6907 static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
6908                         struct btrfs_key *key)
6909 {
6910         struct btrfs_inode_extref *extref;
6911         struct extent_buffer *leaf;
6912         u64 dirid = 0, last_dirid = 0;
6913         unsigned long ptr;
6914         u32 item_size;
6915         u32 cur_offset = 0;
6916         int ref_name_len;
6917         int ret = 0;
6918
6919         /* Easy case, just check this one dirid */
6920         if (key->type == BTRFS_INODE_REF_KEY) {
6921                 dirid = key->offset;
6922
6923                 ret = dir_changed(sctx, dirid);
6924                 goto out;
6925         }
6926
6927         leaf = path->nodes[0];
6928         item_size = btrfs_item_size(leaf, path->slots[0]);
6929         ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
6930         while (cur_offset < item_size) {
6931                 extref = (struct btrfs_inode_extref *)(ptr +
6932                                                        cur_offset);
6933                 dirid = btrfs_inode_extref_parent(leaf, extref);
6934                 ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
6935                 cur_offset += ref_name_len + sizeof(*extref);
6936                 if (dirid == last_dirid)
6937                         continue;
6938                 ret = dir_changed(sctx, dirid);
6939                 if (ret)
6940                         break;
6941                 last_dirid = dirid;
6942         }
6943 out:
6944         return ret;
6945 }
6946
6947 /*
6948  * Updates compare related fields in sctx and simply forwards to the actual
6949  * changed_xxx functions.
6950  */
6951 static int changed_cb(struct btrfs_path *left_path,
6952                       struct btrfs_path *right_path,
6953                       struct btrfs_key *key,
6954                       enum btrfs_compare_tree_result result,
6955                       struct send_ctx *sctx)
6956 {
6957         int ret = 0;
6958
6959         /*
6960          * We can not hold the commit root semaphore here. This is because in
6961          * the case of sending and receiving to the same filesystem, using a
6962          * pipe, could result in a deadlock:
6963          *
6964          * 1) The task running send blocks on the pipe because it's full;
6965          *
6966          * 2) The task running receive, which is the only consumer of the pipe,
6967          *    is waiting for a transaction commit (for example due to a space
6968          *    reservation when doing a write or triggering a transaction commit
6969          *    when creating a subvolume);
6970          *
6971          * 3) The transaction is waiting to write lock the commit root semaphore,
6972          *    but can not acquire it since it's being held at 1).
6973          *
6974          * Down this call chain we write to the pipe through kernel_write().
6975          * The same type of problem can also happen when sending to a file that
6976          * is stored in the same filesystem - when reserving space for a write
6977          * into the file, we can trigger a transaction commit.
6978          *
6979          * Our caller has supplied us with clones of leaves from the send and
6980          * parent roots, so we're safe here from a concurrent relocation and
6981          * further reallocation of metadata extents while we are here. Below we
6982          * also assert that the leaves are clones.
6983          */
6984         lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
6985
6986         /*
6987          * We always have a send root, so left_path is never NULL. We will not
6988          * have a leaf when we have reached the end of the send root but have
6989          * not yet reached the end of the parent root.
6990          */
6991         if (left_path->nodes[0])
6992                 ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
6993                                 &left_path->nodes[0]->bflags));
6994         /*
6995          * When doing a full send we don't have a parent root, so right_path is
6996          * NULL. When doing an incremental send, we may have reached the end of
6997          * the parent root already, so we don't have a leaf at right_path.
6998          */
6999         if (right_path && right_path->nodes[0])
7000                 ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
7001                                 &right_path->nodes[0]->bflags));
7002
7003         if (result == BTRFS_COMPARE_TREE_SAME) {
7004                 if (key->type == BTRFS_INODE_REF_KEY ||
7005                     key->type == BTRFS_INODE_EXTREF_KEY) {
7006                         ret = compare_refs(sctx, left_path, key);
7007                         if (!ret)
7008                                 return 0;
7009                         if (ret < 0)
7010                                 return ret;
7011                 } else if (key->type == BTRFS_EXTENT_DATA_KEY) {
7012                         return maybe_send_hole(sctx, left_path, key);
7013                 } else {
7014                         return 0;
7015                 }
7016                 result = BTRFS_COMPARE_TREE_CHANGED;
7017                 ret = 0;
7018         }
7019
7020         sctx->left_path = left_path;
7021         sctx->right_path = right_path;
7022         sctx->cmp_key = key;
7023
7024         ret = finish_inode_if_needed(sctx, 0);
7025         if (ret < 0)
7026                 goto out;
7027
7028         /* Ignore non-FS objects */
7029         if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
7030             key->objectid == BTRFS_FREE_SPACE_OBJECTID)
7031                 goto out;
7032
7033         if (key->type == BTRFS_INODE_ITEM_KEY) {
7034                 ret = changed_inode(sctx, result);
7035         } else if (!sctx->ignore_cur_inode) {
7036                 if (key->type == BTRFS_INODE_REF_KEY ||
7037                     key->type == BTRFS_INODE_EXTREF_KEY)
7038                         ret = changed_ref(sctx, result);
7039                 else if (key->type == BTRFS_XATTR_ITEM_KEY)
7040                         ret = changed_xattr(sctx, result);
7041                 else if (key->type == BTRFS_EXTENT_DATA_KEY)
7042                         ret = changed_extent(sctx, result);
7043                 else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY &&
7044                          key->offset == 0)
7045                         ret = changed_verity(sctx, result);
7046         }
7047
7048 out:
7049         return ret;
7050 }
7051
7052 static int search_key_again(const struct send_ctx *sctx,
7053                             struct btrfs_root *root,
7054                             struct btrfs_path *path,
7055                             const struct btrfs_key *key)
7056 {
7057         int ret;
7058
7059         if (!path->need_commit_sem)
7060                 lockdep_assert_held_read(&root->fs_info->commit_root_sem);
7061
7062         /*
7063          * Roots used for send operations are readonly and no one can add,
7064          * update or remove keys from them, so we should be able to find our
7065          * key again. The only exception is deduplication, which can operate on
7066          * readonly roots and add, update or remove keys to/from them - but at
7067          * the moment we don't allow it to run in parallel with send.
7068          */
7069         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7070         ASSERT(ret <= 0);
7071         if (ret > 0) {
7072                 btrfs_print_tree(path->nodes[path->lowest_level], false);
7073                 btrfs_err(root->fs_info,
7074 "send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
7075                           key->objectid, key->type, key->offset,
7076                           (root == sctx->parent_root ? "parent" : "send"),
7077                           root->root_key.objectid, path->lowest_level,
7078                           path->slots[path->lowest_level]);
7079                 return -EUCLEAN;
7080         }
7081
7082         return ret;
7083 }
7084
7085 static int full_send_tree(struct send_ctx *sctx)
7086 {
7087         int ret;
7088         struct btrfs_root *send_root = sctx->send_root;
7089         struct btrfs_key key;
7090         struct btrfs_fs_info *fs_info = send_root->fs_info;
7091         struct btrfs_path *path;
7092
7093         path = alloc_path_for_send();
7094         if (!path)
7095                 return -ENOMEM;
7096         path->reada = READA_FORWARD_ALWAYS;
7097
7098         key.objectid = BTRFS_FIRST_FREE_OBJECTID;
7099         key.type = BTRFS_INODE_ITEM_KEY;
7100         key.offset = 0;
7101
7102         down_read(&fs_info->commit_root_sem);
7103         sctx->last_reloc_trans = fs_info->last_reloc_trans;
7104         up_read(&fs_info->commit_root_sem);
7105
7106         ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
7107         if (ret < 0)
7108                 goto out;
7109         if (ret)
7110                 goto out_finish;
7111
7112         while (1) {
7113                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
7114
7115                 ret = changed_cb(path, NULL, &key,
7116                                  BTRFS_COMPARE_TREE_NEW, sctx);
7117                 if (ret < 0)
7118                         goto out;
7119
7120                 down_read(&fs_info->commit_root_sem);
7121                 if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
7122                         sctx->last_reloc_trans = fs_info->last_reloc_trans;
7123                         up_read(&fs_info->commit_root_sem);
7124                         /*
7125                          * A transaction used for relocating a block group was
7126                          * committed or is about to finish its commit. Release
7127                          * our path (leaf) and restart the search, so that we
7128                          * avoid operating on any file extent items that are
7129                          * stale, with a disk_bytenr that reflects a pre
7130                          * relocation value. This way we avoid as much as
7131                          * possible to fallback to regular writes when checking
7132                          * if we can clone file ranges.
7133                          */
7134                         btrfs_release_path(path);
7135                         ret = search_key_again(sctx, send_root, path, &key);
7136                         if (ret < 0)
7137                                 goto out;
7138                 } else {
7139                         up_read(&fs_info->commit_root_sem);
7140                 }
7141
7142                 ret = btrfs_next_item(send_root, path);
7143                 if (ret < 0)
7144                         goto out;
7145                 if (ret) {
7146                         ret  = 0;
7147                         break;
7148                 }
7149         }
7150
7151 out_finish:
7152         ret = finish_inode_if_needed(sctx, 1);
7153
7154 out:
7155         btrfs_free_path(path);
7156         return ret;
7157 }
7158
7159 static int replace_node_with_clone(struct btrfs_path *path, int level)
7160 {
7161         struct extent_buffer *clone;
7162
7163         clone = btrfs_clone_extent_buffer(path->nodes[level]);
7164         if (!clone)
7165                 return -ENOMEM;
7166
7167         free_extent_buffer(path->nodes[level]);
7168         path->nodes[level] = clone;
7169
7170         return 0;
7171 }
7172
7173 static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
7174 {
7175         struct extent_buffer *eb;
7176         struct extent_buffer *parent = path->nodes[*level];
7177         int slot = path->slots[*level];
7178         const int nritems = btrfs_header_nritems(parent);
7179         u64 reada_max;
7180         u64 reada_done = 0;
7181
7182         lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
7183
7184         BUG_ON(*level == 0);
7185         eb = btrfs_read_node_slot(parent, slot);
7186         if (IS_ERR(eb))
7187                 return PTR_ERR(eb);
7188
7189         /*
7190          * Trigger readahead for the next leaves we will process, so that it is
7191          * very likely that when we need them they are already in memory and we
7192          * will not block on disk IO. For nodes we only do readahead for one,
7193          * since the time window between processing nodes is typically larger.
7194          */
7195         reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize);
7196
7197         for (slot++; slot < nritems && reada_done < reada_max; slot++) {
7198                 if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
7199                         btrfs_readahead_node_child(parent, slot);
7200                         reada_done += eb->fs_info->nodesize;
7201                 }
7202         }
7203
7204         path->nodes[*level - 1] = eb;
7205         path->slots[*level - 1] = 0;
7206         (*level)--;
7207
7208         if (*level == 0)
7209                 return replace_node_with_clone(path, 0);
7210
7211         return 0;
7212 }
7213
7214 static int tree_move_next_or_upnext(struct btrfs_path *path,
7215                                     int *level, int root_level)
7216 {
7217         int ret = 0;
7218         int nritems;
7219         nritems = btrfs_header_nritems(path->nodes[*level]);
7220
7221         path->slots[*level]++;
7222
7223         while (path->slots[*level] >= nritems) {
7224                 if (*level == root_level) {
7225                         path->slots[*level] = nritems - 1;
7226                         return -1;
7227                 }
7228
7229                 /* move upnext */
7230                 path->slots[*level] = 0;
7231                 free_extent_buffer(path->nodes[*level]);
7232                 path->nodes[*level] = NULL;
7233                 (*level)++;
7234                 path->slots[*level]++;
7235
7236                 nritems = btrfs_header_nritems(path->nodes[*level]);
7237                 ret = 1;
7238         }
7239         return ret;
7240 }
7241
7242 /*
7243  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
7244  * or down.
7245  */
7246 static int tree_advance(struct btrfs_path *path,
7247                         int *level, int root_level,
7248                         int allow_down,
7249                         struct btrfs_key *key,
7250                         u64 reada_min_gen)
7251 {
7252         int ret;
7253
7254         if (*level == 0 || !allow_down) {
7255                 ret = tree_move_next_or_upnext(path, level, root_level);
7256         } else {
7257                 ret = tree_move_down(path, level, reada_min_gen);
7258         }
7259
7260         /*
7261          * Even if we have reached the end of a tree, ret is -1, update the key
7262          * anyway, so that in case we need to restart due to a block group
7263          * relocation, we can assert that the last key of the root node still
7264          * exists in the tree.
7265          */
7266         if (*level == 0)
7267                 btrfs_item_key_to_cpu(path->nodes[*level], key,
7268                                       path->slots[*level]);
7269         else
7270                 btrfs_node_key_to_cpu(path->nodes[*level], key,
7271                                       path->slots[*level]);
7272
7273         return ret;
7274 }
7275
7276 static int tree_compare_item(struct btrfs_path *left_path,
7277                              struct btrfs_path *right_path,
7278                              char *tmp_buf)
7279 {
7280         int cmp;
7281         int len1, len2;
7282         unsigned long off1, off2;
7283
7284         len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]);
7285         len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]);
7286         if (len1 != len2)
7287                 return 1;
7288
7289         off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
7290         off2 = btrfs_item_ptr_offset(right_path->nodes[0],
7291                                 right_path->slots[0]);
7292
7293         read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
7294
7295         cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
7296         if (cmp)
7297                 return 1;
7298         return 0;
7299 }
7300
7301 /*
7302  * A transaction used for relocating a block group was committed or is about to
7303  * finish its commit. Release our paths and restart the search, so that we are
7304  * not using stale extent buffers:
7305  *
7306  * 1) For levels > 0, we are only holding references of extent buffers, without
7307  *    any locks on them, which does not prevent them from having been relocated
7308  *    and reallocated after the last time we released the commit root semaphore.
7309  *    The exception are the root nodes, for which we always have a clone, see
7310  *    the comment at btrfs_compare_trees();
7311  *
7312  * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
7313  *    we are safe from the concurrent relocation and reallocation. However they
7314  *    can have file extent items with a pre relocation disk_bytenr value, so we
7315  *    restart the start from the current commit roots and clone the new leaves so
7316  *    that we get the post relocation disk_bytenr values. Not doing so, could
7317  *    make us clone the wrong data in case there are new extents using the old
7318  *    disk_bytenr that happen to be shared.
7319  */
7320 static int restart_after_relocation(struct btrfs_path *left_path,
7321                                     struct btrfs_path *right_path,
7322                                     const struct btrfs_key *left_key,
7323                                     const struct btrfs_key *right_key,
7324                                     int left_level,
7325                                     int right_level,
7326                                     const struct send_ctx *sctx)
7327 {
7328         int root_level;
7329         int ret;
7330
7331         lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
7332
7333         btrfs_release_path(left_path);
7334         btrfs_release_path(right_path);
7335
7336         /*
7337          * Since keys can not be added or removed to/from our roots because they
7338          * are readonly and we do not allow deduplication to run in parallel
7339          * (which can add, remove or change keys), the layout of the trees should
7340          * not change.
7341          */
7342         left_path->lowest_level = left_level;
7343         ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
7344         if (ret < 0)
7345                 return ret;
7346
7347         right_path->lowest_level = right_level;
7348         ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
7349         if (ret < 0)
7350                 return ret;
7351
7352         /*
7353          * If the lowest level nodes are leaves, clone them so that they can be
7354          * safely used by changed_cb() while not under the protection of the
7355          * commit root semaphore, even if relocation and reallocation happens in
7356          * parallel.
7357          */
7358         if (left_level == 0) {
7359                 ret = replace_node_with_clone(left_path, 0);
7360                 if (ret < 0)
7361                         return ret;
7362         }
7363
7364         if (right_level == 0) {
7365                 ret = replace_node_with_clone(right_path, 0);
7366                 if (ret < 0)
7367                         return ret;
7368         }
7369
7370         /*
7371          * Now clone the root nodes (unless they happen to be the leaves we have
7372          * already cloned). This is to protect against concurrent snapshotting of
7373          * the send and parent roots (see the comment at btrfs_compare_trees()).
7374          */
7375         root_level = btrfs_header_level(sctx->send_root->commit_root);
7376         if (root_level > 0) {
7377                 ret = replace_node_with_clone(left_path, root_level);
7378                 if (ret < 0)
7379                         return ret;
7380         }
7381
7382         root_level = btrfs_header_level(sctx->parent_root->commit_root);
7383         if (root_level > 0) {
7384                 ret = replace_node_with_clone(right_path, root_level);
7385                 if (ret < 0)
7386                         return ret;
7387         }
7388
7389         return 0;
7390 }
7391
7392 /*
7393  * This function compares two trees and calls the provided callback for
7394  * every changed/new/deleted item it finds.
7395  * If shared tree blocks are encountered, whole subtrees are skipped, making
7396  * the compare pretty fast on snapshotted subvolumes.
7397  *
7398  * This currently works on commit roots only. As commit roots are read only,
7399  * we don't do any locking. The commit roots are protected with transactions.
7400  * Transactions are ended and rejoined when a commit is tried in between.
7401  *
7402  * This function checks for modifications done to the trees while comparing.
7403  * If it detects a change, it aborts immediately.
7404  */
7405 static int btrfs_compare_trees(struct btrfs_root *left_root,
7406                         struct btrfs_root *right_root, struct send_ctx *sctx)
7407 {
7408         struct btrfs_fs_info *fs_info = left_root->fs_info;
7409         int ret;
7410         int cmp;
7411         struct btrfs_path *left_path = NULL;
7412         struct btrfs_path *right_path = NULL;
7413         struct btrfs_key left_key;
7414         struct btrfs_key right_key;
7415         char *tmp_buf = NULL;
7416         int left_root_level;
7417         int right_root_level;
7418         int left_level;
7419         int right_level;
7420         int left_end_reached = 0;
7421         int right_end_reached = 0;
7422         int advance_left = 0;
7423         int advance_right = 0;
7424         u64 left_blockptr;
7425         u64 right_blockptr;
7426         u64 left_gen;
7427         u64 right_gen;
7428         u64 reada_min_gen;
7429
7430         left_path = btrfs_alloc_path();
7431         if (!left_path) {
7432                 ret = -ENOMEM;
7433                 goto out;
7434         }
7435         right_path = btrfs_alloc_path();
7436         if (!right_path) {
7437                 ret = -ENOMEM;
7438                 goto out;
7439         }
7440
7441         tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
7442         if (!tmp_buf) {
7443                 ret = -ENOMEM;
7444                 goto out;
7445         }
7446
7447         left_path->search_commit_root = 1;
7448         left_path->skip_locking = 1;
7449         right_path->search_commit_root = 1;
7450         right_path->skip_locking = 1;
7451
7452         /*
7453          * Strategy: Go to the first items of both trees. Then do
7454          *
7455          * If both trees are at level 0
7456          *   Compare keys of current items
7457          *     If left < right treat left item as new, advance left tree
7458          *       and repeat
7459          *     If left > right treat right item as deleted, advance right tree
7460          *       and repeat
7461          *     If left == right do deep compare of items, treat as changed if
7462          *       needed, advance both trees and repeat
7463          * If both trees are at the same level but not at level 0
7464          *   Compare keys of current nodes/leafs
7465          *     If left < right advance left tree and repeat
7466          *     If left > right advance right tree and repeat
7467          *     If left == right compare blockptrs of the next nodes/leafs
7468          *       If they match advance both trees but stay at the same level
7469          *         and repeat
7470          *       If they don't match advance both trees while allowing to go
7471          *         deeper and repeat
7472          * If tree levels are different
7473          *   Advance the tree that needs it and repeat
7474          *
7475          * Advancing a tree means:
7476          *   If we are at level 0, try to go to the next slot. If that's not
7477          *   possible, go one level up and repeat. Stop when we found a level
7478          *   where we could go to the next slot. We may at this point be on a
7479          *   node or a leaf.
7480          *
7481          *   If we are not at level 0 and not on shared tree blocks, go one
7482          *   level deeper.
7483          *
7484          *   If we are not at level 0 and on shared tree blocks, go one slot to
7485          *   the right if possible or go up and right.
7486          */
7487
7488         down_read(&fs_info->commit_root_sem);
7489         left_level = btrfs_header_level(left_root->commit_root);
7490         left_root_level = left_level;
7491         /*
7492          * We clone the root node of the send and parent roots to prevent races
7493          * with snapshot creation of these roots. Snapshot creation COWs the
7494          * root node of a tree, so after the transaction is committed the old
7495          * extent can be reallocated while this send operation is still ongoing.
7496          * So we clone them, under the commit root semaphore, to be race free.
7497          */
7498         left_path->nodes[left_level] =
7499                         btrfs_clone_extent_buffer(left_root->commit_root);
7500         if (!left_path->nodes[left_level]) {
7501                 ret = -ENOMEM;
7502                 goto out_unlock;
7503         }
7504
7505         right_level = btrfs_header_level(right_root->commit_root);
7506         right_root_level = right_level;
7507         right_path->nodes[right_level] =
7508                         btrfs_clone_extent_buffer(right_root->commit_root);
7509         if (!right_path->nodes[right_level]) {
7510                 ret = -ENOMEM;
7511                 goto out_unlock;
7512         }
7513         /*
7514          * Our right root is the parent root, while the left root is the "send"
7515          * root. We know that all new nodes/leaves in the left root must have
7516          * a generation greater than the right root's generation, so we trigger
7517          * readahead for those nodes and leaves of the left root, as we know we
7518          * will need to read them at some point.
7519          */
7520         reada_min_gen = btrfs_header_generation(right_root->commit_root);
7521
7522         if (left_level == 0)
7523                 btrfs_item_key_to_cpu(left_path->nodes[left_level],
7524                                 &left_key, left_path->slots[left_level]);
7525         else
7526                 btrfs_node_key_to_cpu(left_path->nodes[left_level],
7527                                 &left_key, left_path->slots[left_level]);
7528         if (right_level == 0)
7529                 btrfs_item_key_to_cpu(right_path->nodes[right_level],
7530                                 &right_key, right_path->slots[right_level]);
7531         else
7532                 btrfs_node_key_to_cpu(right_path->nodes[right_level],
7533                                 &right_key, right_path->slots[right_level]);
7534
7535         sctx->last_reloc_trans = fs_info->last_reloc_trans;
7536
7537         while (1) {
7538                 if (need_resched() ||
7539                     rwsem_is_contended(&fs_info->commit_root_sem)) {
7540                         up_read(&fs_info->commit_root_sem);
7541                         cond_resched();
7542                         down_read(&fs_info->commit_root_sem);
7543                 }
7544
7545                 if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
7546                         ret = restart_after_relocation(left_path, right_path,
7547                                                        &left_key, &right_key,
7548                                                        left_level, right_level,
7549                                                        sctx);
7550                         if (ret < 0)
7551                                 goto out_unlock;
7552                         sctx->last_reloc_trans = fs_info->last_reloc_trans;
7553                 }
7554
7555                 if (advance_left && !left_end_reached) {
7556                         ret = tree_advance(left_path, &left_level,
7557                                         left_root_level,
7558                                         advance_left != ADVANCE_ONLY_NEXT,
7559                                         &left_key, reada_min_gen);
7560                         if (ret == -1)
7561                                 left_end_reached = ADVANCE;
7562                         else if (ret < 0)
7563                                 goto out_unlock;
7564                         advance_left = 0;
7565                 }
7566                 if (advance_right && !right_end_reached) {
7567                         ret = tree_advance(right_path, &right_level,
7568                                         right_root_level,
7569                                         advance_right != ADVANCE_ONLY_NEXT,
7570                                         &right_key, reada_min_gen);
7571                         if (ret == -1)
7572                                 right_end_reached = ADVANCE;
7573                         else if (ret < 0)
7574                                 goto out_unlock;
7575                         advance_right = 0;
7576                 }
7577
7578                 if (left_end_reached && right_end_reached) {
7579                         ret = 0;
7580                         goto out_unlock;
7581                 } else if (left_end_reached) {
7582                         if (right_level == 0) {
7583                                 up_read(&fs_info->commit_root_sem);
7584                                 ret = changed_cb(left_path, right_path,
7585                                                 &right_key,
7586                                                 BTRFS_COMPARE_TREE_DELETED,
7587                                                 sctx);
7588                                 if (ret < 0)
7589                                         goto out;
7590                                 down_read(&fs_info->commit_root_sem);
7591                         }
7592                         advance_right = ADVANCE;
7593                         continue;
7594                 } else if (right_end_reached) {
7595                         if (left_level == 0) {
7596                                 up_read(&fs_info->commit_root_sem);
7597                                 ret = changed_cb(left_path, right_path,
7598                                                 &left_key,
7599                                                 BTRFS_COMPARE_TREE_NEW,
7600                                                 sctx);
7601                                 if (ret < 0)
7602                                         goto out;
7603                                 down_read(&fs_info->commit_root_sem);
7604                         }
7605                         advance_left = ADVANCE;
7606                         continue;
7607                 }
7608
7609                 if (left_level == 0 && right_level == 0) {
7610                         up_read(&fs_info->commit_root_sem);
7611                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
7612                         if (cmp < 0) {
7613                                 ret = changed_cb(left_path, right_path,
7614                                                 &left_key,
7615                                                 BTRFS_COMPARE_TREE_NEW,
7616                                                 sctx);
7617                                 advance_left = ADVANCE;
7618                         } else if (cmp > 0) {
7619                                 ret = changed_cb(left_path, right_path,
7620                                                 &right_key,
7621                                                 BTRFS_COMPARE_TREE_DELETED,
7622                                                 sctx);
7623                                 advance_right = ADVANCE;
7624                         } else {
7625                                 enum btrfs_compare_tree_result result;
7626
7627                                 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
7628                                 ret = tree_compare_item(left_path, right_path,
7629                                                         tmp_buf);
7630                                 if (ret)
7631                                         result = BTRFS_COMPARE_TREE_CHANGED;
7632                                 else
7633                                         result = BTRFS_COMPARE_TREE_SAME;
7634                                 ret = changed_cb(left_path, right_path,
7635                                                  &left_key, result, sctx);
7636                                 advance_left = ADVANCE;
7637                                 advance_right = ADVANCE;
7638                         }
7639
7640                         if (ret < 0)
7641                                 goto out;
7642                         down_read(&fs_info->commit_root_sem);
7643                 } else if (left_level == right_level) {
7644                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
7645                         if (cmp < 0) {
7646                                 advance_left = ADVANCE;
7647                         } else if (cmp > 0) {
7648                                 advance_right = ADVANCE;
7649                         } else {
7650                                 left_blockptr = btrfs_node_blockptr(
7651                                                 left_path->nodes[left_level],
7652                                                 left_path->slots[left_level]);
7653                                 right_blockptr = btrfs_node_blockptr(
7654                                                 right_path->nodes[right_level],
7655                                                 right_path->slots[right_level]);
7656                                 left_gen = btrfs_node_ptr_generation(
7657                                                 left_path->nodes[left_level],
7658                                                 left_path->slots[left_level]);
7659                                 right_gen = btrfs_node_ptr_generation(
7660                                                 right_path->nodes[right_level],
7661                                                 right_path->slots[right_level]);
7662                                 if (left_blockptr == right_blockptr &&
7663                                     left_gen == right_gen) {
7664                                         /*
7665                                          * As we're on a shared block, don't
7666                                          * allow to go deeper.
7667                                          */
7668                                         advance_left = ADVANCE_ONLY_NEXT;
7669                                         advance_right = ADVANCE_ONLY_NEXT;
7670                                 } else {
7671                                         advance_left = ADVANCE;
7672                                         advance_right = ADVANCE;
7673                                 }
7674                         }
7675                 } else if (left_level < right_level) {
7676                         advance_right = ADVANCE;
7677                 } else {
7678                         advance_left = ADVANCE;
7679                 }
7680         }
7681
7682 out_unlock:
7683         up_read(&fs_info->commit_root_sem);
7684 out:
7685         btrfs_free_path(left_path);
7686         btrfs_free_path(right_path);
7687         kvfree(tmp_buf);
7688         return ret;
7689 }
7690
7691 static int send_subvol(struct send_ctx *sctx)
7692 {
7693         int ret;
7694
7695         if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
7696                 ret = send_header(sctx);
7697                 if (ret < 0)
7698                         goto out;
7699         }
7700
7701         ret = send_subvol_begin(sctx);
7702         if (ret < 0)
7703                 goto out;
7704
7705         if (sctx->parent_root) {
7706                 ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
7707                 if (ret < 0)
7708                         goto out;
7709                 ret = finish_inode_if_needed(sctx, 1);
7710                 if (ret < 0)
7711                         goto out;
7712         } else {
7713                 ret = full_send_tree(sctx);
7714                 if (ret < 0)
7715                         goto out;
7716         }
7717
7718 out:
7719         free_recorded_refs(sctx);
7720         return ret;
7721 }
7722
7723 /*
7724  * If orphan cleanup did remove any orphans from a root, it means the tree
7725  * was modified and therefore the commit root is not the same as the current
7726  * root anymore. This is a problem, because send uses the commit root and
7727  * therefore can see inode items that don't exist in the current root anymore,
7728  * and for example make calls to btrfs_iget, which will do tree lookups based
7729  * on the current root and not on the commit root. Those lookups will fail,
7730  * returning a -ESTALE error, and making send fail with that error. So make
7731  * sure a send does not see any orphans we have just removed, and that it will
7732  * see the same inodes regardless of whether a transaction commit happened
7733  * before it started (meaning that the commit root will be the same as the
7734  * current root) or not.
7735  */
7736 static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
7737 {
7738         int i;
7739         struct btrfs_trans_handle *trans = NULL;
7740
7741 again:
7742         if (sctx->parent_root &&
7743             sctx->parent_root->node != sctx->parent_root->commit_root)
7744                 goto commit_trans;
7745
7746         for (i = 0; i < sctx->clone_roots_cnt; i++)
7747                 if (sctx->clone_roots[i].root->node !=
7748                     sctx->clone_roots[i].root->commit_root)
7749                         goto commit_trans;
7750
7751         if (trans)
7752                 return btrfs_end_transaction(trans);
7753
7754         return 0;
7755
7756 commit_trans:
7757         /* Use any root, all fs roots will get their commit roots updated. */
7758         if (!trans) {
7759                 trans = btrfs_join_transaction(sctx->send_root);
7760                 if (IS_ERR(trans))
7761                         return PTR_ERR(trans);
7762                 goto again;
7763         }
7764
7765         return btrfs_commit_transaction(trans);
7766 }
7767
7768 /*
7769  * Make sure any existing dellaloc is flushed for any root used by a send
7770  * operation so that we do not miss any data and we do not race with writeback
7771  * finishing and changing a tree while send is using the tree. This could
7772  * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
7773  * a send operation then uses the subvolume.
7774  * After flushing delalloc ensure_commit_roots_uptodate() must be called.
7775  */
7776 static int flush_delalloc_roots(struct send_ctx *sctx)
7777 {
7778         struct btrfs_root *root = sctx->parent_root;
7779         int ret;
7780         int i;
7781
7782         if (root) {
7783                 ret = btrfs_start_delalloc_snapshot(root, false);
7784                 if (ret)
7785                         return ret;
7786                 btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
7787         }
7788
7789         for (i = 0; i < sctx->clone_roots_cnt; i++) {
7790                 root = sctx->clone_roots[i].root;
7791                 ret = btrfs_start_delalloc_snapshot(root, false);
7792                 if (ret)
7793                         return ret;
7794                 btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
7795         }
7796
7797         return 0;
7798 }
7799
7800 static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
7801 {
7802         spin_lock(&root->root_item_lock);
7803         root->send_in_progress--;
7804         /*
7805          * Not much left to do, we don't know why it's unbalanced and
7806          * can't blindly reset it to 0.
7807          */
7808         if (root->send_in_progress < 0)
7809                 btrfs_err(root->fs_info,
7810                           "send_in_progress unbalanced %d root %llu",
7811                           root->send_in_progress, root->root_key.objectid);
7812         spin_unlock(&root->root_item_lock);
7813 }
7814
7815 static void dedupe_in_progress_warn(const struct btrfs_root *root)
7816 {
7817         btrfs_warn_rl(root->fs_info,
7818 "cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
7819                       root->root_key.objectid, root->dedupe_in_progress);
7820 }
7821
7822 long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
7823 {
7824         int ret = 0;
7825         struct btrfs_root *send_root = BTRFS_I(inode)->root;
7826         struct btrfs_fs_info *fs_info = send_root->fs_info;
7827         struct btrfs_root *clone_root;
7828         struct send_ctx *sctx = NULL;
7829         u32 i;
7830         u64 *clone_sources_tmp = NULL;
7831         int clone_sources_to_rollback = 0;
7832         size_t alloc_size;
7833         int sort_clone_roots = 0;
7834
7835         if (!capable(CAP_SYS_ADMIN))
7836                 return -EPERM;
7837
7838         /*
7839          * The subvolume must remain read-only during send, protect against
7840          * making it RW. This also protects against deletion.
7841          */
7842         spin_lock(&send_root->root_item_lock);
7843         if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
7844                 dedupe_in_progress_warn(send_root);
7845                 spin_unlock(&send_root->root_item_lock);
7846                 return -EAGAIN;
7847         }
7848         send_root->send_in_progress++;
7849         spin_unlock(&send_root->root_item_lock);
7850
7851         /*
7852          * Userspace tools do the checks and warn the user if it's
7853          * not RO.
7854          */
7855         if (!btrfs_root_readonly(send_root)) {
7856                 ret = -EPERM;
7857                 goto out;
7858         }
7859
7860         /*
7861          * Check that we don't overflow at later allocations, we request
7862          * clone_sources_count + 1 items, and compare to unsigned long inside
7863          * access_ok. Also set an upper limit for allocation size so this can't
7864          * easily exhaust memory. Max number of clone sources is about 200K.
7865          */
7866         if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
7867                 ret = -EINVAL;
7868                 goto out;
7869         }
7870
7871         if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
7872                 ret = -EOPNOTSUPP;
7873                 goto out;
7874         }
7875
7876         sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
7877         if (!sctx) {
7878                 ret = -ENOMEM;
7879                 goto out;
7880         }
7881
7882         INIT_LIST_HEAD(&sctx->new_refs);
7883         INIT_LIST_HEAD(&sctx->deleted_refs);
7884         INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
7885         INIT_LIST_HEAD(&sctx->name_cache_list);
7886
7887         sctx->flags = arg->flags;
7888
7889         if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
7890                 if (arg->version > BTRFS_SEND_STREAM_VERSION) {
7891                         ret = -EPROTO;
7892                         goto out;
7893                 }
7894                 /* Zero means "use the highest version" */
7895                 sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
7896         } else {
7897                 sctx->proto = 1;
7898         }
7899         if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) {
7900                 ret = -EINVAL;
7901                 goto out;
7902         }
7903
7904         sctx->send_filp = fget(arg->send_fd);
7905         if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) {
7906                 ret = -EBADF;
7907                 goto out;
7908         }
7909
7910         sctx->send_root = send_root;
7911         /*
7912          * Unlikely but possible, if the subvolume is marked for deletion but
7913          * is slow to remove the directory entry, send can still be started
7914          */
7915         if (btrfs_root_dead(sctx->send_root)) {
7916                 ret = -EPERM;
7917                 goto out;
7918         }
7919
7920         sctx->clone_roots_cnt = arg->clone_sources_count;
7921
7922         if (sctx->proto >= 2) {
7923                 u32 send_buf_num_pages;
7924
7925                 sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE);
7926                 sctx->send_buf = vmalloc(sctx->send_max_size);
7927                 if (!sctx->send_buf) {
7928                         ret = -ENOMEM;
7929                         goto out;
7930                 }
7931                 send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
7932                 sctx->send_buf_pages = kcalloc(send_buf_num_pages,
7933                                                sizeof(*sctx->send_buf_pages),
7934                                                GFP_KERNEL);
7935                 if (!sctx->send_buf_pages) {
7936                         ret = -ENOMEM;
7937                         goto out;
7938                 }
7939                 for (i = 0; i < send_buf_num_pages; i++) {
7940                         sctx->send_buf_pages[i] =
7941                                 vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT));
7942                 }
7943         } else {
7944                 sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
7945                 sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
7946         }
7947         if (!sctx->send_buf) {
7948                 ret = -ENOMEM;
7949                 goto out;
7950         }
7951
7952         sctx->pending_dir_moves = RB_ROOT;
7953         sctx->waiting_dir_moves = RB_ROOT;
7954         sctx->orphan_dirs = RB_ROOT;
7955         sctx->rbtree_new_refs = RB_ROOT;
7956         sctx->rbtree_deleted_refs = RB_ROOT;
7957
7958         sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
7959                                      arg->clone_sources_count + 1,
7960                                      GFP_KERNEL);
7961         if (!sctx->clone_roots) {
7962                 ret = -ENOMEM;
7963                 goto out;
7964         }
7965
7966         alloc_size = array_size(sizeof(*arg->clone_sources),
7967                                 arg->clone_sources_count);
7968
7969         if (arg->clone_sources_count) {
7970                 clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
7971                 if (!clone_sources_tmp) {
7972                         ret = -ENOMEM;
7973                         goto out;
7974                 }
7975
7976                 ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
7977                                 alloc_size);
7978                 if (ret) {
7979                         ret = -EFAULT;
7980                         goto out;
7981                 }
7982
7983                 for (i = 0; i < arg->clone_sources_count; i++) {
7984                         clone_root = btrfs_get_fs_root(fs_info,
7985                                                 clone_sources_tmp[i], true);
7986                         if (IS_ERR(clone_root)) {
7987                                 ret = PTR_ERR(clone_root);
7988                                 goto out;
7989                         }
7990                         spin_lock(&clone_root->root_item_lock);
7991                         if (!btrfs_root_readonly(clone_root) ||
7992                             btrfs_root_dead(clone_root)) {
7993                                 spin_unlock(&clone_root->root_item_lock);
7994                                 btrfs_put_root(clone_root);
7995                                 ret = -EPERM;
7996                                 goto out;
7997                         }
7998                         if (clone_root->dedupe_in_progress) {
7999                                 dedupe_in_progress_warn(clone_root);
8000                                 spin_unlock(&clone_root->root_item_lock);
8001                                 btrfs_put_root(clone_root);
8002                                 ret = -EAGAIN;
8003                                 goto out;
8004                         }
8005                         clone_root->send_in_progress++;
8006                         spin_unlock(&clone_root->root_item_lock);
8007
8008                         sctx->clone_roots[i].root = clone_root;
8009                         clone_sources_to_rollback = i + 1;
8010                 }
8011                 kvfree(clone_sources_tmp);
8012                 clone_sources_tmp = NULL;
8013         }
8014
8015         if (arg->parent_root) {
8016                 sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root,
8017                                                       true);
8018                 if (IS_ERR(sctx->parent_root)) {
8019                         ret = PTR_ERR(sctx->parent_root);
8020                         goto out;
8021                 }
8022
8023                 spin_lock(&sctx->parent_root->root_item_lock);
8024                 sctx->parent_root->send_in_progress++;
8025                 if (!btrfs_root_readonly(sctx->parent_root) ||
8026                                 btrfs_root_dead(sctx->parent_root)) {
8027                         spin_unlock(&sctx->parent_root->root_item_lock);
8028                         ret = -EPERM;
8029                         goto out;
8030                 }
8031                 if (sctx->parent_root->dedupe_in_progress) {
8032                         dedupe_in_progress_warn(sctx->parent_root);
8033                         spin_unlock(&sctx->parent_root->root_item_lock);
8034                         ret = -EAGAIN;
8035                         goto out;
8036                 }
8037                 spin_unlock(&sctx->parent_root->root_item_lock);
8038         }
8039
8040         /*
8041          * Clones from send_root are allowed, but only if the clone source
8042          * is behind the current send position. This is checked while searching
8043          * for possible clone sources.
8044          */
8045         sctx->clone_roots[sctx->clone_roots_cnt++].root =
8046                 btrfs_grab_root(sctx->send_root);
8047
8048         /* We do a bsearch later */
8049         sort(sctx->clone_roots, sctx->clone_roots_cnt,
8050                         sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
8051                         NULL);
8052         sort_clone_roots = 1;
8053
8054         ret = flush_delalloc_roots(sctx);
8055         if (ret)
8056                 goto out;
8057
8058         ret = ensure_commit_roots_uptodate(sctx);
8059         if (ret)
8060                 goto out;
8061
8062         ret = send_subvol(sctx);
8063         if (ret < 0)
8064                 goto out;
8065
8066         if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
8067                 ret = begin_cmd(sctx, BTRFS_SEND_C_END);
8068                 if (ret < 0)
8069                         goto out;
8070                 ret = send_cmd(sctx);
8071                 if (ret < 0)
8072                         goto out;
8073         }
8074
8075 out:
8076         WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
8077         while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
8078                 struct rb_node *n;
8079                 struct pending_dir_move *pm;
8080
8081                 n = rb_first(&sctx->pending_dir_moves);
8082                 pm = rb_entry(n, struct pending_dir_move, node);
8083                 while (!list_empty(&pm->list)) {
8084                         struct pending_dir_move *pm2;
8085
8086                         pm2 = list_first_entry(&pm->list,
8087                                                struct pending_dir_move, list);
8088                         free_pending_move(sctx, pm2);
8089                 }
8090                 free_pending_move(sctx, pm);
8091         }
8092
8093         WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
8094         while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
8095                 struct rb_node *n;
8096                 struct waiting_dir_move *dm;
8097
8098                 n = rb_first(&sctx->waiting_dir_moves);
8099                 dm = rb_entry(n, struct waiting_dir_move, node);
8100                 rb_erase(&dm->node, &sctx->waiting_dir_moves);
8101                 kfree(dm);
8102         }
8103
8104         WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
8105         while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
8106                 struct rb_node *n;
8107                 struct orphan_dir_info *odi;
8108
8109                 n = rb_first(&sctx->orphan_dirs);
8110                 odi = rb_entry(n, struct orphan_dir_info, node);
8111                 free_orphan_dir_info(sctx, odi);
8112         }
8113
8114         if (sort_clone_roots) {
8115                 for (i = 0; i < sctx->clone_roots_cnt; i++) {
8116                         btrfs_root_dec_send_in_progress(
8117                                         sctx->clone_roots[i].root);
8118                         btrfs_put_root(sctx->clone_roots[i].root);
8119                 }
8120         } else {
8121                 for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
8122                         btrfs_root_dec_send_in_progress(
8123                                         sctx->clone_roots[i].root);
8124                         btrfs_put_root(sctx->clone_roots[i].root);
8125                 }
8126
8127                 btrfs_root_dec_send_in_progress(send_root);
8128         }
8129         if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
8130                 btrfs_root_dec_send_in_progress(sctx->parent_root);
8131                 btrfs_put_root(sctx->parent_root);
8132         }
8133
8134         kvfree(clone_sources_tmp);
8135
8136         if (sctx) {
8137                 if (sctx->send_filp)
8138                         fput(sctx->send_filp);
8139
8140                 kvfree(sctx->clone_roots);
8141                 kfree(sctx->send_buf_pages);
8142                 kvfree(sctx->send_buf);
8143                 kvfree(sctx->verity_descriptor);
8144
8145                 name_cache_free(sctx);
8146
8147                 close_current_inode(sctx);
8148
8149                 kfree(sctx);
8150         }
8151
8152         return ret;
8153 }