// SPDX-License-Identifier: GPL-2.0

/* erasure coding */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "util.h"

#include <linux/sort.h>

#ifdef __KERNEL__

#include <linux/raid/pq.h>
#include <linux/raid/xor.h>

static void raid5_recov(unsigned disks, unsigned failed_idx,
			size_t size, void **data)
{
	unsigned i = 2, nr;

	BUG_ON(failed_idx >= disks);

	swap(data[0], data[failed_idx]);
	memcpy(data[0], data[1], size);

	while (i < disks) {
		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
		xor_blocks(nr, size, data[0], data + i);
		i += nr;
	}

	swap(data[0], data[failed_idx]);
}
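
/*
 * Illustrative sketch of what raid5_recov() computes (hypothetical values,
 * not from this file): with disks = 4 and failed_idx = 2, the missing block
 * is rebuilt as the XOR of all surviving blocks, since
 * P = D0 ^ D1 ^ D2 implies D2 = D0 ^ D1 ^ P. Byte-wise, e.g.:
 *
 *	u8 d0 = 0x0f, d1 = 0x33, p = d0 ^ d1 ^ 0x55;
 *	u8 recovered = d0 ^ d1 ^ p;	// == 0x55
 *
 * The swap()s above just let the failed slot double as the XOR accumulator,
 * so xor_blocks() can run over the remaining blocks in MAX_XOR_BLOCKS-sized
 * batches.
 */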

static void raid_gen(int nd, int np, size_t size, void **v)
{
	if (np >= 1)
		raid5_recov(nd + np, nd, size, v);
	if (np >= 2)
		raid6_call.gen_syndrome(nd + np, size, v);
	BUG_ON(np > 2);
}

static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
{
	switch (nr) {
	case 0:
		break;
	case 1:
		if (ir[0] < nd + 1)
			raid5_recov(nd + 1, ir[0], size, v);
		else
			raid6_call.gen_syndrome(nd + np, size, v);
		break;
	case 2:
		if (ir[1] < nd) {
			/* data+data failure. */
			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
		} else if (ir[0] < nd) {
			/* data + p/q failure */

			if (ir[1] == nd) /* data + p failure */
				raid6_datap_recov(nd + np, size, ir[0], v);
			else { /* data + q failure */
				raid5_recov(nd + 1, ir[0], size, v);
				raid6_call.gen_syndrome(nd + np, size, v);
			}
		} else {
			raid_gen(nd, np, size, v);
		}
		break;
	default:
		BUG();
	}
}
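
/*
 * A quick map of the raid_rec() cases, under this file's layout where
 * blocks 0..nd-1 are data, block nd is P and block nd+1 is Q (numbers below
 * are hypothetical, for illustration): with nd = 4, np = 2,
 *
 *	int ir[2] = { 1, 3 };			// two data blocks lost
 *	raid_rec(2, ir, 4, 2, size, v);		// -> raid6_2data_recov()
 *
 *	ir[0] = 1; ir[1] = 4;			// data + P lost
 *	raid_rec(2, ir, 4, 2, size, v);		// -> raid6_datap_recov()
 *
 * Losing only parity degenerates to regenerating it via raid_gen(); ir[]
 * must be sorted ascending for the case tests above to work.
 */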

#else

#include <raid/raid.h>

#endif

struct ec_bio {
	struct bch_dev		*ca;
	struct ec_stripe_buf	*buf;
	size_t			idx;
	struct bio		bio;
};

/* Stripes btree keys: */

int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
			enum bkey_invalid_flags flags,
			struct printbuf *err)
{
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
	int ret = 0;

	bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
			 bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
			 stripe_pos_bad,
			 "stripe at bad pos");

	bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
			 stripe_val_size_bad,
			 "incorrect value size (%zu < %u)",
			 bkey_val_u64s(k.k), stripe_val_u64s(s));

	ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
	return ret;
}

void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
			 struct bkey_s_c k)
{
	const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
	struct bch_stripe s = {};

	memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));

	unsigned nr_data = s.nr_blocks - s.nr_redundant;

	prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
		   s.algorithm,
		   le16_to_cpu(s.sectors),
		   nr_data,
		   s.nr_redundant);
	bch2_prt_csum_type(out, s.csum_type);
	prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);

	for (unsigned i = 0; i < s.nr_blocks; i++) {
		const struct bch_extent_ptr *ptr = sp->ptrs + i;

		if ((void *) ptr >= bkey_val_end(k))
			break;

		bch2_extent_ptr_to_text(out, c, ptr);

		if (s.csum_type < BCH_CSUM_NR &&
		    i < nr_data &&
		    stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
			prt_printf(out, "#%u", stripe_blockcount_get(sp, i));
	}
}
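
/*
 * Going by the format strings above, output for a 4+2 stripe might look
 * something like (values made up for illustration):
 *
 *	algo 0 sectors 256 blocks 4:2 csum crc32c gran 8 <ptr>#256 <ptr>#256 ...
 *
 * with one pointer per stripe block, and a "#<blockcount>" suffix on the
 * data blocks only.
 */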

/* Triggers: */

static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
					 struct bkey_s_c_stripe s,
					 unsigned idx, bool deleting)
{
	struct bch_fs *c = trans->c;
	const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
	struct btree_iter iter;
	struct bkey_i_alloc_v4 *a;
	enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
		? BCH_DATA_parity : 0;
	s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
	int ret = 0;

	if (deleting)
		sectors = -sectors;

	a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
	if (IS_ERR(a))
		return PTR_ERR(a);

	ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
				    a->v.gen, a->v.data_type,
				    a->v.dirty_sectors);
	if (ret)
		goto err;

	if (!deleting) {
		if (bch2_trans_inconsistent_on(a->v.stripe ||
					       a->v.stripe_redundancy, trans,
				"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
				iter.pos.inode, iter.pos.offset, a->v.gen,
				bch2_data_type_str(a->v.data_type),
				a->v.dirty_sectors,
				a->v.stripe, s.k->p.offset)) {
			ret = -EIO;
			goto err;
		}

		if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
				"bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
				iter.pos.inode, iter.pos.offset, a->v.gen,
				bch2_data_type_str(a->v.data_type),
				a->v.dirty_sectors,
				s.k->p.offset)) {
			ret = -EIO;
			goto err;
		}

		a->v.stripe		= s.k->p.offset;
		a->v.stripe_redundancy	= s.v->nr_redundant;
		a->v.data_type		= BCH_DATA_stripe;
	} else {
		if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
					       a->v.stripe_redundancy != s.v->nr_redundant, trans,
				"bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
				iter.pos.inode, iter.pos.offset, a->v.gen,
				s.k->p.offset, a->v.stripe)) {
			ret = -EIO;
			goto err;
		}

		a->v.stripe		= 0;
		a->v.stripe_redundancy	= 0;
		a->v.data_type		= alloc_data_type(a->v, BCH_DATA_user);
	}

	a->v.dirty_sectors += sectors;
	if (data_type)
		a->v.data_type = !deleting ? data_type : 0;

	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int mark_stripe_bucket(struct btree_trans *trans,
			      struct bkey_s_c k,
			      unsigned ptr_idx,
			      unsigned flags)
{
	struct bch_fs *c = trans->c;
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
	unsigned nr_data = s->nr_blocks - s->nr_redundant;
	bool parity = ptr_idx >= nr_data;
	enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
	s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
	const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
	struct bucket old, new, *g;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	BUG_ON(!(flags & BTREE_TRIGGER_GC));

	/* XXX: doesn't handle deletion */

	percpu_down_read(&c->mark_lock);
	g = PTR_GC_BUCKET(ca, ptr);

	if (g->dirty_sectors ||
	    (g->stripe && g->stripe != k.k->p.offset)) {
		bch2_fs_inconsistent(c,
			      "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
			      ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
			      (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		ret = -EINVAL;
		goto err;
	}

	bucket_lock(g);
	old = *g;

	ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type,
				    g->gen, g->data_type,
				    g->dirty_sectors);
	if (ret)
		goto err_unlock;

	g->data_type = data_type;
	g->dirty_sectors += sectors;

	g->stripe		= k.k->p.offset;
	g->stripe_redundancy	= s->nr_redundant;
	new = *g;
err_unlock:
	bucket_unlock(g);
err:
	if (!ret)
		bch2_dev_usage_update_m(c, ca, &old, &new);
	percpu_up_read(&c->mark_lock);
	printbuf_exit(&buf);
	return ret;
}

int bch2_trigger_stripe(struct btree_trans *trans,
			enum btree_id btree_id, unsigned level,
			struct bkey_s_c old, struct bkey_s _new,
			unsigned flags)
{
	struct bkey_s_c new = _new.s_c;
	struct bch_fs *c = trans->c;
	u64 idx = new.k->p.offset;
	const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
		? bkey_s_c_to_stripe(old).v : NULL;
	const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
		? bkey_s_c_to_stripe(new).v : NULL;

	if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
		/*
		 * If the pointers aren't changing, we don't need to do anything:
		 */
		if (new_s && old_s &&
		    new_s->nr_blocks	== old_s->nr_blocks &&
		    new_s->nr_redundant	== old_s->nr_redundant &&
		    !memcmp(old_s->ptrs, new_s->ptrs,
			    new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
			return 0;

		BUG_ON(new_s && old_s &&
		       (new_s->nr_blocks	!= old_s->nr_blocks ||
			new_s->nr_redundant	!= old_s->nr_redundant));

		if (new_s) {
			s64 sectors = le16_to_cpu(new_s->sectors);

			struct bch_replicas_padded r;
			bch2_bkey_to_replicas(&r.e, new);
			int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
			if (ret)
				return ret;
		}

		if (old_s) {
			s64 sectors = -((s64) le16_to_cpu(old_s->sectors));

			struct bch_replicas_padded r;
			bch2_bkey_to_replicas(&r.e, old);
			int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
			if (ret)
				return ret;
		}

		unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
		for (unsigned i = 0; i < nr_blocks; i++) {
			if (new_s && old_s &&
			    !memcmp(&new_s->ptrs[i],
				    &old_s->ptrs[i],
				    sizeof(new_s->ptrs[i])))
				continue;

			if (new_s) {
				int ret = bch2_trans_mark_stripe_bucket(trans,
						bkey_s_c_to_stripe(new), i, false);
				if (ret)
					return ret;
			}

			if (old_s) {
				int ret = bch2_trans_mark_stripe_bucket(trans,
						bkey_s_c_to_stripe(old), i, true);
				if (ret)
					return ret;
			}
		}
	}

	if (flags & BTREE_TRIGGER_ATOMIC) {
		struct stripe *m = genradix_ptr(&c->stripes, idx);

		if (!m) {
			struct printbuf buf1 = PRINTBUF;
			struct printbuf buf2 = PRINTBUF;

			bch2_bkey_val_to_text(&buf1, c, old);
			bch2_bkey_val_to_text(&buf2, c, new);
			bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
					    "old %s\n"
					    "new %s", idx, buf1.buf, buf2.buf);
			printbuf_exit(&buf2);
			printbuf_exit(&buf1);
			bch2_inconsistent_error(c);
			return -1;
		}

		if (!new_s) {
			bch2_stripes_heap_del(c, m, idx);

			memset(m, 0, sizeof(*m));
		} else {
			m->sectors	= le16_to_cpu(new_s->sectors);
			m->algorithm	= new_s->algorithm;
			m->nr_blocks	= new_s->nr_blocks;
			m->nr_redundant	= new_s->nr_redundant;
			m->blocks_nonempty = 0;

			for (unsigned i = 0; i < new_s->nr_blocks; i++)
				m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);

			if (!old_s)
				bch2_stripes_heap_insert(c, m, idx);
			else
				bch2_stripes_heap_update(c, m, idx);
		}
	}

	if (flags & BTREE_TRIGGER_GC) {
		struct gc_stripe *m =
			genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);

		if (!m) {
			bch_err(c, "error allocating memory for gc_stripes, idx %llu",
				idx);
			return -BCH_ERR_ENOMEM_mark_stripe;
		}
		/*
		 * This will be wrong when we bring back runtime gc: we should
		 * be unmarking the old key and then marking the new key
		 */
		m->alive	= true;
		m->sectors	= le16_to_cpu(new_s->sectors);
		m->nr_blocks	= new_s->nr_blocks;
		m->nr_redundant	= new_s->nr_redundant;

		for (unsigned i = 0; i < new_s->nr_blocks; i++)
			m->ptrs[i] = new_s->ptrs[i];

		bch2_bkey_to_replicas(&m->r.e, new);

		/*
		 * gc recalculates this field from stripe ptr
		 * references:
		 */
		memset(m->block_sectors, 0, sizeof(m->block_sectors));

		for (unsigned i = 0; i < new_s->nr_blocks; i++) {
			int ret = mark_stripe_bucket(trans, new, i, flags);
			if (ret)
				return ret;
		}

		int ret = bch2_update_replicas(c, new, &m->r.e,
				      ((s64) m->sectors * m->nr_redundant),
				      0, true);
		if (ret) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, new);
			bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
			printbuf_exit(&buf);
			return ret;
		}
	}

	return 0;
}
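
/*
 * Rough shape of the trigger above, for orientation (a sketch, not an
 * exhaustive contract): a stripe update runs through up to three phases,
 *
 *	TRANSACTIONAL:	replicas accounting deltas plus per-bucket alloc key
 *			updates, staged in the btree transaction;
 *	ATOMIC:		in-memory c->stripes / stripes heap bookkeeping at
 *			commit time;
 *	GC:		rebuild of c->gc_stripes and bucket marks during
 *			mark and sweep.
 *
 * E.g. deleting a stripe passes old_s != NULL, new_s == NULL, so the
 * TRANSACTIONAL phase subtracts the replicas sectors and the ATOMIC phase
 * drops the stripe from the heap.
 */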

/* Returns the matching pointer, and sets *block to the stripe block we matched: */
static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
						struct bkey_s_c k, unsigned *block)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	unsigned i, nr_data = s->nr_blocks - s->nr_redundant;

	bkey_for_each_ptr(ptrs, ptr)
		for (i = 0; i < nr_data; i++)
			if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
						      le16_to_cpu(s->sectors))) {
				*block = i;
				return ptr;
			}

	return NULL;
}

static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
{
	switch (k.k->type) {
	case KEY_TYPE_extent: {
		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
		const union bch_extent_entry *entry;

		extent_for_each_entry(e, entry)
			if (extent_entry_type(entry) ==
			    BCH_EXTENT_ENTRY_stripe_ptr &&
			    entry->stripe_ptr.idx == idx)
				return true;

		break;
	}
	}

	return false;
}

/* Stripe bufs: */

static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
{
	if (buf->key.k.type == KEY_TYPE_stripe) {
		struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
		unsigned i;

		for (i = 0; i < s->v.nr_blocks; i++) {
			kvfree(buf->data[i]);
			buf->data[i] = NULL;
		}
	}
}

/* XXX: this is a non-mempoolified memory allocation: */
static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
			      unsigned offset, unsigned size)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned csum_granularity = 1U << v->csum_granularity_bits;
	unsigned end = offset + size;
	unsigned i;

	BUG_ON(end > le16_to_cpu(v->sectors));

	offset	= round_down(offset, csum_granularity);
	end	= min_t(unsigned, le16_to_cpu(v->sectors),
			round_up(end, csum_granularity));

	buf->offset	= offset;
	buf->size	= end - offset;

	memset(buf->valid, 0xFF, sizeof(buf->valid));

	for (i = 0; i < v->nr_blocks; i++) {
		buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
		if (!buf->data[i])
			goto err;
	}

	return 0;
err:
	ec_stripe_buf_exit(buf);
	return -BCH_ERR_ENOMEM_stripe_buf;
}
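
/*
 * Worked example of the rounding above (hypothetical numbers): with
 * csum_granularity_bits = 3 (granularity 8 sectors) and stripe sectors =
 * 256, a request for offset = 13, size = 10 becomes
 *
 *	offset	= round_down(13, 8)		= 8
 *	end	= min(256, round_up(23, 8))	= 24
 *	buf->size = 24 - 8			= 16 sectors
 *
 * i.e. the buffer is widened to whole checksum granules, so every granule
 * it covers can be verified.
 */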

/* Checksumming: */

static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
					 unsigned block, unsigned offset)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned end = buf->offset + buf->size;
	unsigned len = min(csum_granularity, end - offset);

	BUG_ON(offset >= end);
	BUG_ON(offset <  buf->offset);
	BUG_ON(offset & (csum_granularity - 1));
	BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
	       (len & (csum_granularity - 1)));

	return bch2_checksum(NULL, v->csum_type,
			     null_nonce(),
			     buf->data[block] + ((offset - buf->offset) << 9),
			     len << 9);
}

static void ec_generate_checksums(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned i, j, csums_per_device = stripe_csums_per_device(v);

	if (!v->csum_type)
		return;

	BUG_ON(buf->offset);
	BUG_ON(buf->size != le16_to_cpu(v->sectors));

	for (i = 0; i < v->nr_blocks; i++)
		for (j = 0; j < csums_per_device; j++)
			stripe_csum_set(v, i, j,
				ec_block_checksum(buf, i, j << v->csum_granularity_bits));
}
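
/*
 * The checksums form a blocks x granules grid stored in the stripe key.
 * For illustration, with sectors = 256 and granularity 8 there are
 * 256 / 8 = 32 checksums per block, and checksum (i, j) covers sectors
 * [j * 8, j * 8 + 8) of block i. The last granule may be short when
 * sectors isn't a multiple of the granularity, which is what the final
 * BUG_ON() in ec_block_checksum() allows for.
 */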

static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned csum_granularity = 1 << v->csum_granularity_bits;
	unsigned i;

	if (!v->csum_type)
		return;

	for (i = 0; i < v->nr_blocks; i++) {
		unsigned offset = buf->offset;
		unsigned end = buf->offset + buf->size;

		if (!test_bit(i, buf->valid))
			continue;

		while (offset < end) {
			unsigned j = offset >> v->csum_granularity_bits;
			unsigned len = min(csum_granularity, end - offset);
			struct bch_csum want = stripe_csum_get(v, i, j);
			struct bch_csum got = ec_block_checksum(buf, i, offset);

			if (bch2_crc_cmp(want, got)) {
				struct printbuf err = PRINTBUF;
				struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);

				prt_str(&err, "stripe ");
				bch2_csum_err_msg(&err, v->csum_type, want, got);
				prt_printf(&err, "  for %ps at %u of\n  ", (void *) _RET_IP_, i);
				bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
				bch_err_ratelimited(ca, "%s", err.buf);
				printbuf_exit(&err);

				clear_bit(i, buf->valid);

				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
				break;
			}

			offset += len;
		}
	}
}

/* Erasure coding: */

static void ec_generate_ec(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = le16_to_cpu(v->sectors) << 9;

	raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
}

static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;

	return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
}

static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
	unsigned nr_data = v->nr_blocks - v->nr_redundant;
	unsigned bytes = buf->size << 9;

	if (ec_nr_failed(buf) > v->nr_redundant) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: unable to read enough blocks");
		return -1;
	}

	for (i = 0; i < nr_data; i++)
		if (!test_bit(i, buf->valid))
			failed[nr_failed++] = i;

	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
	return 0;
}
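
/*
 * Example of how ec_do_recov() drives raid_rec() (hypothetical state): for
 * a 4+2 stripe with buf->valid = 0b110101 (blocks 1 and 3 unreadable), the
 * loop collects failed[] = { 1, 3 }, nr_failed = 2, and
 *
 *	raid_rec(2, failed, 4, 2, buf->size << 9, buf->data);
 *
 * rebuilds both data blocks from the surviving data plus parity. Note that
 * only failed *data* blocks are listed: for a recovery read, failed parity
 * blocks don't need reconstructing.
 */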

/* IO: */

static void ec_block_endio(struct bio *bio)
{
	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
	struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
	struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
	struct bch_dev *ca = ec_bio->ca;
	struct closure *cl = bio->bi_private;

	if (bch2_dev_io_err_on(bio->bi_status, ca,
			       bio_data_dir(bio)
			       ? BCH_MEMBER_ERROR_write
			       : BCH_MEMBER_ERROR_read,
			       "erasure coding %s error: %s",
			       bio_data_dir(bio) ? "write" : "read",
			       bch2_blk_status_to_str(bio->bi_status)))
		clear_bit(ec_bio->idx, ec_bio->buf->valid);

	if (ptr_stale(ca, ptr)) {
		bch_err_ratelimited(ca->fs,
				    "error %s stripe: stale pointer after io",
				    bio_data_dir(bio) == READ ? "reading from" : "writing to");
		clear_bit(ec_bio->idx, ec_bio->buf->valid);
	}

	bio_put(&ec_bio->bio);
	percpu_ref_put(&ca->io_ref);
	closure_put(cl);
}

static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
			blk_opf_t opf, unsigned idx, struct closure *cl)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
	unsigned offset = 0, bytes = buf->size << 9;
	struct bch_extent_ptr *ptr = &v->ptrs[idx];
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
	enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
		? BCH_DATA_user
		: BCH_DATA_parity;
	int rw = op_is_write(opf);

	if (ptr_stale(ca, ptr)) {
		bch_err_ratelimited(c,
				    "error %s stripe: stale pointer",
				    rw == READ ? "reading from" : "writing to");
		clear_bit(idx, buf->valid);
		return;
	}

	if (!bch2_dev_get_ioref(ca, rw)) {
		clear_bit(idx, buf->valid);
		return;
	}

	this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);

	while (offset < bytes) {
		unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
					   DIV_ROUND_UP(bytes, PAGE_SIZE));
		unsigned b = min_t(size_t, bytes - offset,
				   nr_iovecs << PAGE_SHIFT);
		struct ec_bio *ec_bio;

		ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
						       nr_iovecs,
						       opf,
						       GFP_KERNEL,
						       &c->ec_bioset),
				      struct ec_bio, bio);

		ec_bio->ca			= ca;
		ec_bio->buf			= buf;
		ec_bio->idx			= idx;

		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (offset >> 9);
		ec_bio->bio.bi_end_io		= ec_block_endio;
		ec_bio->bio.bi_private		= cl;

		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);

		closure_get(cl);
		percpu_ref_get(&ca->io_ref);

		submit_bio(&ec_bio->bio);

		offset += b;
	}

	percpu_ref_put(&ca->io_ref);
}
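
/*
 * The loop above splits one block's worth of buffer into bios of at most
 * BIO_MAX_VECS pages each. For illustration, with 4K pages and a 2M block
 * (bytes = 2097152): DIV_ROUND_UP(bytes, PAGE_SIZE) = 512, so nr_iovecs
 * caps at BIO_MAX_VECS (256) and each iteration maps b = 256 << 12 = 1M,
 * i.e. two bios are submitted, each holding its own closure and io_ref
 * until ec_block_endio() drops them.
 */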

static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
				struct ec_stripe_buf *stripe)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
			       POS(0, idx), BTREE_ITER_SLOTS);
	ret = bkey_err(k);
	if (ret)
		goto err;
	if (k.k->type != KEY_TYPE_stripe) {
		ret = -ENOENT;
		goto err;
	}
	bkey_reassemble(&stripe->key, k);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* recovery read path: */
int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
{
	struct bch_fs *c = trans->c;
	struct ec_stripe_buf *buf;
	struct closure cl;
	struct bch_stripe *v;
	unsigned i, offset;
	int ret = 0;

	closure_init_stack(&cl);

	BUG_ON(!rbio->pick.has_ec);

	buf = kzalloc(sizeof(*buf), GFP_NOFS);
	if (!buf)
		return -BCH_ERR_ENOMEM_ec_read_extent;

	ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
	if (ret) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: error %i looking up stripe", ret);
		kfree(buf);
		return -EIO;
	}

	v = &bkey_i_to_stripe(&buf->key)->v;

	if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: pointer doesn't match stripe");
		ret = -EIO;
		goto err;
	}

	offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
	if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: read is bigger than stripe");
		ret = -EIO;
		goto err;
	}

	ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
	if (ret)
		goto err;

	for (i = 0; i < v->nr_blocks; i++)
		ec_block_io(c, buf, REQ_OP_READ, i, &cl);

	closure_sync(&cl);

	if (ec_nr_failed(buf) > v->nr_redundant) {
		bch_err_ratelimited(c,
			"error doing reconstruct read: unable to read enough blocks");
		ret = -EIO;
		goto err;
	}

	ec_validate_checksums(c, buf);

	ret = ec_do_recov(c, buf);
	if (ret)
		goto err;

	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
		      buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
err:
	ec_stripe_buf_exit(buf);
	kfree(buf);
	return ret;
}
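
/*
 * Summarizing the reconstruct read above (a sketch of the flow, not an API
 * contract): look up the stripe key, check that the extent pointer really
 * belongs to it, size a stripe buffer to just the affected range, read that
 * range from every block, drop blocks that fail IO or checksum
 * verification, recover via raid_rec(), then copy the caller's range out
 * of the rebuilt data block. Any failure surfaces to the caller as -EIO.
 */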

/* stripe bucket accounting: */

static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
	ec_stripes_heap n, *h = &c->ec_stripes_heap;

	if (idx >= h->size) {
		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
			return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

		mutex_lock(&c->ec_stripes_heap_lock);
		if (n.size > h->size) {
			memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
			n.used = h->used;
			swap(*h, n);
		}
		mutex_unlock(&c->ec_stripes_heap_lock);

		free_heap(&n);
	}

	if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

	if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
	    !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;

	return 0;
}
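
/*
 * Growth pattern of the stripes heap, by way of example (made-up numbers):
 * first use of stripe idx 5000 allocates a heap of
 * roundup_pow_of_two(5001) = 8192 entries, copies the live entries over
 * under ec_stripes_heap_lock, and frees the old array through the
 * swapped-out `n`. The `n.size > h->size` recheck makes concurrent growers
 * safe: the loser of the race simply frees its new allocation.
 */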

static int ec_stripe_mem_alloc(struct btree_trans *trans,
			       struct btree_iter *iter)
{
	return allocate_dropping_locks_errcode(trans,
			__ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
}

/*
 * Hash table of open stripes:
 * Stripes that are being created or modified are kept in a hash table, so that
 * stripe deletion can skip them.
 */

static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
	unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
	struct ec_stripe_new *s;

	hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
		if (s->idx == idx)
			return true;
	return false;
}

static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
	bool ret = false;

	spin_lock(&c->ec_stripes_new_lock);
	ret = __bch2_stripe_is_open(c, idx);
	spin_unlock(&c->ec_stripes_new_lock);

	return ret;
}

static bool bch2_try_open_stripe(struct bch_fs *c,
				 struct ec_stripe_new *s,
				 u64 idx)
{
	bool ret;

	spin_lock(&c->ec_stripes_new_lock);
	ret = !__bch2_stripe_is_open(c, idx);
	if (ret) {
		unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));

		s->idx = idx;
		hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
	}
	spin_unlock(&c->ec_stripes_new_lock);

	return ret;
}

static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
{
	BUG_ON(!s->idx);

	spin_lock(&c->ec_stripes_new_lock);
	hlist_del_init(&s->hash);
	spin_unlock(&c->ec_stripes_new_lock);

	s->idx = 0;
}

/* Heap of all existing stripes, ordered by blocks_nonempty */

static u64 stripe_idx_to_delete(struct bch_fs *c)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;

	lockdep_assert_held(&c->ec_stripes_heap_lock);

	if (h->used &&
	    h->data[0].blocks_nonempty == 0 &&
	    !bch2_stripe_is_open(c, h->data[0].idx))
		return h->data[0].idx;

	return 0;
}

static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
				      struct ec_stripe_heap_entry l,
				      struct ec_stripe_heap_entry r)
{
	return ((l.blocks_nonempty > r.blocks_nonempty) -
		(l.blocks_nonempty < r.blocks_nonempty));
}
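
/*
 * ec_stripes_heap_cmp() is the usual branchless three-way comparison: the
 * difference of the two bool comparisons yields -1, 0 or 1. E.g.
 * l.blocks_nonempty = 0, r.blocks_nonempty = 3 gives (0 > 3) - (0 < 3) =
 * -1, so stripes with no live data sort to the root of the heap, where
 * stripe_idx_to_delete() looks for them.
 */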

static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
						   size_t i)
{
	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);

	genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
}

static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m = genradix_ptr(&c->stripes, idx);

	BUG_ON(m->heap_idx >= h->used);
	BUG_ON(h->data[m->heap_idx].idx != idx);
}

void bch2_stripes_heap_del(struct bch_fs *c,
			   struct stripe *m, size_t idx)
{
	mutex_lock(&c->ec_stripes_heap_lock);
	heap_verify_backpointer(c, idx);

	heap_del(&c->ec_stripes_heap, m->heap_idx,
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);
	mutex_unlock(&c->ec_stripes_heap_lock);
}

void bch2_stripes_heap_insert(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	mutex_lock(&c->ec_stripes_heap_lock);
	BUG_ON(heap_full(&c->ec_stripes_heap));

	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
			.idx = idx,
			.blocks_nonempty = m->blocks_nonempty,
		}),
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);

	heap_verify_backpointer(c, idx);
	mutex_unlock(&c->ec_stripes_heap_lock);
}

void bch2_stripes_heap_update(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	bool do_deletes;
	size_t i;

	mutex_lock(&c->ec_stripes_heap_lock);
	heap_verify_backpointer(c, idx);

	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;

	i = m->heap_idx;
	heap_sift_up(h,   i, ec_stripes_heap_cmp,
		     ec_stripes_heap_set_backpointer);
	heap_sift_down(h, i, ec_stripes_heap_cmp,
		       ec_stripes_heap_set_backpointer);

	heap_verify_backpointer(c, idx);

	do_deletes = stripe_idx_to_delete(c) != 0;
	mutex_unlock(&c->ec_stripes_heap_lock);

	if (do_deletes)
		bch2_do_stripe_deletes(c);
}

/* stripe deletion */

static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_stripe s;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
			       BTREE_ITER_INTENT);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_stripe) {
		bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
		ret = -EINVAL;
		goto err;
	}

	s = bkey_s_c_to_stripe(k);
	for (unsigned i = 0; i < s.v->nr_blocks; i++)
		if (stripe_blockcount_get(s.v, i)) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, k);
			bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
			printbuf_exit(&buf);
			ret = -EINVAL;
			goto err;
		}

	ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static void ec_stripe_delete_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, ec_stripe_delete_work);

	while (1) {
		mutex_lock(&c->ec_stripes_heap_lock);
		u64 idx = stripe_idx_to_delete(c);
		mutex_unlock(&c->ec_stripes_heap_lock);

		if (!idx)
			break;

		int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
					ec_stripe_delete(trans, idx));
		bch_err_fn(c, ret);
		if (ret)
			break;
	}

	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}

void bch2_do_stripe_deletes(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
	    !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}

/* stripe creation: */

static int ec_stripe_key_update(struct btree_trans *trans,
				struct bkey_i_stripe *new,
				bool create)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
			       new->k.p, BTREE_ITER_INTENT);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
		bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
				     create ? "creating" : "updating",
				     bch2_bkey_types[k.k->type]);
		ret = -EINVAL;
		goto err;
	}

	if (k.k->type == KEY_TYPE_stripe) {
		const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
		unsigned i;

		if (old->nr_blocks != new->v.nr_blocks) {
			bch_err(c, "error updating stripe: nr_blocks does not match");
			ret = -EINVAL;
			goto err;
		}

		for (i = 0; i < new->v.nr_blocks; i++) {
			unsigned v = stripe_blockcount_get(old, i);

			BUG_ON(v &&
			       (old->ptrs[i].dev != new->v.ptrs[i].dev ||
				old->ptrs[i].gen != new->v.ptrs[i].gen ||
				old->ptrs[i].offset != new->v.ptrs[i].offset));

			stripe_blockcount_set(&new->v, i, v);
		}
	}

	ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int ec_stripe_update_extent(struct btree_trans *trans,
				   struct bpos bucket, u8 gen,
				   struct ec_stripe_buf *s,
				   struct bpos *bp_pos)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
	struct bch_fs *c = trans->c;
	struct bch_backpointer bp;
	struct btree_iter iter;
	struct bkey_s_c k;
	const struct bch_extent_ptr *ptr_c;
	struct bch_extent_ptr *ptr, *ec_ptr = NULL;
	struct bch_extent_stripe_ptr stripe_ptr;
	struct bkey_i *n;
	int ret, dev, block;

	ret = bch2_get_next_backpointer(trans, bucket, gen,
				bp_pos, &bp, BTREE_ITER_CACHED);
	if (ret)
		return ret;
	if (bpos_eq(*bp_pos, SPOS_MAX))
		return 0;

	if (bp.level) {
		struct printbuf buf = PRINTBUF;
		struct btree_iter node_iter;
		struct btree *b;

		b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
		bch2_trans_iter_exit(trans, &node_iter);

		if (!b)
			return 0;

		prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
		bch2_backpointer_to_text(&buf, &bp);

		bch2_fs_inconsistent(c, "%s", buf.buf);
		printbuf_exit(&buf);
		return -EIO;
	}

	k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT);
	ret = bkey_err(k);
	if (ret)
		return ret;
	if (!k.k) {
		/*
		 * extent no longer exists - we could flush the btree
		 * write buffer and retry to verify, but no need:
		 */
		return 0;
	}

	if (extent_has_stripe_ptr(k, s->key.k.p.offset))
		goto out;

	ptr_c = bkey_matches_stripe(v, k, &block);
	/*
	 * It doesn't generally make sense to erasure code cached ptrs:
	 * XXX: should we be incrementing a counter?
	 */
	if (!ptr_c || ptr_c->cached)
		goto out;

	dev = v->ptrs[block].dev;

	n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
	ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		goto out;

	bkey_reassemble(n, k);

	bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
	ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
	BUG_ON(!ec_ptr);

	stripe_ptr = (struct bch_extent_stripe_ptr) {
		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
		.block		= block,
		.redundancy	= v->nr_redundant,
		.idx		= s->key.k.p.offset,
	};

	__extent_entry_insert(n,
			(union bch_extent_entry *) ec_ptr,
			(union bch_extent_entry *) &stripe_ptr);

	ret = bch2_trans_update(trans, &iter, n, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
				   unsigned block)
{
	struct bch_fs *c = trans->c;
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
	struct bch_extent_ptr bucket = v->ptrs[block];
	struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket);
	struct bpos bp_pos = POS_MIN;
	int ret = 0;

	while (1) {
		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_check_rw|
				BCH_TRANS_COMMIT_no_enospc,
			ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
						s, &bp_pos));
		if (ret)
			break;
		if (bkey_eq(bp_pos, POS_MAX))
			break;

		bp_pos = bpos_nosnap_successor(bp_pos);
	}

	return ret;
}
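
/*
 * Shape of the extent-update walk above (a sketch): for each data bucket of
 * the new stripe we iterate the bucket's backpointers, and each extent
 * found is rewritten to carry a stripe_ptr entry referencing the stripe,
 * while its pointers on other devices are dropped: the stripe's parity now
 * provides the redundancy that the extra replicas used to. Each extent
 * update is its own transaction commit, restartable via the bp_pos cursor.
 */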

static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
	int ret = 0;

	ret = bch2_btree_write_buffer_flush_sync(trans);
	if (ret)
		goto err;

	for (i = 0; i < nr_data; i++) {
		ret = ec_stripe_update_bucket(trans, s, i);
		if (ret)
			break;
	}
err:
	bch2_trans_put(trans);

	return ret;
}

static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
				       struct ec_stripe_new *s,
				       unsigned block,
				       struct open_bucket *ob)
{
	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
	unsigned offset = ca->mi.bucket_size - ob->sectors_free;
	int ret;

	if (!bch2_dev_get_ioref(ca, WRITE)) {
		s->err = -BCH_ERR_erofs_no_writes;
		return;
	}

	memset(s->new_stripe.data[block] + (offset << 9),
	       0,
	       ob->sectors_free << 9);

	ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
			ob->bucket * ca->mi.bucket_size + offset,
			ob->sectors_free,
			GFP_KERNEL, 0);

	percpu_ref_put(&ca->io_ref);

	if (ret)
		s->err = ret;
}

void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
{
	if (s->idx)
		bch2_stripe_close(c, s);
	kfree(s);
}
/*
 * All data buckets of the new stripe have been written: create the stripe
 */
static void ec_stripe_create(struct ec_stripe_new *s)
{
	struct bch_fs *c = s->c;
	struct open_bucket *ob;
	struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
	int ret;

	BUG_ON(s->h->s == s);

	closure_sync(&s->iodone);

	if (!s->err) {
		for (i = 0; i < nr_data; i++)
			if (s->blocks[i]) {
				ob = c->open_buckets + s->blocks[i];

				if (ob->sectors_free)
					zero_out_rest_of_ec_bucket(c, s, i, ob);
			}
	}

	if (s->err) {
		if (!bch2_err_matches(s->err, EROFS))
			bch_err(c, "error creating stripe: error writing data buckets");
		goto err;
	}

	if (s->have_existing_stripe) {
		ec_validate_checksums(c, &s->existing_stripe);

		if (ec_do_recov(c, &s->existing_stripe)) {
			bch_err(c, "error creating stripe: error reading existing stripe");
			goto err;
		}

		for (i = 0; i < nr_data; i++)
			if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
				swap(s->new_stripe.data[i],
				     s->existing_stripe.data[i]);

		ec_stripe_buf_exit(&s->existing_stripe);
	}

	BUG_ON(!s->allocated);
	BUG_ON(!s->idx);

	ec_generate_ec(&s->new_stripe);

	ec_generate_checksums(&s->new_stripe);

	/* write p/q: */
	for (i = nr_data; i < v->nr_blocks; i++)
		ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
	closure_sync(&s->iodone);

	if (ec_nr_failed(&s->new_stripe)) {
		bch_err(c, "error creating stripe: error writing redundancy buckets");
		goto err;
	}

	ret = bch2_trans_do(c, &s->res, NULL,
			    BCH_TRANS_COMMIT_no_check_rw|
			    BCH_TRANS_COMMIT_no_enospc,
			    ec_stripe_key_update(trans,
					bkey_i_to_stripe(&s->new_stripe.key),
					!s->have_existing_stripe));
	bch_err_msg(c, ret, "creating stripe key");
	if (ret)
		goto err;

	ret = ec_stripe_update_extents(c, &s->new_stripe);
	bch_err_msg(c, ret, "updating extents");
1432         if (ret)
1433                 goto err;
1434 err:
1435         bch2_disk_reservation_put(c, &s->res);
1436
1437         for (i = 0; i < v->nr_blocks; i++)
1438                 if (s->blocks[i]) {
1439                         ob = c->open_buckets + s->blocks[i];
1440
1441                         if (i < nr_data) {
1442                                 ob->ec = NULL;
1443                                 __bch2_open_bucket_put(c, ob);
1444                         } else {
1445                                 bch2_open_bucket_put(c, ob);
1446                         }
1447                 }
1448
1449         mutex_lock(&c->ec_stripe_new_lock);
1450         list_del(&s->list);
1451         mutex_unlock(&c->ec_stripe_new_lock);
1452         wake_up(&c->ec_stripe_new_wait);
1453
1454         ec_stripe_buf_exit(&s->existing_stripe);
1455         ec_stripe_buf_exit(&s->new_stripe);
1456         closure_debug_destroy(&s->iodone);
1457
1458         ec_stripe_new_put(c, s, STRIPE_REF_stripe);
1459 }
1460
1461 static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
1462 {
1463         struct ec_stripe_new *s;
1464
1465         mutex_lock(&c->ec_stripe_new_lock);
1466         list_for_each_entry(s, &c->ec_stripe_new_list, list)
1467                 if (!atomic_read(&s->ref[STRIPE_REF_io]))
1468                         goto out;
1469         s = NULL;
1470 out:
1471         mutex_unlock(&c->ec_stripe_new_lock);
1472
1473         return s;
1474 }
1475
static void ec_stripe_create_work(struct work_struct *work)
{
        struct bch_fs *c = container_of(work,
                struct bch_fs, ec_stripe_create_work);
        struct ec_stripe_new *s;

        while ((s = get_pending_stripe(c)))
                ec_stripe_create(s);

        bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}

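/*
 * Kick off stripe creation; if the work item was already queued, the extra
 * write ref is dropped immediately:
 */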
void bch2_ec_do_stripe_creates(struct bch_fs *c)
{
        bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);

        if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
                bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}

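/*
 * Detach a stripe from its head and move it to the pending list; dropping
 * the io ref here may make it immediately ready for creation:
 */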
static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
{
        struct ec_stripe_new *s = h->s;

        BUG_ON(!s->allocated && !s->err);

        h->s            = NULL;
        s->pending      = true;

        mutex_lock(&c->ec_stripe_new_lock);
        list_add(&s->list, &c->ec_stripe_new_list);
        mutex_unlock(&c->ec_stripe_new_lock);

        ec_stripe_new_put(c, s, STRIPE_REF_io);
}

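/* A write to an erasure coded bucket failed - mark the stripe as errored: */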
void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
        struct ec_stripe_new *s = ob->ec;

        s->err = -EIO;
}

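/*
 * Return a pointer into the stripe bounce buffer at the current write
 * offset, so data being written can also be copied into the buffer for
 * computing parity and checksums - or NULL if this write point isn't
 * erasure coded:
 */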
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
        struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
        struct bch_dev *ca;
        unsigned offset;

        if (!ob)
                return NULL;

        BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);

        ca      = bch_dev_bkey_exists(c, ob->dev);
        offset  = ca->mi.bucket_size - ob->sectors_free;

        return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}

static int unsigned_cmp(const void *_l, const void *_r)
{
        unsigned l = *((const unsigned *) _l);
        unsigned r = *((const unsigned *) _r);

        return cmp_int(l, r);
}

/* pick most common bucket size: */
static unsigned pick_blocksize(struct bch_fs *c,
                               struct bch_devs_mask *devs)
{
        unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
        struct {
                unsigned nr, size;
        } cur = { 0, 0 }, best = { 0, 0 };

        for_each_member_device_rcu(c, ca, devs)
                sizes[nr++] = ca->mi.bucket_size;

        sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);

        for (unsigned i = 0; i < nr; i++) {
                if (sizes[i] != cur.size) {
                        if (cur.nr > best.nr)
                                best = cur;

                        cur.nr = 0;
                        cur.size = sizes[i];
                }

                cur.nr++;
        }

        if (cur.nr > best.nr)
                best = cur;

        return best.size;
}

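/*
 * Policy knob for whether to create brand new stripes rather than reuse
 * existing ones - currently stubbed out so that reuse is always preferred:
 */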
static bool may_create_new_stripe(struct bch_fs *c)
{
        return false;
}

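/*
 * Initialize a new stripe key; if the checksums don't fit in the maximum
 * size bkey value, double the checksum granularity until they do:
 */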
static void ec_stripe_key_init(struct bch_fs *c,
                               struct bkey_i *k,
                               unsigned nr_data,
                               unsigned nr_parity,
                               unsigned stripe_size)
{
        struct bkey_i_stripe *s = bkey_stripe_init(k);
        unsigned u64s;

        s->v.sectors                    = cpu_to_le16(stripe_size);
        s->v.algorithm                  = 0;
        s->v.nr_blocks                  = nr_data + nr_parity;
        s->v.nr_redundant               = nr_parity;
        s->v.csum_granularity_bits      = ilog2(c->opts.encoded_extent_max >> 9);
        s->v.csum_type                  = BCH_CSUM_crc32c;
        s->v.pad                        = 0;

        while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
                BUG_ON(1 << s->v.csum_granularity_bits >=
                       le16_to_cpu(s->v.sectors) ||
                       s->v.csum_granularity_bits == U8_MAX);
                s->v.csum_granularity_bits++;
        }

        set_bkey_val_u64s(&s->k, u64s);
}

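/* Allocate a new stripe under construction and attach it to @h: */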
static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
        struct ec_stripe_new *s;

        lockdep_assert_held(&h->lock);

        s = kzalloc(sizeof(*s), GFP_KERNEL);
        if (!s)
                return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;

        mutex_init(&s->lock);
        closure_init(&s->iodone, NULL);
        atomic_set(&s->ref[STRIPE_REF_stripe], 1);
        atomic_set(&s->ref[STRIPE_REF_io], 1);
        s->c            = c;
        s->h            = h;
        s->nr_data      = min_t(unsigned, h->nr_active_devs,
                                BCH_BKEY_PTRS_MAX) - h->redundancy;
        s->nr_parity    = h->redundancy;

        ec_stripe_key_init(c, &s->new_stripe.key,
                           s->nr_data, s->nr_parity, h->blocksize);

        h->s = s;
        return 0;
}

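/*
 * Allocate a stripe head for the given target/algorithm/redundancy
 * combination: zero durability devices are excluded, and since every block
 * in a stripe must be the same size, only devices with the most common
 * bucket size count as active:
 */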
static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
                         unsigned algo, unsigned redundancy,
                         enum bch_watermark watermark)
{
        struct ec_stripe_head *h;

        h = kzalloc(sizeof(*h), GFP_KERNEL);
        if (!h)
                return NULL;

        mutex_init(&h->lock);
        BUG_ON(!mutex_trylock(&h->lock));

        h->target       = target;
        h->algo         = algo;
        h->redundancy   = redundancy;
        h->watermark    = watermark;

        rcu_read_lock();
        h->devs = target_rw_devs(c, BCH_DATA_user, target);

        for_each_member_device_rcu(c, ca, &h->devs)
                if (!ca->mi.durability)
                        __clear_bit(ca->dev_idx, h->devs.d);

        h->blocksize = pick_blocksize(c, &h->devs);

        for_each_member_device_rcu(c, ca, &h->devs)
                if (ca->mi.bucket_size == h->blocksize)
                        h->nr_active_devs++;

        rcu_read_unlock();

        /*
         * If we only have redundancy + 1 devices, we're better off with just
         * replication:
         */
        if (h->nr_active_devs < h->redundancy + 2)
                bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
                        h->nr_active_devs, h->redundancy + 2);

        list_add(&h->list, &c->ec_stripe_head_list);
        return h;
}

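/*
 * Release a stripe head: if the stripe being built has all its data blocks
 * allocated, hand it off to be created once its writes complete:
 */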
void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
{
        if (h->s &&
            h->s->allocated &&
            bitmap_weight(h->s->blocks_allocated,
                          h->s->nr_data) == h->s->nr_data)
                ec_stripe_set_pending(c, h);

        mutex_unlock(&h->lock);
}

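/*
 * Find the stripe head for the given target/algorithm/redundancy/watermark
 * combination, allocating it if necessary; returned locked:
 */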
static struct ec_stripe_head *
__bch2_ec_stripe_head_get(struct btree_trans *trans,
                          unsigned target,
                          unsigned algo,
                          unsigned redundancy,
                          enum bch_watermark watermark)
{
        struct bch_fs *c = trans->c;
        struct ec_stripe_head *h;
        int ret;

        if (!redundancy)
                return NULL;

        ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
        if (ret)
                return ERR_PTR(ret);

        if (test_bit(BCH_FS_going_ro, &c->flags)) {
                h = ERR_PTR(-BCH_ERR_erofs_no_writes);
                goto found;
        }

        list_for_each_entry(h, &c->ec_stripe_head_list, list)
                if (h->target           == target &&
                    h->algo             == algo &&
                    h->redundancy       == redundancy &&
                    h->watermark        == watermark) {
                        ret = bch2_trans_mutex_lock(trans, &h->lock);
                        if (ret)
                                h = ERR_PTR(ret);
                        goto found;
                }

        h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
found:
        if (!IS_ERR_OR_NULL(h) &&
            h->nr_active_devs < h->redundancy + 2) {
                mutex_unlock(&h->lock);
                h = NULL;
        }
        mutex_unlock(&c->ec_stripe_head_lock);
        return h;
}

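/*
 * Allocate buckets for the blocks of a stripe that don't yet have them,
 * parity first, then data, skipping devices this stripe already uses. Note
 * that buckets from a partially successful allocation are still recorded in
 * the stripe before the error is returned:
 */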
static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
                                    enum bch_watermark watermark, struct closure *cl)
{
        struct bch_fs *c = trans->c;
        struct bch_devs_mask devs = h->devs;
        struct open_bucket *ob;
        struct open_buckets buckets;
        struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
        unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
        bool have_cache = true;
        int ret = 0;

        BUG_ON(v->nr_blocks     != h->s->nr_data + h->s->nr_parity);
        BUG_ON(v->nr_redundant  != h->s->nr_parity);

        for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
                __clear_bit(v->ptrs[i].dev, devs.d);
                if (i < h->s->nr_data)
                        nr_have_data++;
                else
                        nr_have_parity++;
        }

        BUG_ON(nr_have_data     > h->s->nr_data);
        BUG_ON(nr_have_parity   > h->s->nr_parity);

        buckets.nr = 0;
        if (nr_have_parity < h->s->nr_parity) {
                ret = bch2_bucket_alloc_set_trans(trans, &buckets,
                                            &h->parity_stripe,
                                            &devs,
                                            h->s->nr_parity,
                                            &nr_have_parity,
                                            &have_cache, 0,
                                            BCH_DATA_parity,
                                            watermark,
                                            cl);

                open_bucket_for_each(c, &buckets, ob, i) {
                        j = find_next_zero_bit(h->s->blocks_gotten,
                                               h->s->nr_data + h->s->nr_parity,
                                               h->s->nr_data);
                        BUG_ON(j >= h->s->nr_data + h->s->nr_parity);

                        h->s->blocks[j] = buckets.v[i];
                        v->ptrs[j] = bch2_ob_ptr(c, ob);
                        __set_bit(j, h->s->blocks_gotten);
                }

                if (ret)
                        return ret;
        }

        buckets.nr = 0;
        if (nr_have_data < h->s->nr_data) {
                ret = bch2_bucket_alloc_set_trans(trans, &buckets,
                                            &h->block_stripe,
                                            &devs,
                                            h->s->nr_data,
                                            &nr_have_data,
                                            &have_cache, 0,
                                            BCH_DATA_user,
                                            watermark,
                                            cl);

                open_bucket_for_each(c, &buckets, ob, i) {
                        j = find_next_zero_bit(h->s->blocks_gotten,
                                               h->s->nr_data, 0);
                        BUG_ON(j >= h->s->nr_data);

                        h->s->blocks[j] = buckets.v[i];
                        v->ptrs[j] = bch2_ob_ptr(c, ob);
                        __set_bit(j, h->s->blocks_gotten);
                }

                if (ret)
                        return ret;
        }

        return 0;
}

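/*
 * Find an existing stripe with matching geometry that has empty blocks we
 * can reuse:
 */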
/* XXX: doesn't obey target: */
static s64 get_existing_stripe(struct bch_fs *c,
                               struct ec_stripe_head *head)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;
        struct stripe *m;
        size_t heap_idx;
        u64 stripe_idx;
        s64 ret = -1;

        if (may_create_new_stripe(c))
                return -1;

        mutex_lock(&c->ec_stripes_heap_lock);
        for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
                /* No blocks worth reusing, stripe will just be deleted: */
                if (!h->data[heap_idx].blocks_nonempty)
                        continue;

                stripe_idx = h->data[heap_idx].idx;

                m = genradix_ptr(&c->stripes, stripe_idx);

                if (m->algorithm        == head->algo &&
                    m->nr_redundant     == head->redundancy &&
                    m->sectors          == head->blocksize &&
                    m->blocks_nonempty  < m->nr_blocks - m->nr_redundant &&
                    bch2_try_open_stripe(c, head->s, stripe_idx)) {
                        ret = stripe_idx;
                        break;
                }
        }
        mutex_unlock(&c->ec_stripes_heap_lock);
        return ret;
}

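/*
 * Switch the stripe being built to reuse an existing stripe with empty
 * blocks: blocks that still contain data are kept, and the existing stripe
 * is read in so that parity can be recomputed:
 */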
static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
{
        struct bch_fs *c = trans->c;
        struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
        struct bch_stripe *existing_v;
        unsigned i;
        s64 idx;
        int ret;

        /*
         * If we can't allocate a new stripe, and there are no stripes with
         * empty blocks for us to reuse, that means we have to wait on copygc:
         */
        idx = get_existing_stripe(c, h);
        if (idx < 0)
                return -BCH_ERR_stripe_alloc_blocked;

        ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
        bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
                             "reading stripe key: %s", bch2_err_str(ret));
        if (ret) {
                bch2_stripe_close(c, h->s);
                return ret;
        }

        existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;

        BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
        h->s->nr_data = existing_v->nr_blocks -
                existing_v->nr_redundant;

        ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
        if (ret) {
                bch2_stripe_close(c, h->s);
                return ret;
        }

        BUG_ON(h->s->existing_stripe.size != h->blocksize);
        BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));

        /*
         * Free buckets we initially allocated - they might conflict with
         * blocks from the stripe we're reusing:
         */
        for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
                bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
                h->s->blocks[i] = 0;
        }
        memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
        memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));

        for (i = 0; i < existing_v->nr_blocks; i++) {
                if (stripe_blockcount_get(existing_v, i)) {
                        __set_bit(i, h->s->blocks_gotten);
                        __set_bit(i, h->s->blocks_allocated);
                }

                ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
        }

        bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
        h->s->have_existing_stripe = true;

        return 0;
}

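/*
 * Reserve space for a new stripe: take a disk reservation, then find an
 * empty slot in the stripes btree, scanning from where we last left off
 * (c->ec_stripe_hint) and wrapping to the start when we hit the end:
 */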
static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
{
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bpos min_pos = POS(0, 1);
        struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
        int ret;

        if (!h->s->res.sectors) {
                ret = bch2_disk_reservation_get(c, &h->s->res,
                                        h->blocksize,
                                        h->s->nr_parity,
                                        BCH_DISK_RESERVATION_NOFAIL);
                if (ret)
                        return ret;
        }

        for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
                if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
                        if (start_pos.offset) {
                                start_pos = min_pos;
                                bch2_btree_iter_set_pos(&iter, start_pos);
                                continue;
                        }

                        ret = -BCH_ERR_ENOSPC_stripe_create;
                        break;
                }

                if (bkey_deleted(k.k) &&
                    bch2_try_open_stripe(c, h->s, k.k->p.offset))
                        break;
        }

        c->ec_stripe_hint = iter.pos.offset;

        if (ret)
                goto err;

        ret = ec_stripe_mem_alloc(trans, &iter);
        if (ret) {
                bch2_stripe_close(c, h->s);
                goto err;
        }

        h->s->new_stripe.key.k.p = iter.pos;
out:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
err:
        bch2_disk_reservation_put(c, &h->s->res);
        goto out;
}

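/*
 * Get a stripe head with a stripe ready for writing to: try to allocate
 * buckets for a brand new stripe first; if that fails, fall back to reusing
 * an existing stripe with empty blocks, waiting (via @cl) for copygc to
 * free some up if there are none:
 */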
struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
                                               unsigned target,
                                               unsigned algo,
                                               unsigned redundancy,
                                               enum bch_watermark watermark,
                                               struct closure *cl)
{
        struct bch_fs *c = trans->c;
        struct ec_stripe_head *h;
        bool waiting = false;
        int ret;

        h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
        if (IS_ERR_OR_NULL(h))
                return h;

        if (!h->s) {
                ret = ec_new_stripe_alloc(c, h);
                if (ret) {
                        bch_err(c, "failed to allocate new stripe");
                        goto err;
                }
        }

        if (h->s->allocated)
                goto allocated;

        if (h->s->have_existing_stripe)
                goto alloc_existing;

        /* First, try to allocate a full stripe: */
        ret =   new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
                __bch2_ec_stripe_head_reserve(trans, h);
        if (!ret)
                goto allocate_buf;
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
            bch2_err_matches(ret, ENOMEM))
                goto err;

        /*
         * Not enough buckets available for a full stripe: we must reuse an
         * existing stripe:
         */
        while (1) {
                ret = __bch2_ec_stripe_head_reuse(trans, h);
                if (!ret)
                        break;
                if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
                        goto err;

                if (watermark == BCH_WATERMARK_copygc) {
                        ret =   new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
                                __bch2_ec_stripe_head_reserve(trans, h);
                        if (ret)
                                goto err;
                        goto allocate_buf;
                }

                /* XXX freelist_wait? */
                closure_wait(&c->freelist_wait, cl);
                waiting = true;
        }

        if (waiting)
                closure_wake_up(&c->freelist_wait);
alloc_existing:
        /*
         * Retry allocating buckets, with the watermark for this
         * particular write:
         */
        ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
        if (ret)
                goto err;

allocate_buf:
        ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
        if (ret)
                goto err;

        h->s->allocated = true;
allocated:
        BUG_ON(!h->s->idx);
        BUG_ON(!h->s->new_stripe.data[0]);
        BUG_ON(trans->restarted);
        return h;
err:
        bch2_ec_stripe_head_put(c, h);
        return ERR_PTR(ret);
}

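/*
 * Cancel stripes being built that have open buckets on @ca, or all stripes
 * being built if @ca is NULL - for stopping a device and for going
 * read-only:
 */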
static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
{
        struct ec_stripe_head *h;
        struct open_bucket *ob;
        unsigned i;

        mutex_lock(&c->ec_stripe_head_lock);
        list_for_each_entry(h, &c->ec_stripe_head_list, list) {
                mutex_lock(&h->lock);
                if (!h->s)
                        goto unlock;

                if (!ca)
                        goto found;

                for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
                        if (!h->s->blocks[i])
                                continue;

                        ob = c->open_buckets + h->s->blocks[i];
                        if (ob->dev == ca->dev_idx)
                                goto found;
                }
                goto unlock;
found:
                h->s->err = -BCH_ERR_erofs_no_writes;
                ec_stripe_set_pending(c, h);
unlock:
                mutex_unlock(&h->lock);
        }
        mutex_unlock(&c->ec_stripe_head_lock);
}

void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
        __bch2_ec_stop(c, ca);
}

void bch2_fs_ec_stop(struct bch_fs *c)
{
        __bch2_ec_stop(c, NULL);
}

static bool bch2_fs_ec_flush_done(struct bch_fs *c)
{
        bool ret;

        mutex_lock(&c->ec_stripe_new_lock);
        ret = list_empty(&c->ec_stripe_new_list);
        mutex_unlock(&c->ec_stripe_new_lock);

        return ret;
}

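/* Wait for in flight stripe creates to finish: */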
void bch2_fs_ec_flush(struct bch_fs *c)
{
        wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
}

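/*
 * Walk the stripes btree at startup to build the in-memory stripes heap,
 * counting how many blocks of each stripe still contain data:
 */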
int bch2_stripes_read(struct bch_fs *c)
{
        int ret = bch2_trans_run(c,
                for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
                                   BTREE_ITER_PREFETCH, k, ({
                        if (k.k->type != KEY_TYPE_stripe)
                                continue;

                        ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
                        if (ret)
                                break;

                        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;

                        struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
                        m->sectors      = le16_to_cpu(s->sectors);
                        m->algorithm    = s->algorithm;
                        m->nr_blocks    = s->nr_blocks;
                        m->nr_redundant = s->nr_redundant;
                        m->blocks_nonempty = 0;

                        for (unsigned i = 0; i < s->nr_blocks; i++)
                                m->blocks_nonempty += !!stripe_blockcount_get(s, i);

                        bch2_stripes_heap_insert(c, m, k.k->p.offset);
                        0;
                })));
        bch_err_fn(c, ret);
        return ret;
}

void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
{
        ec_stripes_heap *h = &c->ec_stripes_heap;
        struct stripe *m;
        size_t i;

        mutex_lock(&c->ec_stripes_heap_lock);
        for (i = 0; i < min_t(size_t, h->used, 50); i++) {
                m = genradix_ptr(&c->stripes, h->data[i].idx);

                prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
                       h->data[i].blocks_nonempty,
                       m->nr_blocks - m->nr_redundant,
                       m->nr_redundant);
                if (bch2_stripe_is_open(c, h->data[i].idx))
                        prt_str(out, " open");
                prt_newline(out);
        }
        mutex_unlock(&c->ec_stripes_heap_lock);
}

void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
{
        struct ec_stripe_head *h;
        struct ec_stripe_new *s;

        mutex_lock(&c->ec_stripe_head_lock);
        list_for_each_entry(h, &c->ec_stripe_head_list, list) {
                prt_printf(out, "target %u algo %u redundancy %u %s:\n",
                       h->target, h->algo, h->redundancy,
                       bch2_watermarks[h->watermark]);

                if (h->s)
                        prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
                               h->s->idx, h->s->nr_data, h->s->nr_parity,
                               bitmap_weight(h->s->blocks_allocated,
                                             h->s->nr_data));
        }
        mutex_unlock(&c->ec_stripe_head_lock);

        prt_printf(out, "in flight:\n");

        mutex_lock(&c->ec_stripe_new_lock);
        list_for_each_entry(s, &c->ec_stripe_new_list, list) {
                prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
                           s->idx, s->nr_data, s->nr_parity,
                           atomic_read(&s->ref[STRIPE_REF_io]),
                           atomic_read(&s->ref[STRIPE_REF_stripe]),
                           bch2_watermarks[s->h->watermark]);
        }
        mutex_unlock(&c->ec_stripe_new_lock);
}

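/* Shutdown: all stripe creation must already have been flushed: */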
void bch2_fs_ec_exit(struct bch_fs *c)
{
        struct ec_stripe_head *h;
        unsigned i;

        while (1) {
                mutex_lock(&c->ec_stripe_head_lock);
                h = list_first_entry_or_null(&c->ec_stripe_head_list,
                                             struct ec_stripe_head, list);
                if (h)
                        list_del(&h->list);
                mutex_unlock(&c->ec_stripe_head_lock);
                if (!h)
                        break;

                if (h->s) {
                        for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
                                BUG_ON(h->s->blocks[i]);

                        kfree(h->s);
                }
                kfree(h);
        }

        BUG_ON(!list_empty(&c->ec_stripe_new_list));

        free_heap(&c->ec_stripes_heap);
        genradix_free(&c->stripes);
        bioset_exit(&c->ec_bioset);
}

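/* Early init: set up the locks, lists and work items used above: */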
void bch2_fs_ec_init_early(struct bch_fs *c)
{
        spin_lock_init(&c->ec_stripes_new_lock);
        mutex_init(&c->ec_stripes_heap_lock);

        INIT_LIST_HEAD(&c->ec_stripe_head_list);
        mutex_init(&c->ec_stripe_head_lock);

        INIT_LIST_HEAD(&c->ec_stripe_new_list);
        mutex_init(&c->ec_stripe_new_lock);
        init_waitqueue_head(&c->ec_stripe_new_wait);

        INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
        INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
}

int bch2_fs_ec_init(struct bch_fs *c)
{
        return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
                           BIOSET_NEED_BVECS);
}