GNU Linux-libre 4.19.245-gnu1
[releases.git] / fs / f2fs / gc.c
1 /*
2  * fs/f2fs/gc.c
3  *
4  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5  *             http://www.samsung.com/
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  */
11 #include <linux/fs.h>
12 #include <linux/module.h>
13 #include <linux/backing-dev.h>
14 #include <linux/init.h>
15 #include <linux/f2fs_fs.h>
16 #include <linux/kthread.h>
17 #include <linux/delay.h>
18 #include <linux/freezer.h>
19
20 #include "f2fs.h"
21 #include "node.h"
22 #include "segment.h"
23 #include "gc.h"
24 #include <trace/events/f2fs.h>
25
26 static int gc_thread_func(void *data)
27 {
28         struct f2fs_sb_info *sbi = data;
29         struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
30         wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
31         unsigned int wait_ms;
32
33         wait_ms = gc_th->min_sleep_time;
34
35         set_freezable();
36         do {
37                 wait_event_interruptible_timeout(*wq,
38                                 kthread_should_stop() || freezing(current) ||
39                                 gc_th->gc_wake,
40                                 msecs_to_jiffies(wait_ms));
41
42                 /* give it a try one time */
43                 if (gc_th->gc_wake)
44                         gc_th->gc_wake = 0;
45
46                 if (try_to_freeze())
47                         continue;
48                 if (kthread_should_stop())
49                         break;
50
51                 if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
52                         increase_sleep_time(gc_th, &wait_ms);
53                         continue;
54                 }
55
56                 if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
57                         f2fs_show_injection_info(FAULT_CHECKPOINT);
58                         f2fs_stop_checkpoint(sbi, false);
59                 }
60
61                 if (!sb_start_write_trylock(sbi->sb))
62                         continue;
63
64                 /*
65                  * [GC triggering condition]
66                  * 0. GC is not conducted currently.
67                  * 1. There are enough dirty segments.
68                  * 2. IO subsystem is idle by checking the # of writeback pages.
69                  * 3. IO subsystem is idle by checking the # of requests in
70                  *    bdev's request list.
71                  *
72                  * Note) We have to avoid triggering GCs frequently.
73                  * Because it is possible that some segments can be
74                  * invalidated soon after by user update or deletion.
75                  * So, I'd like to wait some time to collect dirty segments.
76                  */
77                 if (sbi->gc_mode == GC_URGENT) {
78                         wait_ms = gc_th->urgent_sleep_time;
79                         mutex_lock(&sbi->gc_mutex);
80                         goto do_gc;
81                 }
82
83                 if (!mutex_trylock(&sbi->gc_mutex))
84                         goto next;
85
86                 if (!is_idle(sbi)) {
87                         increase_sleep_time(gc_th, &wait_ms);
88                         mutex_unlock(&sbi->gc_mutex);
89                         goto next;
90                 }
91
92                 if (has_enough_invalid_blocks(sbi))
93                         decrease_sleep_time(gc_th, &wait_ms);
94                 else
95                         increase_sleep_time(gc_th, &wait_ms);
96 do_gc:
97                 stat_inc_bggc_count(sbi);
98
99                 /* if return value is not zero, no victim was selected */
100                 if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
101                         wait_ms = gc_th->no_gc_sleep_time;
102
103                 trace_f2fs_background_gc(sbi->sb, wait_ms,
104                                 prefree_segments(sbi), free_segments(sbi));
105
106                 /* balancing f2fs's metadata periodically */
107                 f2fs_balance_fs_bg(sbi);
108 next:
109                 sb_end_write(sbi->sb);
110
111         } while (!kthread_should_stop());
112         return 0;
113 }
114
115 int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
116 {
117         struct f2fs_gc_kthread *gc_th;
118         dev_t dev = sbi->sb->s_bdev->bd_dev;
119         int err = 0;
120
121         gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
122         if (!gc_th) {
123                 err = -ENOMEM;
124                 goto out;
125         }
126
127         gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
128         gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
129         gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
130         gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
131
132         gc_th->gc_wake= 0;
133
134         sbi->gc_thread = gc_th;
135         init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
136         sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
137                         "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
138         if (IS_ERR(gc_th->f2fs_gc_task)) {
139                 err = PTR_ERR(gc_th->f2fs_gc_task);
140                 kfree(gc_th);
141                 sbi->gc_thread = NULL;
142         }
143 out:
144         return err;
145 }
146
147 void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
148 {
149         struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
150         if (!gc_th)
151                 return;
152         kthread_stop(gc_th->f2fs_gc_task);
153         kfree(gc_th);
154         sbi->gc_thread = NULL;
155 }
156
157 static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
158 {
159         int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
160
161         switch (sbi->gc_mode) {
162         case GC_IDLE_CB:
163                 gc_mode = GC_CB;
164                 break;
165         case GC_IDLE_GREEDY:
166         case GC_URGENT:
167                 gc_mode = GC_GREEDY;
168                 break;
169         }
170         return gc_mode;
171 }
172
173 static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
174                         int type, struct victim_sel_policy *p)
175 {
176         struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
177
178         if (p->alloc_mode == SSR) {
179                 p->gc_mode = GC_GREEDY;
180                 p->dirty_segmap = dirty_i->dirty_segmap[type];
181                 p->max_search = dirty_i->nr_dirty[type];
182                 p->ofs_unit = 1;
183         } else {
184                 p->gc_mode = select_gc_type(sbi, gc_type);
185                 p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
186                 p->max_search = dirty_i->nr_dirty[DIRTY];
187                 p->ofs_unit = sbi->segs_per_sec;
188         }
189
190         /* we need to check every dirty segments in the FG_GC case */
191         if (gc_type != FG_GC &&
192                         (sbi->gc_mode != GC_URGENT) &&
193                         p->max_search > sbi->max_victim_search)
194                 p->max_search = sbi->max_victim_search;
195
196         /* let's select beginning hot/small space first in no_heap mode*/
197         if (test_opt(sbi, NOHEAP) &&
198                 (type == CURSEG_HOT_DATA || IS_NODESEG(type)))
199                 p->offset = 0;
200         else
201                 p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
202 }
203
204 static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
205                                 struct victim_sel_policy *p)
206 {
207         /* SSR allocates in a segment unit */
208         if (p->alloc_mode == SSR)
209                 return sbi->blocks_per_seg;
210         if (p->gc_mode == GC_GREEDY)
211                 return 2 * sbi->blocks_per_seg * p->ofs_unit;
212         else if (p->gc_mode == GC_CB)
213                 return UINT_MAX;
214         else /* No other gc_mode */
215                 return 0;
216 }
217
218 static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
219 {
220         struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
221         unsigned int secno;
222
223         /*
224          * If the gc_type is FG_GC, we can select victim segments
225          * selected by background GC before.
226          * Those segments guarantee they have small valid blocks.
227          */
228         for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
229                 if (sec_usage_check(sbi, secno))
230                         continue;
231                 clear_bit(secno, dirty_i->victim_secmap);
232                 return GET_SEG_FROM_SEC(sbi, secno);
233         }
234         return NULL_SEGNO;
235 }
236
237 static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
238 {
239         struct sit_info *sit_i = SIT_I(sbi);
240         unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
241         unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
242         unsigned long long mtime = 0;
243         unsigned int vblocks;
244         unsigned char age = 0;
245         unsigned char u;
246         unsigned int i;
247
248         for (i = 0; i < sbi->segs_per_sec; i++)
249                 mtime += get_seg_entry(sbi, start + i)->mtime;
250         vblocks = get_valid_blocks(sbi, segno, true);
251
252         mtime = div_u64(mtime, sbi->segs_per_sec);
253         vblocks = div_u64(vblocks, sbi->segs_per_sec);
254
255         u = (vblocks * 100) >> sbi->log_blocks_per_seg;
256
257         /* Handle if the system time has changed by the user */
258         if (mtime < sit_i->min_mtime)
259                 sit_i->min_mtime = mtime;
260         if (mtime > sit_i->max_mtime)
261                 sit_i->max_mtime = mtime;
262         if (sit_i->max_mtime != sit_i->min_mtime)
263                 age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
264                                 sit_i->max_mtime - sit_i->min_mtime);
265
266         return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
267 }
268
269 static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
270                         unsigned int segno, struct victim_sel_policy *p)
271 {
272         if (p->alloc_mode == SSR)
273                 return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
274
275         /* alloc_mode == LFS */
276         if (p->gc_mode == GC_GREEDY)
277                 return get_valid_blocks(sbi, segno, true);
278         else
279                 return get_cb_cost(sbi, segno);
280 }
281
/*
 * Count the set bits of @addr in the bit range [@offset, @offset + @len).
 */
static unsigned int count_bits(const unsigned long *addr,
				unsigned int offset, unsigned int len)
{
	unsigned int end = offset + len;
	unsigned int nset = 0;
	unsigned int pos;

	for (pos = offset; pos < end; pos++) {
		if (test_bit(pos, addr))
			nset++;
	}
	return nset;
}
293
294 /*
295  * This function is called from two paths.
296  * One is garbage collection and the other is SSR segment selection.
297  * When it is called during GC, it just gets a victim segment
298  * and it does not remove it from dirty seglist.
299  * When it is called from SSR segment selection, it finds a segment
300  * which has minimum valid blocks and removes it from dirty seglist.
301  */
302 static int get_victim_by_default(struct f2fs_sb_info *sbi,
303                 unsigned int *result, int gc_type, int type, char alloc_mode)
304 {
305         struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
306         struct sit_info *sm = SIT_I(sbi);
307         struct victim_sel_policy p;
308         unsigned int secno, last_victim;
309         unsigned int last_segment = MAIN_SEGS(sbi);
310         unsigned int nsearched = 0;
311
312         mutex_lock(&dirty_i->seglist_lock);
313
314         p.alloc_mode = alloc_mode;
315         select_policy(sbi, gc_type, type, &p);
316
317         p.min_segno = NULL_SEGNO;
318         p.min_cost = get_max_cost(sbi, &p);
319
320         if (*result != NULL_SEGNO) {
321                 if (get_valid_blocks(sbi, *result, false) &&
322                         !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
323                         p.min_segno = *result;
324                 goto out;
325         }
326
327         if (p.max_search == 0)
328                 goto out;
329
330         last_victim = sm->last_victim[p.gc_mode];
331         if (p.alloc_mode == LFS && gc_type == FG_GC) {
332                 p.min_segno = check_bg_victims(sbi);
333                 if (p.min_segno != NULL_SEGNO)
334                         goto got_it;
335         }
336
337         while (1) {
338                 unsigned long cost;
339                 unsigned int segno;
340
341                 segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
342                 if (segno >= last_segment) {
343                         if (sm->last_victim[p.gc_mode]) {
344                                 last_segment =
345                                         sm->last_victim[p.gc_mode];
346                                 sm->last_victim[p.gc_mode] = 0;
347                                 p.offset = 0;
348                                 continue;
349                         }
350                         break;
351                 }
352
353                 p.offset = segno + p.ofs_unit;
354                 if (p.ofs_unit > 1) {
355                         p.offset -= segno % p.ofs_unit;
356                         nsearched += count_bits(p.dirty_segmap,
357                                                 p.offset - p.ofs_unit,
358                                                 p.ofs_unit);
359                 } else {
360                         nsearched++;
361                 }
362
363                 secno = GET_SEC_FROM_SEG(sbi, segno);
364
365                 if (sec_usage_check(sbi, secno))
366                         goto next;
367                 if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
368                         goto next;
369
370                 cost = get_gc_cost(sbi, segno, &p);
371
372                 if (p.min_cost > cost) {
373                         p.min_segno = segno;
374                         p.min_cost = cost;
375                 }
376 next:
377                 if (nsearched >= p.max_search) {
378                         if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
379                                 sm->last_victim[p.gc_mode] = last_victim + 1;
380                         else
381                                 sm->last_victim[p.gc_mode] = segno + 1;
382                         sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi);
383                         break;
384                 }
385         }
386         if (p.min_segno != NULL_SEGNO) {
387 got_it:
388                 if (p.alloc_mode == LFS) {
389                         secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
390                         if (gc_type == FG_GC)
391                                 sbi->cur_victim_sec = secno;
392                         else
393                                 set_bit(secno, dirty_i->victim_secmap);
394                 }
395                 *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
396
397                 trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
398                                 sbi->cur_victim_sec,
399                                 prefree_segments(sbi), free_segments(sbi));
400         }
401 out:
402         mutex_unlock(&dirty_i->seglist_lock);
403
404         return (p.min_segno == NULL_SEGNO) ? 0 : 1;
405 }
406
/* Victim selection operations whose get_victim is get_victim_by_default(). */
static const struct victim_selection default_v_ops = {
	.get_victim = get_victim_by_default,
};
410
411 static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
412 {
413         struct inode_entry *ie;
414
415         ie = radix_tree_lookup(&gc_list->iroot, ino);
416         if (ie)
417                 return ie->inode;
418         return NULL;
419 }
420
421 static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
422 {
423         struct inode_entry *new_ie;
424
425         if (inode == find_gc_inode(gc_list, inode->i_ino)) {
426                 iput(inode);
427                 return;
428         }
429         new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS);
430         new_ie->inode = inode;
431
432         f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
433         list_add_tail(&new_ie->list, &gc_list->ilist);
434 }
435
436 static void put_gc_inode(struct gc_inode_list *gc_list)
437 {
438         struct inode_entry *ie, *next_ie;
439         list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
440                 radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
441                 iput(ie->inode);
442                 list_del(&ie->list);
443                 kmem_cache_free(f2fs_inode_entry_slab, ie);
444         }
445 }
446
447 static int check_valid_map(struct f2fs_sb_info *sbi,
448                                 unsigned int segno, int offset)
449 {
450         struct sit_info *sit_i = SIT_I(sbi);
451         struct seg_entry *sentry;
452         int ret;
453
454         down_read(&sit_i->sentry_lock);
455         sentry = get_seg_entry(sbi, segno);
456         ret = f2fs_test_bit(offset, sentry->cur_valid_map);
457         up_read(&sit_i->sentry_lock);
458         return ret;
459 }
460
461 /*
462  * This function compares node address got in summary with that in NAT.
463  * On validity, copy that node with cold status, otherwise (invalid node)
464  * ignore that.
465  */
466 static void gc_node_segment(struct f2fs_sb_info *sbi,
467                 struct f2fs_summary *sum, unsigned int segno, int gc_type)
468 {
469         struct f2fs_summary *entry;
470         block_t start_addr;
471         int off;
472         int phase = 0;
473         bool fggc = (gc_type == FG_GC);
474
475         start_addr = START_BLOCK(sbi, segno);
476
477 next_step:
478         entry = sum;
479
480         if (fggc && phase == 2)
481                 atomic_inc(&sbi->wb_sync_req[NODE]);
482
483         for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
484                 nid_t nid = le32_to_cpu(entry->nid);
485                 struct page *node_page;
486                 struct node_info ni;
487
488                 /* stop BG_GC if there is not enough free sections. */
489                 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
490                         return;
491
492                 if (check_valid_map(sbi, segno, off) == 0)
493                         continue;
494
495                 if (phase == 0) {
496                         f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
497                                                         META_NAT, true);
498                         continue;
499                 }
500
501                 if (phase == 1) {
502                         f2fs_ra_node_page(sbi, nid);
503                         continue;
504                 }
505
506                 /* phase == 2 */
507                 node_page = f2fs_get_node_page(sbi, nid);
508                 if (IS_ERR(node_page))
509                         continue;
510
511                 /* block may become invalid during f2fs_get_node_page */
512                 if (check_valid_map(sbi, segno, off) == 0) {
513                         f2fs_put_page(node_page, 1);
514                         continue;
515                 }
516
517                 if (f2fs_get_node_info(sbi, nid, &ni)) {
518                         f2fs_put_page(node_page, 1);
519                         continue;
520                 }
521
522                 if (ni.blk_addr != start_addr + off) {
523                         f2fs_put_page(node_page, 1);
524                         continue;
525                 }
526
527                 f2fs_move_node_page(node_page, gc_type);
528                 stat_inc_node_blk_count(sbi, 1, gc_type);
529         }
530
531         if (++phase < 3)
532                 goto next_step;
533
534         if (fggc)
535                 atomic_dec(&sbi->wb_sync_req[NODE]);
536 }
537
538 /*
539  * Calculate start block index indicating the given node offset.
540  * Be careful, caller should give this node offset only indicating direct node
541  * blocks. If any node offsets, which point the other types of node blocks such
542  * as indirect or double indirect node blocks, are given, it must be a caller's
543  * bug.
544  */
545 block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
546 {
547         unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
548         unsigned int bidx;
549
550         if (node_ofs == 0)
551                 return 0;
552
553         if (node_ofs <= 2) {
554                 bidx = node_ofs - 1;
555         } else if (node_ofs <= indirect_blks) {
556                 int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
557                 bidx = node_ofs - 2 - dec;
558         } else {
559                 int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
560                 bidx = node_ofs - 5 - dec;
561         }
562         return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode);
563 }
564
565 static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
566                 struct node_info *dni, block_t blkaddr, unsigned int *nofs)
567 {
568         struct page *node_page;
569         nid_t nid;
570         unsigned int ofs_in_node;
571         block_t source_blkaddr;
572
573         nid = le32_to_cpu(sum->nid);
574         ofs_in_node = le16_to_cpu(sum->ofs_in_node);
575
576         node_page = f2fs_get_node_page(sbi, nid);
577         if (IS_ERR(node_page))
578                 return false;
579
580         if (f2fs_get_node_info(sbi, nid, dni)) {
581                 f2fs_put_page(node_page, 1);
582                 return false;
583         }
584
585         if (sum->version != dni->version) {
586                 f2fs_msg(sbi->sb, KERN_WARNING,
587                                 "%s: valid data with mismatched node version.",
588                                 __func__);
589                 set_sbi_flag(sbi, SBI_NEED_FSCK);
590         }
591
592         if (f2fs_check_nid_range(sbi, dni->ino)) {
593                 f2fs_put_page(node_page, 1);
594                 return false;
595         }
596
597         *nofs = ofs_of_node(node_page);
598         source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
599         f2fs_put_page(node_page, 1);
600
601         if (source_blkaddr != blkaddr)
602                 return false;
603         return true;
604 }
605
606 static int ra_data_block(struct inode *inode, pgoff_t index)
607 {
608         struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
609         struct address_space *mapping = inode->i_mapping;
610         struct dnode_of_data dn;
611         struct page *page;
612         struct extent_info ei = {0, 0, 0};
613         struct f2fs_io_info fio = {
614                 .sbi = sbi,
615                 .ino = inode->i_ino,
616                 .type = DATA,
617                 .temp = COLD,
618                 .op = REQ_OP_READ,
619                 .op_flags = 0,
620                 .encrypted_page = NULL,
621                 .in_list = false,
622                 .retry = false,
623         };
624         int err;
625
626         page = f2fs_grab_cache_page(mapping, index, true);
627         if (!page)
628                 return -ENOMEM;
629
630         if (f2fs_lookup_extent_cache(inode, index, &ei)) {
631                 dn.data_blkaddr = ei.blk + index - ei.fofs;
632                 goto got_it;
633         }
634
635         set_new_dnode(&dn, inode, NULL, NULL, 0);
636         err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
637         if (err)
638                 goto put_page;
639         f2fs_put_dnode(&dn);
640
641         if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
642                                                 DATA_GENERIC))) {
643                 err = -EFSCORRUPTED;
644                 goto put_page;
645         }
646 got_it:
647         /* read page */
648         fio.page = page;
649         fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
650
651         fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi),
652                                         dn.data_blkaddr,
653                                         FGP_LOCK | FGP_CREAT, GFP_NOFS);
654         if (!fio.encrypted_page) {
655                 err = -ENOMEM;
656                 goto put_page;
657         }
658
659         err = f2fs_submit_page_bio(&fio);
660         if (err)
661                 goto put_encrypted_page;
662         f2fs_put_page(fio.encrypted_page, 0);
663         f2fs_put_page(page, 1);
664         return 0;
665 put_encrypted_page:
666         f2fs_put_page(fio.encrypted_page, 1);
667 put_page:
668         f2fs_put_page(page, 1);
669         return err;
670 }
671
672 /*
673  * Move data block via META_MAPPING while keeping locked data page.
674  * This can be used to move blocks, aka LBAs, directly on disk.
675  */
676 static void move_data_block(struct inode *inode, block_t bidx,
677                                 int gc_type, unsigned int segno, int off)
678 {
679         struct f2fs_io_info fio = {
680                 .sbi = F2FS_I_SB(inode),
681                 .ino = inode->i_ino,
682                 .type = DATA,
683                 .temp = COLD,
684                 .op = REQ_OP_READ,
685                 .op_flags = 0,
686                 .encrypted_page = NULL,
687                 .in_list = false,
688                 .retry = false,
689         };
690         struct dnode_of_data dn;
691         struct f2fs_summary sum;
692         struct node_info ni;
693         struct page *page, *mpage;
694         block_t newaddr;
695         int err;
696         bool lfs_mode = test_opt(fio.sbi, LFS);
697
698         /* do not read out */
699         page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
700         if (!page)
701                 return;
702
703         if (!check_valid_map(F2FS_I_SB(inode), segno, off))
704                 goto out;
705
706         if (f2fs_is_atomic_file(inode)) {
707                 F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
708                 F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
709                 goto out;
710         }
711
712         if (f2fs_is_pinned_file(inode)) {
713                 f2fs_pin_file_control(inode, true);
714                 goto out;
715         }
716
717         set_new_dnode(&dn, inode, NULL, NULL, 0);
718         err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
719         if (err)
720                 goto out;
721
722         if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
723                 ClearPageUptodate(page);
724                 goto put_out;
725         }
726
727         /*
728          * don't cache encrypted data into meta inode until previous dirty
729          * data were writebacked to avoid racing between GC and flush.
730          */
731         f2fs_wait_on_page_writeback(page, DATA, true);
732
733         err = f2fs_get_node_info(fio.sbi, dn.nid, &ni);
734         if (err)
735                 goto put_out;
736
737         set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
738
739         /* read page */
740         fio.page = page;
741         fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
742
743         if (lfs_mode)
744                 down_write(&fio.sbi->io_order_lock);
745
746         f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
747                                         &sum, CURSEG_COLD_DATA, NULL, false);
748
749         fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
750                                 newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
751         if (!fio.encrypted_page) {
752                 err = -ENOMEM;
753                 goto recover_block;
754         }
755
756         mpage = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
757                                         fio.old_blkaddr, FGP_LOCK, GFP_NOFS);
758         if (mpage) {
759                 bool updated = false;
760
761                 if (PageUptodate(mpage)) {
762                         memcpy(page_address(fio.encrypted_page),
763                                         page_address(mpage), PAGE_SIZE);
764                         updated = true;
765                 }
766                 f2fs_put_page(mpage, 1);
767                 invalidate_mapping_pages(META_MAPPING(fio.sbi),
768                                         fio.old_blkaddr, fio.old_blkaddr);
769                 if (updated)
770                         goto write_page;
771         }
772
773         err = f2fs_submit_page_bio(&fio);
774         if (err)
775                 goto put_page_out;
776
777         /* write page */
778         lock_page(fio.encrypted_page);
779
780         if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
781                 err = -EIO;
782                 goto put_page_out;
783         }
784         if (unlikely(!PageUptodate(fio.encrypted_page))) {
785                 err = -EIO;
786                 goto put_page_out;
787         }
788
789 write_page:
790         set_page_dirty(fio.encrypted_page);
791         f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true);
792         if (clear_page_dirty_for_io(fio.encrypted_page))
793                 dec_page_count(fio.sbi, F2FS_DIRTY_META);
794
795         set_page_writeback(fio.encrypted_page);
796         ClearPageError(page);
797
798         /* allocate block address */
799         f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
800
801         fio.op = REQ_OP_WRITE;
802         fio.op_flags = REQ_SYNC;
803         fio.new_blkaddr = newaddr;
804         f2fs_submit_page_write(&fio);
805         if (fio.retry) {
806                 if (PageWriteback(fio.encrypted_page))
807                         end_page_writeback(fio.encrypted_page);
808                 goto put_page_out;
809         }
810
811         f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE);
812
813         f2fs_update_data_blkaddr(&dn, newaddr);
814         set_inode_flag(inode, FI_APPEND_WRITE);
815         if (page->index == 0)
816                 set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
817 put_page_out:
818         f2fs_put_page(fio.encrypted_page, 1);
819 recover_block:
820         if (lfs_mode)
821                 up_write(&fio.sbi->io_order_lock);
822         if (err)
823                 f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
824                                                                 true, true);
825 put_out:
826         f2fs_put_dnode(&dn);
827 out:
828         f2fs_put_page(page, 1);
829 }
830
831 static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
832                                                         unsigned int segno, int off)
833 {
834         struct page *page;
835
836         page = f2fs_get_lock_data_page(inode, bidx, true);
837         if (IS_ERR(page))
838                 return;
839
840         if (!check_valid_map(F2FS_I_SB(inode), segno, off))
841                 goto out;
842
843         if (f2fs_is_atomic_file(inode)) {
844                 F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
845                 F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
846                 goto out;
847         }
848         if (f2fs_is_pinned_file(inode)) {
849                 if (gc_type == FG_GC)
850                         f2fs_pin_file_control(inode, true);
851                 goto out;
852         }
853
854         if (gc_type == BG_GC) {
855                 if (PageWriteback(page))
856                         goto out;
857                 set_page_dirty(page);
858                 set_cold_data(page);
859         } else {
860                 struct f2fs_io_info fio = {
861                         .sbi = F2FS_I_SB(inode),
862                         .ino = inode->i_ino,
863                         .type = DATA,
864                         .temp = COLD,
865                         .op = REQ_OP_WRITE,
866                         .op_flags = REQ_SYNC,
867                         .old_blkaddr = NULL_ADDR,
868                         .page = page,
869                         .encrypted_page = NULL,
870                         .need_lock = LOCK_REQ,
871                         .io_type = FS_GC_DATA_IO,
872                 };
873                 bool is_dirty = PageDirty(page);
874                 int err;
875
876 retry:
877                 set_page_dirty(page);
878                 f2fs_wait_on_page_writeback(page, DATA, true);
879                 if (clear_page_dirty_for_io(page)) {
880                         inode_dec_dirty_pages(inode);
881                         f2fs_remove_dirty_inode(inode);
882                 }
883
884                 set_cold_data(page);
885
886                 err = f2fs_do_write_data_page(&fio);
887                 if (err) {
888                         clear_cold_data(page);
889                         if (err == -ENOMEM) {
890                                 congestion_wait(BLK_RW_ASYNC, HZ/50);
891                                 goto retry;
892                         }
893                         if (is_dirty)
894                                 set_page_dirty(page);
895                 }
896         }
897 out:
898         f2fs_put_page(page, 1);
899 }
900
901 /*
902  * This function tries to get parent node of victim data block, and identifies
903  * data block validity. If the block is valid, copy that with cold status and
904  * modify parent node.
905  * If the parent node is not valid or the data block address is different,
906  * the victim data block is ignored.
907  */
908 static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
909                 struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
910 {
911         struct super_block *sb = sbi->sb;
912         struct f2fs_summary *entry;
913         block_t start_addr;
914         int off;
915         int phase = 0;
916
917         start_addr = START_BLOCK(sbi, segno);
918
919 next_step:
920         entry = sum;
921
922         for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
923                 struct page *data_page;
924                 struct inode *inode;
925                 struct node_info dni; /* dnode info for the data */
926                 unsigned int ofs_in_node, nofs;
927                 block_t start_bidx;
928                 nid_t nid = le32_to_cpu(entry->nid);
929
930                 /* stop BG_GC if there is not enough free sections. */
931                 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
932                         return;
933
934                 if (check_valid_map(sbi, segno, off) == 0)
935                         continue;
936
937                 if (phase == 0) {
938                         f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
939                                                         META_NAT, true);
940                         continue;
941                 }
942
943                 if (phase == 1) {
944                         f2fs_ra_node_page(sbi, nid);
945                         continue;
946                 }
947
948                 /* Get an inode by ino with checking validity */
949                 if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
950                         continue;
951
952                 if (phase == 2) {
953                         f2fs_ra_node_page(sbi, dni.ino);
954                         continue;
955                 }
956
957                 ofs_in_node = le16_to_cpu(entry->ofs_in_node);
958
959                 if (phase == 3) {
960                         inode = f2fs_iget(sb, dni.ino);
961                         if (IS_ERR(inode) || is_bad_inode(inode))
962                                 continue;
963
964                         if (!down_write_trylock(
965                                 &F2FS_I(inode)->i_gc_rwsem[WRITE])) {
966                                 iput(inode);
967                                 sbi->skipped_gc_rwsem++;
968                                 continue;
969                         }
970
971                         start_bidx = f2fs_start_bidx_of_node(nofs, inode) +
972                                                                 ofs_in_node;
973
974                         if (f2fs_post_read_required(inode)) {
975                                 int err = ra_data_block(inode, start_bidx);
976
977                                 up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
978                                 if (err) {
979                                         iput(inode);
980                                         continue;
981                                 }
982                                 add_gc_inode(gc_list, inode);
983                                 continue;
984                         }
985
986                         data_page = f2fs_get_read_data_page(inode,
987                                                 start_bidx, REQ_RAHEAD, true);
988                         up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
989                         if (IS_ERR(data_page)) {
990                                 iput(inode);
991                                 continue;
992                         }
993
994                         f2fs_put_page(data_page, 0);
995                         add_gc_inode(gc_list, inode);
996                         continue;
997                 }
998
999                 /* phase 4 */
1000                 inode = find_gc_inode(gc_list, dni.ino);
1001                 if (inode) {
1002                         struct f2fs_inode_info *fi = F2FS_I(inode);
1003                         bool locked = false;
1004
1005                         if (S_ISREG(inode->i_mode)) {
1006                                 if (!down_write_trylock(&fi->i_gc_rwsem[READ])) {
1007                                         sbi->skipped_gc_rwsem++;
1008                                         continue;
1009                                 }
1010                                 if (!down_write_trylock(
1011                                                 &fi->i_gc_rwsem[WRITE])) {
1012                                         sbi->skipped_gc_rwsem++;
1013                                         up_write(&fi->i_gc_rwsem[READ]);
1014                                         continue;
1015                                 }
1016                                 locked = true;
1017
1018                                 /* wait for all inflight aio data */
1019                                 inode_dio_wait(inode);
1020                         }
1021
1022                         start_bidx = f2fs_start_bidx_of_node(nofs, inode)
1023                                                                 + ofs_in_node;
1024                         if (f2fs_post_read_required(inode))
1025                                 move_data_block(inode, start_bidx, gc_type,
1026                                                                 segno, off);
1027                         else
1028                                 move_data_page(inode, start_bidx, gc_type,
1029                                                                 segno, off);
1030
1031                         if (locked) {
1032                                 up_write(&fi->i_gc_rwsem[WRITE]);
1033                                 up_write(&fi->i_gc_rwsem[READ]);
1034                         }
1035
1036                         stat_inc_data_blk_count(sbi, 1, gc_type);
1037                 }
1038         }
1039
1040         if (++phase < 5)
1041                 goto next_step;
1042 }
1043
1044 static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
1045                         int gc_type)
1046 {
1047         struct sit_info *sit_i = SIT_I(sbi);
1048         int ret;
1049
1050         down_write(&sit_i->sentry_lock);
1051         ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
1052                                               NO_CHECK_TYPE, LFS);
1053         up_write(&sit_i->sentry_lock);
1054         return ret;
1055 }
1056
1057 static int do_garbage_collect(struct f2fs_sb_info *sbi,
1058                                 unsigned int start_segno,
1059                                 struct gc_inode_list *gc_list, int gc_type)
1060 {
1061         struct page *sum_page;
1062         struct f2fs_summary_block *sum;
1063         struct blk_plug plug;
1064         unsigned int segno = start_segno;
1065         unsigned int end_segno = start_segno + sbi->segs_per_sec;
1066         int seg_freed = 0;
1067         unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
1068                                                 SUM_TYPE_DATA : SUM_TYPE_NODE;
1069
1070         /* readahead multi ssa blocks those have contiguous address */
1071         if (sbi->segs_per_sec > 1)
1072                 f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
1073                                         sbi->segs_per_sec, META_SSA, true);
1074
1075         /* reference all summary page */
1076         while (segno < end_segno) {
1077                 sum_page = f2fs_get_sum_page(sbi, segno++);
1078                 unlock_page(sum_page);
1079         }
1080
1081         blk_start_plug(&plug);
1082
1083         for (segno = start_segno; segno < end_segno; segno++) {
1084
1085                 /* find segment summary of victim */
1086                 sum_page = find_get_page(META_MAPPING(sbi),
1087                                         GET_SUM_BLOCK(sbi, segno));
1088                 f2fs_put_page(sum_page, 0);
1089
1090                 if (get_valid_blocks(sbi, segno, false) == 0)
1091                         goto freed;
1092                 if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))
1093                         goto next;
1094
1095                 sum = page_address(sum_page);
1096                 if (type != GET_SUM_TYPE((&sum->footer))) {
1097                         f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent segment (%u) "
1098                                 "type [%d, %d] in SSA and SIT",
1099                                 segno, type, GET_SUM_TYPE((&sum->footer)));
1100                         set_sbi_flag(sbi, SBI_NEED_FSCK);
1101                         goto next;
1102                 }
1103
1104                 /*
1105                  * this is to avoid deadlock:
1106                  * - lock_page(sum_page)         - f2fs_replace_block
1107                  *  - check_valid_map()            - down_write(sentry_lock)
1108                  *   - down_read(sentry_lock)     - change_curseg()
1109                  *                                  - lock_page(sum_page)
1110                  */
1111                 if (type == SUM_TYPE_NODE)
1112                         gc_node_segment(sbi, sum->entries, segno, gc_type);
1113                 else
1114                         gc_data_segment(sbi, sum->entries, gc_list, segno,
1115                                                                 gc_type);
1116
1117                 stat_inc_seg_count(sbi, type, gc_type);
1118
1119 freed:
1120                 if (gc_type == FG_GC &&
1121                                 get_valid_blocks(sbi, segno, false) == 0)
1122                         seg_freed++;
1123 next:
1124                 f2fs_put_page(sum_page, 0);
1125         }
1126
1127         if (gc_type == FG_GC)
1128                 f2fs_submit_merged_write(sbi,
1129                                 (type == SUM_TYPE_NODE) ? NODE : DATA);
1130
1131         blk_finish_plug(&plug);
1132
1133         stat_inc_call_count(sbi->stat_info);
1134
1135         return seg_freed;
1136 }
1137
1138 int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
1139                         bool background, unsigned int segno)
1140 {
1141         int gc_type = sync ? FG_GC : BG_GC;
1142         int sec_freed = 0, seg_freed = 0, total_freed = 0;
1143         int ret = 0;
1144         struct cp_control cpc;
1145         unsigned int init_segno = segno;
1146         struct gc_inode_list gc_list = {
1147                 .ilist = LIST_HEAD_INIT(gc_list.ilist),
1148                 .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
1149         };
1150         unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC];
1151         unsigned long long first_skipped;
1152         unsigned int skipped_round = 0, round = 0;
1153
1154         trace_f2fs_gc_begin(sbi->sb, sync, background,
1155                                 get_pages(sbi, F2FS_DIRTY_NODES),
1156                                 get_pages(sbi, F2FS_DIRTY_DENTS),
1157                                 get_pages(sbi, F2FS_DIRTY_IMETA),
1158                                 free_sections(sbi),
1159                                 free_segments(sbi),
1160                                 reserved_segments(sbi),
1161                                 prefree_segments(sbi));
1162
1163         cpc.reason = __get_cp_reason(sbi);
1164         sbi->skipped_gc_rwsem = 0;
1165         first_skipped = last_skipped;
1166 gc_more:
1167         if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
1168                 ret = -EINVAL;
1169                 goto stop;
1170         }
1171         if (unlikely(f2fs_cp_error(sbi))) {
1172                 ret = -EIO;
1173                 goto stop;
1174         }
1175
1176         if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) {
1177                 /*
1178                  * For example, if there are many prefree_segments below given
1179                  * threshold, we can make them free by checkpoint. Then, we
1180                  * secure free segments which doesn't need fggc any more.
1181                  */
1182                 if (prefree_segments(sbi)) {
1183                         ret = f2fs_write_checkpoint(sbi, &cpc);
1184                         if (ret)
1185                                 goto stop;
1186                 }
1187                 if (has_not_enough_free_secs(sbi, 0, 0))
1188                         gc_type = FG_GC;
1189         }
1190
1191         /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
1192         if (gc_type == BG_GC && !background) {
1193                 ret = -EINVAL;
1194                 goto stop;
1195         }
1196         if (!__get_victim(sbi, &segno, gc_type)) {
1197                 ret = -ENODATA;
1198                 goto stop;
1199         }
1200
1201         seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
1202         if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
1203                 sec_freed++;
1204         total_freed += seg_freed;
1205
1206         if (gc_type == FG_GC) {
1207                 if (sbi->skipped_atomic_files[FG_GC] > last_skipped ||
1208                                                 sbi->skipped_gc_rwsem)
1209                         skipped_round++;
1210                 last_skipped = sbi->skipped_atomic_files[FG_GC];
1211                 round++;
1212         }
1213
1214         if (gc_type == FG_GC)
1215                 sbi->cur_victim_sec = NULL_SEGNO;
1216
1217         if (sync)
1218                 goto stop;
1219
1220         if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
1221                 if (skipped_round <= MAX_SKIP_GC_COUNT ||
1222                                         skipped_round * 2 < round) {
1223                         segno = NULL_SEGNO;
1224                         goto gc_more;
1225                 }
1226
1227                 if (first_skipped < last_skipped &&
1228                                 (last_skipped - first_skipped) >
1229                                                 sbi->skipped_gc_rwsem) {
1230                         f2fs_drop_inmem_pages_all(sbi, true);
1231                         segno = NULL_SEGNO;
1232                         goto gc_more;
1233                 }
1234                 if (gc_type == FG_GC)
1235                         ret = f2fs_write_checkpoint(sbi, &cpc);
1236         }
1237 stop:
1238         SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
1239         SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
1240
1241         trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed,
1242                                 get_pages(sbi, F2FS_DIRTY_NODES),
1243                                 get_pages(sbi, F2FS_DIRTY_DENTS),
1244                                 get_pages(sbi, F2FS_DIRTY_IMETA),
1245                                 free_sections(sbi),
1246                                 free_segments(sbi),
1247                                 reserved_segments(sbi),
1248                                 prefree_segments(sbi));
1249
1250         mutex_unlock(&sbi->gc_mutex);
1251
1252         put_gc_inode(&gc_list);
1253
1254         if (sync && !ret)
1255                 ret = sec_freed ? 0 : -EAGAIN;
1256         return ret;
1257 }
1258
1259 void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
1260 {
1261         DIRTY_I(sbi)->v_ops = &default_v_ops;
1262
1263         sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES;
1264
1265         /* give warm/cold data area from slower device */
1266         if (f2fs_is_multi_device(sbi) && sbi->segs_per_sec == 1)
1267                 SIT_I(sbi)->last_victim[ALLOC_NEXT] =
1268                                 GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
1269 }