GNU Linux-libre 6.7.9-gnu
drivers/vfio/pci/pds/dirty.c
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */

#include <linux/interval_tree.h>
#include <linux/vfio.h>

#include <linux/pds/pds_common.h>
#include <linux/pds/pds_core_if.h>
#include <linux/pds/pds_adminq.h>

#include "vfio_dev.h"
#include "cmds.h"
#include "dirty.h"

#define READ_SEQ true
#define WRITE_ACK false

bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio)
{
        return pds_vfio->dirty.is_enabled;
}

void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio)
{
        pds_vfio->dirty.is_enabled = true;
}

void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio)
{
        pds_vfio->dirty.is_enabled = false;
}

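/*
 * Debug helper: DMA-map a scratch array of region descriptors, query them
 * from the device with the dirty_status command, and dump each region's
 * dma_base, page_count and page_size_log2 to the debug log. Errors are
 * ignored since the output is informational only.
 */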
static void
pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio,
                                 u8 max_regions)
{
        int len = max_regions * sizeof(struct pds_lm_dirty_region_info);
        struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
        struct device *pdsc_dev = &pci_physfn(pdev)->dev;
        struct pds_lm_dirty_region_info *region_info;
        dma_addr_t regions_dma;
        u8 num_regions;
        int err;

        region_info = kcalloc(max_regions,
                              sizeof(struct pds_lm_dirty_region_info),
                              GFP_KERNEL);
        if (!region_info)
                return;

        regions_dma =
                dma_map_single(pdsc_dev, region_info, len, DMA_FROM_DEVICE);
        if (dma_mapping_error(pdsc_dev, regions_dma))
                goto out_free_region_info;

        err = pds_vfio_dirty_status_cmd(pds_vfio, regions_dma, &max_regions,
                                        &num_regions);
        dma_unmap_single(pdsc_dev, regions_dma, len, DMA_FROM_DEVICE);
        if (err)
                goto out_free_region_info;

        for (unsigned int i = 0; i < num_regions; i++)
                dev_dbg(&pdev->dev,
                        "region_info[%d]: dma_base 0x%llx page_count %u page_size_log2 %u\n",
                        i, le64_to_cpu(region_info[i].dma_base),
                        le32_to_cpu(region_info[i].page_count),
                        region_info[i].page_size_log2);

out_free_region_info:
        kfree(region_info);
}

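/*
 * Allocate the host-side "sequence" and "acknowledge" bitmaps, one bit per
 * tracked page. They can be large, so they are vzalloc()'d and must be
 * released with vfree().
 */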
static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty,
                                        unsigned long bytes)
{
        unsigned long *host_seq_bmp, *host_ack_bmp;

        host_seq_bmp = vzalloc(bytes);
        if (!host_seq_bmp)
                return -ENOMEM;

        host_ack_bmp = vzalloc(bytes);
        if (!host_ack_bmp) {
                vfree(host_seq_bmp);
                return -ENOMEM;
        }

        dirty->host_seq.bmp = host_seq_bmp;
        dirty->host_ack.bmp = host_ack_bmp;

        return 0;
}

static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty)
{
        vfree(dirty->host_seq.bmp);
        vfree(dirty->host_ack.bmp);
        dirty->host_seq.bmp = NULL;
        dirty->host_ack.bmp = NULL;
}

static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio,
                                      struct pds_vfio_bmp_info *bmp_info)
{
        struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
        struct device *pdsc_dev = &pci_physfn(pdev)->dev;

        dma_unmap_single(pdsc_dev, bmp_info->sgl_addr,
                         bmp_info->num_sge * sizeof(struct pds_lm_sg_elem),
                         DMA_BIDIRECTIONAL);
        kfree(bmp_info->sgl);

        bmp_info->num_sge = 0;
        bmp_info->sgl = NULL;
        bmp_info->sgl_addr = 0;
}

static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio)
{
        if (pds_vfio->dirty.host_seq.sgl)
                __pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_seq);
        if (pds_vfio->dirty.host_ack.sgl)
                __pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_ack);
}

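/*
 * Allocate and DMA-map the scatter-gather list used to transfer a host
 * bitmap to/from the device. Each SG element describes up to one page of
 * bitmap, i.e. PAGE_SIZE * 8 tracked pages, which determines max_sge.
 */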
static int __pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
                                      struct pds_vfio_bmp_info *bmp_info,
                                      u32 page_count)
{
        struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
        struct device *pdsc_dev = &pci_physfn(pdev)->dev;
        struct pds_lm_sg_elem *sgl;
        dma_addr_t sgl_addr;
        size_t sgl_size;
        u32 max_sge;

        max_sge = DIV_ROUND_UP(page_count, PAGE_SIZE * 8);
        sgl_size = max_sge * sizeof(struct pds_lm_sg_elem);

        sgl = kzalloc(sgl_size, GFP_KERNEL);
        if (!sgl)
                return -ENOMEM;

        sgl_addr = dma_map_single(pdsc_dev, sgl, sgl_size, DMA_BIDIRECTIONAL);
        if (dma_mapping_error(pdsc_dev, sgl_addr)) {
                kfree(sgl);
                return -EIO;
        }

        bmp_info->sgl = sgl;
        bmp_info->num_sge = max_sge;
        bmp_info->sgl_addr = sgl_addr;

        return 0;
}

static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
                                    u32 page_count)
{
        struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
        int err;

        err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_seq,
                                         page_count);
        if (err)
                return err;

        err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_ack,
                                         page_count);
        if (err) {
                __pds_vfio_dirty_free_sgl(pds_vfio, &dirty->host_seq);
                return err;
        }

        return 0;
}

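/*
 * Enable device dirty page tracking: confirm the device supports tracking
 * and is not already tracking, collapse the requested IOVA ranges into a
 * single region, describe that region to the device, then size the host
 * seq/ack bitmaps and SG lists from the (possibly device-adjusted) page
 * count.
 */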
static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
                                 struct rb_root_cached *ranges, u32 nnodes,
                                 u64 *page_size)
{
        struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
        struct device *pdsc_dev = &pci_physfn(pdev)->dev;
        struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
        u64 region_start, region_size, region_page_size;
        struct pds_lm_dirty_region_info *region_info;
        struct interval_tree_node *node = NULL;
        u8 max_regions = 0, num_regions;
        dma_addr_t regions_dma = 0;
        u32 num_ranges = nnodes;
        u32 page_count;
        u16 len;
        int err;

        dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
                pds_vfio->vf_id);

        if (pds_vfio_dirty_is_enabled(pds_vfio))
                return -EINVAL;

        /* Confirm dirty tracking is currently disabled, i.e. num_regions == 0 */
        err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions,
                                        &num_regions);
        if (err < 0) {
                dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n",
                        ERR_PTR(err));
                return err;
        } else if (num_regions) {
                dev_err(&pdev->dev,
                        "Dirty tracking already enabled for %d regions\n",
                        num_regions);
                return -EEXIST;
        } else if (!max_regions) {
                dev_err(&pdev->dev,
                        "Device doesn't support dirty tracking, max_regions %d\n",
                        max_regions);
                return -EOPNOTSUPP;
        }

        /*
         * Only support 1 region for now. If there are any large gaps in the
         * VM's address regions, then this would be a waste of memory as we are
         * generating 2 bitmaps (ack/seq) from the min address to the max
         * address of the VM's address regions. In the future, if we support
         * more than one region in the device/driver we can split the bitmaps
         * on the largest address region gaps. We can do this split up to the
         * max_regions times returned from the dirty_status command.
         */
        max_regions = 1;
        if (num_ranges > max_regions) {
                vfio_combine_iova_ranges(ranges, nnodes, max_regions);
                num_ranges = max_regions;
        }

        node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
        if (!node)
                return -EINVAL;

        region_size = node->last - node->start + 1;
        region_start = node->start;
        region_page_size = *page_size;

        len = sizeof(*region_info);
        region_info = kzalloc(len, GFP_KERNEL);
        if (!region_info)
                return -ENOMEM;

        page_count = DIV_ROUND_UP(region_size, region_page_size);

        region_info->dma_base = cpu_to_le64(region_start);
        region_info->page_count = cpu_to_le32(page_count);
        region_info->page_size_log2 = ilog2(region_page_size);

        regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
                                     DMA_BIDIRECTIONAL);
        if (dma_mapping_error(pdsc_dev, regions_dma)) {
                err = -ENOMEM;
                goto out_free_region_info;
        }

        err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions);
        dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
        if (err)
                goto out_free_region_info;

        /*
         * page_count might be adjusted by the device,
         * update it before freeing region_info DMA
         */
        page_count = le32_to_cpu(region_info->page_count);

        dev_dbg(&pdev->dev,
                "region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n",
                regions_dma, region_start, page_count,
                (u8)ilog2(region_page_size));

        err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE);
        if (err) {
                dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
                        ERR_PTR(err));
                goto out_free_region_info;
        }

        err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count);
        if (err) {
                dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
                        ERR_PTR(err));
                goto out_free_bitmaps;
        }

        dirty->region_start = region_start;
        dirty->region_size = region_size;
        dirty->region_page_size = region_page_size;
        pds_vfio_dirty_set_enabled(pds_vfio);

        pds_vfio_print_guest_region_info(pds_vfio, max_regions);

        kfree(region_info);

        return 0;

out_free_bitmaps:
        pds_vfio_dirty_free_bitmaps(dirty);
out_free_region_info:
        kfree(region_info);
        return err;
}

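/*
 * Tear down the host dirty tracking state and, when send_cmd is set, tell
 * the device to stop tracking and send the PDS_LM_STA_NONE live migration
 * status for this VF.
 */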
void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd)
{
        if (pds_vfio_dirty_is_enabled(pds_vfio)) {
                pds_vfio_dirty_set_disabled(pds_vfio);
                if (send_cmd)
                        pds_vfio_dirty_disable_cmd(pds_vfio);
                pds_vfio_dirty_free_sgl(pds_vfio);
                pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
        }

        if (send_cmd)
                pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
}

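/*
 * Transfer a section of one host bitmap to or from the device. The
 * vmalloc()'d bitmap pages backing the requested window are gathered into
 * an SG table, DMA-mapped, and copied into the pre-mapped SG list that the
 * seq/ack admin command hands to the device (READ_SEQ fetches the device's
 * sequence bitmap, WRITE_ACK pushes the host's acknowledge bitmap back).
 */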
static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,
                                  struct pds_vfio_bmp_info *bmp_info,
                                  u32 offset, u32 bmp_bytes, bool read_seq)
{
        const char *bmp_type_str = read_seq ? "read_seq" : "write_ack";
        u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
        struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
        struct device *pdsc_dev = &pci_physfn(pdev)->dev;
        unsigned long long npages;
        struct sg_table sg_table;
        struct scatterlist *sg;
        struct page **pages;
        u32 page_offset;
        const void *bmp;
        size_t size;
        u16 num_sge;
        int err;
        int i;

        bmp = (void *)((u64)bmp_info->bmp + offset);
        page_offset = offset_in_page(bmp);
        bmp -= page_offset;

        /*
         * Start and end of bitmap section to seq/ack might not be page
         * aligned, so use the page_offset to account for that so there
         * will be enough pages to represent the bmp_bytes
         */
        npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE);
        pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
        if (!pages)
                return -ENOMEM;

        for (unsigned long long i = 0; i < npages; i++) {
                struct page *page = vmalloc_to_page(bmp);

                if (!page) {
                        err = -EFAULT;
                        goto out_free_pages;
                }

                pages[i] = page;
                bmp += PAGE_SIZE;
        }

        err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset,
                                        bmp_bytes, GFP_KERNEL);
        if (err)
                goto out_free_pages;

        err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
        if (err)
                goto out_free_sg_table;

        for_each_sgtable_dma_sg(&sg_table, sg, i) {
                struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i];

                sg_elem->addr = cpu_to_le64(sg_dma_address(sg));
                sg_elem->len = cpu_to_le32(sg_dma_len(sg));
        }

        num_sge = sg_table.nents;
        size = num_sge * sizeof(struct pds_lm_sg_elem);
        dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
        err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr, num_sge,
                                         offset, bmp_bytes, read_seq);
        if (err)
                dev_err(&pdev->dev,
                        "Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n",
                        bmp_type_str, offset, bmp_bytes,
                        num_sge, bmp_info->sgl_addr, ERR_PTR(err));
        dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);

        dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
out_free_sg_table:
        sg_free_table(&sg_table);
out_free_pages:
        kfree(pages);

        return err;
}

static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio,
                                    u32 offset, u32 len)
{
        return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack,
                                      offset, len, WRITE_ACK);
}

static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio,
                                   u32 offset, u32 len)
{
        return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq,
                                      offset, len, READ_SEQ);
}

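/*
 * Compare the seq and ack bitmaps 64 bits at a time: any bit that differs
 * marks a page dirtied since the last sync. Report each such page to the
 * caller's IOVA bitmap and copy seq into ack so the following write_ack
 * tells the device which updates the host has consumed.
 */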
static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio,
                                          struct iova_bitmap *dirty_bitmap,
                                          u32 bmp_offset, u32 len_bytes)
{
        u64 page_size = pds_vfio->dirty.region_page_size;
        u64 region_start = pds_vfio->dirty.region_start;
        u32 bmp_offset_bit;
        __le64 *seq, *ack;
        int dword_count;

        dword_count = len_bytes / sizeof(u64);
        seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset);
        ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset);
        bmp_offset_bit = bmp_offset * 8;

        for (int i = 0; i < dword_count; i++) {
                u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]);

                /* prepare for next write_ack call */
                ack[i] = seq[i];

                for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) {
                        if (xor & BIT(bit_i)) {
                                u64 abs_bit_i = bmp_offset_bit +
                                                i * BITS_PER_TYPE(u64) + bit_i;
                                u64 addr = abs_bit_i * page_size + region_start;

                                iova_bitmap_set(dirty_bitmap, addr, page_size);
                        }
                }
        }

        return 0;
}

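/*
 * Sync dirty state for one IOVA range: validate the range against the
 * tracked region, read the device's sequence bitmap for that window,
 * translate seq/ack differences into the caller's IOVA bitmap, then
 * acknowledge the result back to the device.
 */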
static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
                               struct iova_bitmap *dirty_bitmap,
                               unsigned long iova, unsigned long length)
{
        struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
        struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
        u64 bmp_offset, bmp_bytes;
        u64 bitmap_size, pages;
        int err;

        dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id);

        if (!pds_vfio_dirty_is_enabled(pds_vfio)) {
                dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n",
                        pds_vfio->vf_id);
                return -EINVAL;
        }

        pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size);
        bitmap_size =
                round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;

        dev_dbg(dev,
                "vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n",
                pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size,
                pages, bitmap_size);

        if (!length || ((iova - dirty->region_start + length) > dirty->region_size)) {
                dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n",
                        iova, length);
                return -EINVAL;
        }

        /* bitmap is modified in 64 bit chunks */
        bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size,
                                       sizeof(u64)),
                          sizeof(u64));
        if (bmp_bytes != bitmap_size) {
                dev_err(dev,
                        "Calculated bitmap bytes %llu not equal to bitmap size %llu\n",
                        bmp_bytes, bitmap_size);
                return -EINVAL;
        }

        bmp_offset = DIV_ROUND_UP((iova - dirty->region_start) /
                                  dirty->region_page_size, sizeof(u64));

        dev_dbg(dev,
                "Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n",
                iova, length, bmp_offset, bmp_bytes);

        err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes);
        if (err)
                return err;

        err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap, bmp_offset,
                                             bmp_bytes);
        if (err)
                return err;

        err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes);
        if (err)
                return err;

        return 0;
}

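/*
 * Entry points for VFIO dirty page (DMA) logging. The function signatures
 * match the vfio_log_ops callbacks; each one serializes against other
 * device state changes via state_mutex.
 */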
int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
                                unsigned long length, struct iova_bitmap *dirty)
{
        struct pds_vfio_pci_device *pds_vfio =
                container_of(vdev, struct pds_vfio_pci_device,
                             vfio_coredev.vdev);
        int err;

        mutex_lock(&pds_vfio->state_mutex);
        err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
        pds_vfio_state_mutex_unlock(pds_vfio);

        return err;
}

int pds_vfio_dma_logging_start(struct vfio_device *vdev,
                               struct rb_root_cached *ranges, u32 nnodes,
                               u64 *page_size)
{
        struct pds_vfio_pci_device *pds_vfio =
                container_of(vdev, struct pds_vfio_pci_device,
                             vfio_coredev.vdev);
        int err;

        mutex_lock(&pds_vfio->state_mutex);
        pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
        err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
        pds_vfio_state_mutex_unlock(pds_vfio);

        return err;
}

int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
{
        struct pds_vfio_pci_device *pds_vfio =
                container_of(vdev, struct pds_vfio_pci_device,
                             vfio_coredev.vdev);

        mutex_lock(&pds_vfio->state_mutex);
        pds_vfio_dirty_disable(pds_vfio, true);
        pds_vfio_state_mutex_unlock(pds_vfio);

        return 0;
}
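
/*
 * The three exported functions above are expected to be wired into the
 * device's vfio_log_ops elsewhere in the driver. A minimal sketch of that
 * wiring, not necessarily the exact upstream definition:
 *
 *	static const struct vfio_log_ops pds_vfio_log_ops = {
 *		.log_start = pds_vfio_dma_logging_start,
 *		.log_stop = pds_vfio_dma_logging_stop,
 *		.log_read_and_clear = pds_vfio_dma_logging_report,
 *	};
 */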