// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */

#include <linux/interval_tree.h>
#include <linux/vfio.h>

#include <linux/pds/pds_common.h>
#include <linux/pds/pds_core_if.h>
#include <linux/pds/pds_adminq.h>

#include "vfio_dev.h"
#include "cmds.h"
#include "dirty.h"

#define READ_SEQ true
#define WRITE_ACK false

bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio)
{
	return pds_vfio->dirty.is_enabled;
}

void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio)
{
	pds_vfio->dirty.is_enabled = true;
}

void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio)
{
	pds_vfio->dirty.is_enabled = false;
}

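/*
 * Dirty page tracking overview (derived from the code below): the device
 * keeps a per-region "sequence" bitmap of pages it has dirtied.  The driver
 * keeps two host copies, host_seq and host_ack.  On each sync it reads the
 * device's sequence bitmap into host_seq, reports every bit that differs
 * from host_ack as dirty, copies host_seq into host_ack, and then writes
 * host_ack back to the device to acknowledge what has been reported.
 */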
static void
pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio,
				 u8 max_regions)
{
	int len = max_regions * sizeof(struct pds_lm_dirty_region_info);
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	struct pds_lm_dirty_region_info *region_info;
	dma_addr_t regions_dma;
	u8 num_regions;
	int err;

	region_info = kcalloc(max_regions,
			      sizeof(struct pds_lm_dirty_region_info),
			      GFP_KERNEL);
	if (!region_info)
		return;

	regions_dma =
		dma_map_single(pdsc_dev, region_info, len, DMA_FROM_DEVICE);
	if (dma_mapping_error(pdsc_dev, regions_dma))
		goto out_free_region_info;

	err = pds_vfio_dirty_status_cmd(pds_vfio, regions_dma, &max_regions,
					&num_regions);
	dma_unmap_single(pdsc_dev, regions_dma, len, DMA_FROM_DEVICE);
	if (err)
		goto out_free_region_info;

	for (unsigned int i = 0; i < num_regions; i++)
		dev_dbg(&pdev->dev,
			"region_info[%d]: dma_base 0x%llx page_count %u page_size_log2 %u\n",
			i, le64_to_cpu(region_info[i].dma_base),
			le32_to_cpu(region_info[i].page_count),
			region_info[i].page_size_log2);

out_free_region_info:
	kfree(region_info);
}

static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty,
					unsigned long bytes)
{
	unsigned long *host_seq_bmp, *host_ack_bmp;

	host_seq_bmp = vzalloc(bytes);
	if (!host_seq_bmp)
		return -ENOMEM;

	host_ack_bmp = vzalloc(bytes);
	if (!host_ack_bmp) {
		vfree(host_seq_bmp);
		return -ENOMEM;
	}

	dirty->host_seq.bmp = host_seq_bmp;
	dirty->host_ack.bmp = host_ack_bmp;

	return 0;
}

static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty)
{
	vfree(dirty->host_seq.bmp);
	vfree(dirty->host_ack.bmp);
	dirty->host_seq.bmp = NULL;
	dirty->host_ack.bmp = NULL;
}

static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio,
				      struct pds_vfio_bmp_info *bmp_info)
{
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;

	dma_unmap_single(pdsc_dev, bmp_info->sgl_addr,
			 bmp_info->num_sge * sizeof(struct pds_lm_sg_elem),
			 DMA_BIDIRECTIONAL);
	kfree(bmp_info->sgl);

	bmp_info->num_sge = 0;
	bmp_info->sgl = NULL;
	bmp_info->sgl_addr = 0;
}

static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio)
{
	if (pds_vfio->dirty.host_seq.sgl)
		__pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_seq);
	if (pds_vfio->dirty.host_ack.sgl)
		__pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_ack);
}

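/*
 * The bitmap for a region of page_count tracked pages occupies
 * page_count / 8 bytes.  In the worst case every bitmap page needs its own
 * scatter-gather element, so the SG list is sized for
 * DIV_ROUND_UP(page_count, PAGE_SIZE * 8) entries.
 */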
static int __pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
				      struct pds_vfio_bmp_info *bmp_info,
				      u32 page_count)
{
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	struct pds_lm_sg_elem *sgl;
	dma_addr_t sgl_addr;
	size_t sgl_size;
	u32 max_sge;

	max_sge = DIV_ROUND_UP(page_count, PAGE_SIZE * 8);
	sgl_size = max_sge * sizeof(struct pds_lm_sg_elem);

	sgl = kzalloc(sgl_size, GFP_KERNEL);
	if (!sgl)
		return -ENOMEM;

	sgl_addr = dma_map_single(pdsc_dev, sgl, sgl_size, DMA_BIDIRECTIONAL);
	if (dma_mapping_error(pdsc_dev, sgl_addr)) {
		kfree(sgl);
		return -EIO;
	}

	bmp_info->sgl = sgl;
	bmp_info->num_sge = max_sge;
	bmp_info->sgl_addr = sgl_addr;

	return 0;
}

static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
				    u32 page_count)
{
	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
	int err;

	err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_seq,
					 page_count);
	if (err)
		return err;

	err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_ack,
					 page_count);
	if (err) {
		__pds_vfio_dirty_free_sgl(pds_vfio, &dirty->host_seq);
		return err;
	}

	return 0;
}

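/*
 * Enabling tracking: query the device for max_regions (and verify tracking
 * isn't already running), collapse the requested IOVA ranges into a single
 * region, program that region's base/page_count/page_size into the device,
 * then size the host seq/ack bitmaps and their SG lists from the (possibly
 * device-adjusted) page_count.
 */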
static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
				 struct rb_root_cached *ranges, u32 nnodes,
				 u64 *page_size)
{
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
	u64 region_start, region_size, region_page_size;
	struct pds_lm_dirty_region_info *region_info;
	struct interval_tree_node *node = NULL;
	u8 max_regions = 0, num_regions;
	dma_addr_t regions_dma = 0;
	u32 num_ranges = nnodes;
	u32 page_count;
	u16 len;
	int err;

	dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
		pds_vfio->vf_id);

	if (pds_vfio_dirty_is_enabled(pds_vfio))
		return -EINVAL;

	/* find if dirty tracking is disabled, i.e. num_regions == 0 */
	err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions,
					&num_regions);
	if (err < 0) {
		dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n",
			ERR_PTR(err));
		return err;
	} else if (num_regions) {
		dev_err(&pdev->dev,
			"Dirty tracking already enabled for %d regions\n",
			num_regions);
		return -EEXIST;
	} else if (!max_regions) {
		dev_err(&pdev->dev,
			"Device doesn't support dirty tracking, max_regions %d\n",
			max_regions);
		return -EOPNOTSUPP;
	}

	/*
	 * Only support 1 region for now. If there are any large gaps in the
	 * VM's address regions, then this would be a waste of memory as we are
	 * generating 2 bitmaps (ack/seq) from the min address to the max
	 * address of the VM's address regions. In the future, if we support
	 * more than one region in the device/driver we can split the bitmaps
	 * on the largest address region gaps. We can do this split up to the
	 * max_regions times returned from the dirty_status command.
	 */
	max_regions = 1;
	if (num_ranges > max_regions) {
		vfio_combine_iova_ranges(ranges, nnodes, max_regions);
		num_ranges = max_regions;
	}

	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	if (!node)
		return -EINVAL;

	region_size = node->last - node->start + 1;
	region_start = node->start;
	region_page_size = *page_size;

	len = sizeof(*region_info);
	region_info = kzalloc(len, GFP_KERNEL);
	if (!region_info)
		return -ENOMEM;

	page_count = DIV_ROUND_UP(region_size, region_page_size);

	region_info->dma_base = cpu_to_le64(region_start);
	region_info->page_count = cpu_to_le32(page_count);
	region_info->page_size_log2 = ilog2(region_page_size);

	regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
				     DMA_BIDIRECTIONAL);
	if (dma_mapping_error(pdsc_dev, regions_dma)) {
		err = -ENOMEM;
		goto out_free_region_info;
	}

	err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions);
	dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
	if (err)
		goto out_free_region_info;

	/*
	 * page_count might be adjusted by the device,
	 * update it before freeing region_info DMA
	 */
	page_count = le32_to_cpu(region_info->page_count);

	dev_dbg(&pdev->dev,
		"region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n",
		regions_dma, region_start, page_count,
		(u8)ilog2(region_page_size));

	err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE);
	if (err) {
		dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
			ERR_PTR(err));
		goto out_free_region_info;
	}

	err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count);
	if (err) {
		dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
			ERR_PTR(err));
		goto out_free_bitmaps;
	}

	dirty->region_start = region_start;
	dirty->region_size = region_size;
	dirty->region_page_size = region_page_size;
	pds_vfio_dirty_set_enabled(pds_vfio);

	pds_vfio_print_guest_region_info(pds_vfio, max_regions);

	kfree(region_info);

	return 0;

out_free_bitmaps:
	pds_vfio_dirty_free_bitmaps(dirty);
out_free_region_info:
	kfree(region_info);

	return err;
}

void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd)
{
	if (pds_vfio_dirty_is_enabled(pds_vfio)) {
		pds_vfio_dirty_set_disabled(pds_vfio);
		if (send_cmd)
			pds_vfio_dirty_disable_cmd(pds_vfio);
		pds_vfio_dirty_free_sgl(pds_vfio);
		pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
	}

	if (send_cmd)
		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
}

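/*
 * Transfer one slice of a host bitmap to/from the device: look up the
 * vmalloc'ed pages backing [offset, offset + bmp_bytes), build an SG table
 * over them, copy the DMA addresses into the pre-allocated device SG list,
 * and issue the seq/ack command.  read_seq selects the direction:
 * device-to-host when reading the sequence bitmap, host-to-device when
 * writing back the acknowledgment bitmap.
 */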
static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,
				  struct pds_vfio_bmp_info *bmp_info,
				  u32 offset, u32 bmp_bytes, bool read_seq)
{
	const char *bmp_type_str = read_seq ? "read_seq" : "write_ack";
	u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	unsigned long long npages;
	struct sg_table sg_table;
	struct scatterlist *sg;
	struct page **pages;
	u32 page_offset;
	void *bmp;
	size_t size;
	u16 num_sge;
	int err;
	int i;

	bmp = (void *)((u64)bmp_info->bmp + offset);
	page_offset = offset_in_page(bmp);
	bmp -= page_offset;

	/*
	 * Start and end of bitmap section to seq/ack might not be page
	 * aligned, so use the page_offset to account for that so there
	 * will be enough pages to represent the bmp_bytes
	 */
	npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE);
	pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	for (unsigned long long i = 0; i < npages; i++) {
		struct page *page = vmalloc_to_page(bmp);

		if (!page) {
			err = -EFAULT;
			goto out_free_pages;
		}

		pages[i] = page;
		bmp += PAGE_SIZE;
	}

	err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset,
					bmp_bytes, GFP_KERNEL);
	if (err)
		goto out_free_pages;

	err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
	if (err)
		goto out_free_sg_table;

	for_each_sgtable_dma_sg(&sg_table, sg, i) {
		struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i];

		sg_elem->addr = cpu_to_le64(sg_dma_address(sg));
		sg_elem->len = cpu_to_le32(sg_dma_len(sg));
	}

	num_sge = sg_table.nents;
	size = num_sge * sizeof(struct pds_lm_sg_elem);
	dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
	err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr, num_sge,
					 offset, bmp_bytes, read_seq);
	if (err)
		dev_err(&pdev->dev,
			"Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n",
			bmp_type_str, offset, bmp_bytes,
			num_sge, bmp_info->sgl_addr, ERR_PTR(err));
	dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);

	dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
out_free_sg_table:
	sg_free_table(&sg_table);
out_free_pages:
	kfree(pages);

	return err;
}

static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio,
				    u32 offset, u32 len)
{
	return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack,
				      offset, len, WRITE_ACK);
}

static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio,
				   u32 offset, u32 len)
{
	return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq,
				      offset, len, READ_SEQ);
}

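/*
 * Walk the freshly read sequence bitmap in 64-bit chunks and compare it
 * against the last acknowledged bitmap: any bit that differs (hence the
 * XOR) marks a page dirtied since the previous sync.  Bit N of the region
 * bitmap corresponds to IOVA region_start + N * region_page_size.
 */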
static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio,
					  struct iova_bitmap *dirty_bitmap,
					  u32 bmp_offset, u32 len_bytes)
{
	u64 page_size = pds_vfio->dirty.region_page_size;
	u64 region_start = pds_vfio->dirty.region_start;
	u32 bmp_offset_bit;
	__le64 *seq, *ack;
	int dword_count;

	dword_count = len_bytes / sizeof(u64);
	seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset);
	ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset);
	bmp_offset_bit = bmp_offset * 8;

	for (int i = 0; i < dword_count; i++) {
		u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]);

		/* prepare for next write_ack call */
		ack[i] = seq[i];

		for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) {
			if (xor & BIT(bit_i)) {
				u64 abs_bit_i = bmp_offset_bit +
						i * BITS_PER_TYPE(u64) + bit_i;
				u64 addr = abs_bit_i * page_size + region_start;

				iova_bitmap_set(dirty_bitmap, addr, page_size);
			}
		}
	}

	return 0;
}

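/*
 * A sync request covers [iova, iova + length) within the tracked region.
 * The bitmaps are only ever touched in whole u64 chunks, so both the byte
 * offset into the bitmap and the number of bytes transferred are aligned
 * to sizeof(u64).  As an illustrative example (values are hypothetical):
 * with a 4 KiB region_page_size, iova - region_start = 4 MiB and
 * length = 2 MiB give bmp_offset = 1024 / 8 = 128 bytes and
 * bmp_bytes = ALIGN(512 / 8, 8) = 64 bytes.
 */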
static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
			       struct iova_bitmap *dirty_bitmap,
			       unsigned long iova, unsigned long length)
{
	struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
	u64 bmp_offset, bmp_bytes;
	u64 bitmap_size, pages;
	int err;

	dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id);

	if (!pds_vfio_dirty_is_enabled(pds_vfio)) {
		dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n",
			pds_vfio->vf_id);
		return -EINVAL;
	}

	pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size);
	bitmap_size =
		round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;

	dev_dbg(dev,
		"vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n",
		pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size,
		pages, bitmap_size);

	if (!length || ((iova - dirty->region_start + length) > dirty->region_size)) {
		dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n",
			iova, length);
		return -EINVAL;
	}

	/* bitmap is modified in 64 bit chunks */
	bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size,
				       sizeof(u64)), sizeof(u64));
	if (bmp_bytes != bitmap_size) {
		dev_err(dev,
			"Calculated bitmap bytes %llu not equal to bitmap size %llu\n",
			bmp_bytes, bitmap_size);
		return -EINVAL;
	}

	bmp_offset = DIV_ROUND_UP((iova - dirty->region_start) /
				  dirty->region_page_size, sizeof(u64));

	dev_dbg(dev,
		"Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n",
		iova, length, bmp_offset, bmp_bytes);

	err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes);
	if (err)
		return err;

	err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap, bmp_offset,
					     bmp_bytes);
	if (err)
		return err;

	err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes);
	if (err)
		return err;

	return 0;
}

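/*
 * The functions below are the dirty-tracking (DMA logging) callbacks this
 * driver exposes to the VFIO core through its struct vfio_log_ops; each one
 * simply takes the device's state_mutex around the corresponding helper
 * above.
 */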
int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
				unsigned long length, struct iova_bitmap *dirty)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);
	int err;

	mutex_lock(&pds_vfio->state_mutex);
	err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
	pds_vfio_state_mutex_unlock(pds_vfio);

	return err;
}

int pds_vfio_dma_logging_start(struct vfio_device *vdev,
			       struct rb_root_cached *ranges, u32 nnodes,
			       u64 *page_size)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);
	int err;

	mutex_lock(&pds_vfio->state_mutex);
	pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
	err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
	pds_vfio_state_mutex_unlock(pds_vfio);

	return err;
}

int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);

	mutex_lock(&pds_vfio->state_mutex);
	pds_vfio_dirty_disable(pds_vfio, true);
	pds_vfio_state_mutex_unlock(pds_vfio);

	return 0;
}