1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
/* Status codes returned when polling a completion queue (CQ):
 * CQ_OK - a completion was consumed, CQ_EMPTY - nothing pending,
 * CQ_POLL_ERR - an error was hit while polling.
 */
8 enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };
/*
 * Query the 'migratable' bit of cmd_hca_cap_2 for the function identified
 * by @func_id (queried via the other-function capability interface).
 * Returns 0 when the function is migratable, a negative errno otherwise.
 */
10 static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
12	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
13	void *query_cap = NULL, *cap;
16	query_cap = kzalloc(query_sz, GFP_KERNEL);
20	ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
25	cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
	/* Reject functions that do not advertise the migratable capability */
26	if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
/* Forward declarations for helpers defined later in this file. */
33 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
36 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
/*
 * Issue the SUSPEND_VHCA firmware command for this device's vhca_id.
 * @op_mod selects the suspend sub-operation. Must be called with
 * mvdev->state_mutex held; fails early if the mdev has been detached.
 */
38 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
40	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
41	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
42	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
45	lockdep_assert_held(&mvdev->state_mutex);
46	if (mvdev->mdev_detach)
50	 * In case PRE_COPY is used, saving_migf is exposed while the device is
51	 * running. Make sure to run only once there is no active save command.
52	 * Running both in parallel, might end-up with a failure in the save
53	 * command once it will try to turn on 'tracking' on a suspended device.
	/* Serialize against an in-flight async SAVE (interruptible wait) */
56		err = wait_for_completion_interruptible(&migf->save_comp);
61	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
62	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
63	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
65	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
	/* Re-arm save_comp that was consumed above */
67		complete(&migf->save_comp);
/*
 * Issue the RESUME_VHCA firmware command for this device's vhca_id.
 * @op_mod selects the resume sub-operation. Must be called with
 * mvdev->state_mutex held; fails early if the mdev has been detached.
 */
72 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
74	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
75	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
77	lockdep_assert_held(&mvdev->state_mutex);
78	if (mvdev->mdev_detach)
81	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
82	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
83	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);
85	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
/*
 * Query the size of the device migration state via
 * QUERY_VHCA_MIGRATION_STATE. @query_flags may request an incremental
 * query (MLX5VF_QUERY_INC) and/or mark the final query (MLX5VF_QUERY_FINAL).
 * On success *state_size receives the (next) image size and *total_size the
 * remaining total in chunk mode, otherwise the same value as *state_size.
 * Caller must hold mvdev->state_mutex.
 */
88 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
89					  size_t *state_size, u64 *total_size,
92	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
93	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
94	bool inc = query_flags & MLX5VF_QUERY_INC;
97	lockdep_assert_held(&mvdev->state_mutex);
98	if (mvdev->mdev_detach)
102	 * In case PRE_COPY is used, saving_migf is exposed while device is
103	 * running. Make sure to run only once there is no active save command.
104	 * Running both in parallel, might end-up with a failure in the
105	 * incremental query command on un-tracked vhca.
108		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
111		if (mvdev->saving_migf->state ==
112					MLX5_MIGF_STATE_PRE_COPY_ERROR) {
114			 * In case we had a PRE_COPY error, only query full
115			 * image for final image
117			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
119				complete(&mvdev->saving_migf->save_comp);
			/* Downgrade to a full-image query after a PRE_COPY error */
122			query_flags &= ~MLX5VF_QUERY_INC;
126	MLX5_SET(query_vhca_migration_state_in, in, opcode,
127		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
128	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
129	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
130	MLX5_SET(query_vhca_migration_state_in, in, incremental,
131		 query_flags & MLX5VF_QUERY_INC);
132	MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);
134	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
	/* Release save_comp taken above before inspecting the result */
137		complete(&mvdev->saving_migf->save_comp);
142	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
	/* In chunk mode the device also reports the remaining total size */
145	*total_size = mvdev->chunk_mode ?
146		MLX5_GET64(query_vhca_migration_state_out, out,
147				remaining_total_size) : *state_size;
/* Flag the dirty-page tracker as failed and wake any waiter on it. */
152 static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
154	/* Mark the tracker under an error and wake it up if it's running */
155	mvdev->tracker.is_err = true;
156	complete(&mvdev->tracker_comp);
/*
 * SR-IOV blocking notifier callback: track whether the VF's parent mdev
 * is attached. On VF enable, clear mdev_detach; on VF disable, tear down
 * migration state first (outside the lock) and then mark detached.
 */
159 static int mlx5fv_vf_event(struct notifier_block *nb,
160			   unsigned long event, void *data)
162	struct mlx5vf_pci_core_device *mvdev =
163		container_of(nb, struct mlx5vf_pci_core_device, nb);
166	case MLX5_PF_NOTIFY_ENABLE_VF:
167		mutex_lock(&mvdev->state_mutex);
168		mvdev->mdev_detach = false;
169		mlx5vf_state_mutex_unlock(mvdev);
171	case MLX5_PF_NOTIFY_DISABLE_VF:
		/* Close migration resources before flagging the detach */
172		mlx5vf_cmd_close_migratable(mvdev);
173		mutex_lock(&mvdev->state_mutex);
174		mvdev->mdev_detach = true;
175		mlx5vf_state_mutex_unlock(mvdev);
/*
 * Stop all migration activity for this device: fail the page tracker,
 * close the migration file descriptors and free tracker resources.
 * No-op when the device never enabled the migrate capability.
 */
184 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
186	if (!mvdev->migrate_cap)
189	/* Must be done outside the lock to let it progress */
190	set_tracker_error(mvdev);
191	mutex_lock(&mvdev->state_mutex);
192	mlx5vf_disable_fds(mvdev);
193	_mlx5vf_free_page_tracker_resources(mvdev);
194	mlx5vf_state_mutex_unlock(mvdev);
/*
 * Undo mlx5vf_cmd_set_migratable(): unregister the SR-IOV notifier and
 * destroy the callback workqueue. No-op without the migrate capability.
 */
197 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
199	if (!mvdev->migrate_cap)
202	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
204	destroy_workqueue(mvdev->cb_wq);
/*
 * Probe-time setup of the migration capability for a VF:
 * verify the device is a migratable VF, resolve its vhca_id, create the
 * callback workqueue, register for SR-IOV enable/disable events and
 * publish the supported VFIO migration flags / ops. On any failure the
 * core dev reference is dropped and the device is left non-migratable.
 */
207 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
208			       const struct vfio_migration_ops *mig_ops,
209			       const struct vfio_log_ops *log_ops)
211	struct pci_dev *pdev = mvdev->core_device.pdev;
	/* Only virtual functions support this migration flow */
214	if (!pdev->is_virtfn)
217	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
221	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
224	mvdev->vf_id = pci_iov_vf_id(pdev);
225	if (mvdev->vf_id < 0)
	/* Firmware function ids are 1-based relative to the VF index */
228	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
232	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
236	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
240	mutex_init(&mvdev->state_mutex);
241	spin_lock_init(&mvdev->reset_lock);
242	mvdev->nb.notifier_call = mlx5fv_vf_event;
243	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
246		destroy_workqueue(mvdev->cb_wq);
250	mvdev->migrate_cap = 1;
251	mvdev->core_device.vdev.migration_flags =
252		VFIO_MIGRATION_STOP_COPY |
254	mvdev->core_device.vdev.mig_ops = mig_ops;
255	init_completion(&mvdev->tracker_comp);
	/* Dirty-page log ops only when the device supports page tracking */
256	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
257		mvdev->core_device.vdev.log_ops = log_ops;
259	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
260	    MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
261		mvdev->core_device.vdev.migration_flags |=
262			VFIO_MIGRATION_PRE_COPY;
264	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
265		mvdev->chunk_mode = 1;
	/* Error path: release the core device reference taken above */
268	mlx5_vf_put_core_dev(mvdev->mdev);
/*
 * Resolve the vhca_id of another function (@function_id) by issuing
 * QUERY_HCA_CAP with other_function set, reading the current general
 * device capabilities. On success *vhca_id is filled.
 */
271 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
274	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
279	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
280	out = kzalloc(out_size, GFP_KERNEL);
284	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
285	MLX5_SET(query_hca_cap_in, in, other_function, 1);
286	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
287	MLX5_SET(query_hca_cap_in, in, op_mod,
288		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
289		 HCA_CAP_OPMOD_GET_CUR);
291	ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
295	*vhca_id = MLX5_GET(query_hca_cap_out, out,
296			    capability.cmd_hca_cap.vhca_id);
/*
 * Create an MTT memory key covering either a migration data buffer
 * (@buf, via its DMA-mapped sgtable) or a page-tracker receive buffer
 * (@recv_buf, via its dma_addrs array) — exactly one of the two is used.
 * The mkey is created on @pdn with local read/write and remote
 * read/write access, PAGE_SIZE pages.
 */
303 static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
304			struct mlx5_vhca_data_buffer *buf,
305			struct mlx5_vhca_recv_buf *recv_buf,
	/* npages derives from whichever buffer variant was passed in */
308	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
	/* MTT entries are packed two per octword, hence round_up(npages, 2) */
315	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
316		sizeof(*mtt) * round_up(npages, 2);
318	in = kvzalloc(inlen, GFP_KERNEL);
322	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
323		 DIV_ROUND_UP(npages, 2));
324	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
327		struct sg_dma_page_iter dma_iter;
329		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
330			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
334		for (i = 0; i < npages; i++)
335			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
338	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
339	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
340	MLX5_SET(mkc, mkc, lr, 1);
341	MLX5_SET(mkc, mkc, lw, 1);
342	MLX5_SET(mkc, mkc, rr, 1);
343	MLX5_SET(mkc, mkc, rw, 1);
344	MLX5_SET(mkc, mkc, pd, pdn);
345	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
346	MLX5_SET(mkc, mkc, qpn, 0xffffff);
347	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
348	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
349	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
350	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
/*
 * DMA-map a migration data buffer's sgtable and create its mkey.
 * Idempotent: returns early if already mapped (buf->dmaed) or empty.
 * On mkey creation failure the mapping is undone. Caller must hold
 * mvdev->state_mutex.
 */
355 static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
357	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
358	struct mlx5_core_dev *mdev = mvdev->mdev;
361	lockdep_assert_held(&mvdev->state_mutex);
362	if (mvdev->mdev_detach)
365	if (buf->dmaed || !buf->allocated_length)
368	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
372	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
	/* Error path: unmap what was mapped above */
380	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
/*
 * Fully release a migration data buffer: destroy its mkey, unmap the
 * sgtable, free every backing page and release the append table.
 * Caller must hold the state_mutex and the mdev must still be attached.
 */
384 void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
386	struct mlx5_vf_migration_file *migf = buf->migf;
387	struct sg_page_iter sg_iter;
389	lockdep_assert_held(&migf->mvdev->state_mutex);
390	WARN_ON(migf->mvdev->mdev_detach);
393		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
394		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
398	/* Undo alloc_pages_bulk_array() */
399	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
400		__free_page(sg_page_iter_page(&sg_iter));
401	sg_free_append_table(&buf->table);
/*
 * Allocate a new migration data buffer with enough pages for @length
 * bytes; DMA-map it unless @dma_dir is DMA_NONE (header-only buffers).
 * Returns the buffer or an ERR_PTR; partial allocations are freed on
 * failure.
 */
405 struct mlx5_vhca_data_buffer *
406 mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
408			 enum dma_data_direction dma_dir)
410	struct mlx5_vhca_data_buffer *buf;
413	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
415		return ERR_PTR(-ENOMEM);
417	buf->dma_dir = dma_dir;
420		ret = mlx5vf_add_migration_pages(buf,
421				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
425		if (dma_dir != DMA_NONE) {
426			ret = mlx5vf_dma_data_buffer(buf);
	/* Error path: release the partially constructed buffer */
434	mlx5vf_free_data_buffer(buf);
/*
 * Return a data buffer to its migration file's avail_list for reuse,
 * resetting its chunk number. Protected by migf->list_lock.
 */
438 void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
440	spin_lock_irq(&buf->migf->list_lock);
441	buf->stop_copy_chunk_num = 0;
442	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
443	spin_unlock_irq(&buf->migf->list_lock);
/*
 * Get a data buffer of at least @length bytes with the requested DMA
 * direction: reuse a large-enough buffer from the avail_list if one
 * exists, otherwise allocate a new one. Same-direction buffers that are
 * too small are moved to a local free list and released after the
 * spinlock is dropped (freeing sleeps). Caller must hold state_mutex.
 */
446 struct mlx5_vhca_data_buffer *
447 mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
448		       size_t length, enum dma_data_direction dma_dir)
450	struct mlx5_vhca_data_buffer *buf, *temp_buf;
451	struct list_head free_list;
453	lockdep_assert_held(&migf->mvdev->state_mutex);
454	if (migf->mvdev->mdev_detach)
455		return ERR_PTR(-ENOTCONN);
457	INIT_LIST_HEAD(&free_list);
459	spin_lock_irq(&migf->list_lock);
460	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
461		if (buf->dma_dir == dma_dir) {
462			list_del_init(&buf->buf_elm);
463			if (buf->allocated_length >= length) {
464				spin_unlock_irq(&migf->list_lock);
468			 * Prevent holding redundant buffers. Put in a free
469			 * list and call at the end not under the spin lock
470			 * (&migf->list_lock) to mlx5vf_free_data_buffer which
473			list_add(&buf->buf_elm, &free_list);
476	spin_unlock_irq(&migf->list_lock);
	/* No suitable cached buffer - allocate a fresh one */
477	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
480	while ((temp_buf = list_first_entry_or_null(&free_list,
481				struct mlx5_vhca_data_buffer, buf_elm))) {
482		list_del(&temp_buf->buf_elm);
483		mlx5vf_free_data_buffer(temp_buf);
/*
 * Common tail of a SAVE command: free the command output buffer and
 * signal save_comp so the next save/suspend/query may proceed.
 */
490 mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
491			      struct mlx5vf_async_data *async_data)
493	kvfree(async_data->out);
494	complete(&migf->save_comp);
/*
 * Workqueue handler that finishes a failed async SAVE in process
 * context: recycle the data (and optional header) buffers, translate
 * the failure into a migf state (PRE_COPY_ERROR for a bad-resource-state
 * error during pre-copy, otherwise ERROR) and wake pollers.
 */
498 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
500	struct mlx5vf_async_data *async_data = container_of(_work,
501		struct mlx5vf_async_data, work);
502	struct mlx5_vf_migration_file *migf = container_of(async_data,
503		struct mlx5_vf_migration_file, async_data);
505	mutex_lock(&migf->lock);
506	if (async_data->status) {
507		mlx5vf_put_data_buffer(async_data->buf);
508		if (async_data->header_buf)
509			mlx5vf_put_data_buffer(async_data->header_buf);
510		if (!async_data->stop_copy_chunk &&
511		    async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
512			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
514			migf->state = MLX5_MIGF_STATE_ERROR;
515		wake_up_interruptible(&migf->poll_wait);
517	mutex_unlock(&migf->lock);
518	mlx5vf_save_callback_complete(migf, async_data);
/*
 * Write a migration stream header (record size, mandatory-tag flags,
 * FW_DATA tag) into the first page of @header_buf and queue it on
 * migf->buf_list at the current max_pos. During initial pre-copy the
 * header bytes are accounted in pre_copy_initial_bytes.
 */
521 static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
522			  size_t image_size, bool initial_pre_copy)
524	struct mlx5_vf_migration_file *migf = header_buf->migf;
525	struct mlx5_vf_migration_header header = {};
530	header.record_size = cpu_to_le64(image_size);
531	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
532	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
533	page = mlx5vf_get_migration_page(header_buf, 0);
	/* Copy the header through a temporary kernel mapping of the page */
536	to_buff = kmap_local_page(page);
537	memcpy(to_buff, &header, sizeof(header));
538	kunmap_local(to_buff);
539	header_buf->length = sizeof(header);
540	header_buf->start_pos = header_buf->migf->max_pos;
541	migf->max_pos += header_buf->length;
542	spin_lock_irqsave(&migf->list_lock, flags);
543	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
544	spin_unlock_irqrestore(&migf->list_lock, flags);
545	if (initial_pre_copy)
546		migf->pre_copy_initial_bytes += sizeof(header);
/*
 * Completion callback of the async SAVE_VHCA_STATE command. On success:
 * read the produced image size, publish header+data buffers on
 * migf->buf_list, advance chunk accounting, update migf->state
 * (PRE_COPY or COMPLETE) and wake pollers; in chunk mode it may also
 * kick off the next chunk's SAVE. On failure it defers to
 * mlx5vf_mig_file_cleanup_cb via the workqueue, since the error flow
 * cannot run from interrupt context.
 */
550 static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
552	struct mlx5vf_async_data *async_data = container_of(context,
553			struct mlx5vf_async_data, cb_work);
554	struct mlx5_vf_migration_file *migf = container_of(async_data,
555			struct mlx5_vf_migration_file, async_data);
558		size_t next_required_umem_size = 0;
559		bool stop_copy_last_chunk;
	/* First pre-copy iteration: not yet in PRE_COPY and not stop-copy */
562		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
563				!async_data->stop_copy_chunk;
565		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
567		if (async_data->buf->stop_copy_chunk_num)
568			next_required_umem_size = MLX5_GET(save_vhca_state_out,
569					async_data->out, next_required_umem_size);
570		stop_copy_last_chunk = async_data->stop_copy_chunk &&
571				!next_required_umem_size;
572		if (async_data->header_buf) {
573			status = add_buf_header(async_data->header_buf, image_size,
578		async_data->buf->length = image_size;
579		async_data->buf->start_pos = migf->max_pos;
580		migf->max_pos += async_data->buf->length;
581		spin_lock_irqsave(&migf->list_lock, flags);
582		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
583		if (async_data->buf->stop_copy_chunk_num) {
584			migf->num_ready_chunks++;
585			if (next_required_umem_size &&
586			    migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
587				/* Delay the next SAVE till one chunk be consumed */
588				migf->next_required_umem_size = next_required_umem_size;
589				next_required_umem_size = 0;
592		spin_unlock_irqrestore(&migf->list_lock, flags);
593		if (initial_pre_copy) {
594			migf->pre_copy_initial_bytes += image_size;
595			migf->state = MLX5_MIGF_STATE_PRE_COPY;
597		if (stop_copy_last_chunk)
598			migf->state = MLX5_MIGF_STATE_COMPLETE;
599		wake_up_interruptible(&migf->poll_wait);
600		if (next_required_umem_size)
601			mlx5vf_mig_file_set_save_work(migf,
602						/* Picking up the next chunk num */
603						(async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1,
604						next_required_umem_size);
605		mlx5vf_save_callback_complete(migf, async_data);
610	/* The error flow can't run from an interrupt context */
	/* -EREMOTEIO means the command completed with a device status code */
611	if (status == -EREMOTEIO)
612		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
613	async_data->status = status;
614	queue_work(migf->mvdev->cb_wq, &async_data->work);
/*
 * Launch an asynchronous SAVE_VHCA_STATE command targeting @buf's mkey.
 * @inc requests an incremental save, @track keeps dirty tracking on
 * (pre-copy). Serializes on migf->save_comp; prepares the optional
 * stream header buffer (reused per chunk when PRE_COPY is supported),
 * takes a file reference for the async callback and submits via
 * mlx5_cmd_exec_cb() with mlx5vf_save_callback as completion handler.
 * Caller must hold mvdev->state_mutex.
 */
617 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
618			       struct mlx5_vf_migration_file *migf,
619			       struct mlx5_vhca_data_buffer *buf, bool inc,
622	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
623	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
624	struct mlx5_vhca_data_buffer *header_buf = NULL;
625	struct mlx5vf_async_data *async_data;
628	lockdep_assert_held(&mvdev->state_mutex);
629	if (mvdev->mdev_detach)
	/* Wait until no other save is in flight */
632	err = wait_for_completion_interruptible(&migf->save_comp);
636	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
638		 * In case we had a PRE_COPY error, SAVE is triggered only for
639		 * the final image, read device full image.
643	MLX5_SET(save_vhca_state_in, in, opcode,
644		 MLX5_CMD_OP_SAVE_VHCA_STATE);
645	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
646	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
647	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
648	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
649	MLX5_SET(save_vhca_state_in, in, incremental, inc);
650	MLX5_SET(save_vhca_state_in, in, set_track, track);
652	async_data = &migf->async_data;
653	async_data->buf = buf;
654	async_data->stop_copy_chunk = !track;
655	async_data->out = kvzalloc(out_size, GFP_KERNEL);
656	if (!async_data->out) {
661	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
662		if (async_data->stop_copy_chunk) {
			/* Reuse the pre-allocated per-chunk header buffer */
663			u8 header_idx = buf->stop_copy_chunk_num ?
664				buf->stop_copy_chunk_num - 1 : 0;
666			header_buf = migf->buf_header[header_idx];
667			migf->buf_header[header_idx] = NULL;
671			header_buf = mlx5vf_get_data_buffer(migf,
672				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
673			if (IS_ERR(header_buf)) {
674				err = PTR_ERR(header_buf);
680	if (async_data->stop_copy_chunk)
681		migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;
683	async_data->header_buf = header_buf;
	/* Hold the migf file while the async command is outstanding */
684	get_file(migf->filp);
685	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
687			       out_size, mlx5vf_save_callback,
688			       &async_data->cb_work);
	/* Error unwinding: recycle header buffer, free out buffer, re-arm */
696		mlx5vf_put_data_buffer(header_buf);
699	kvfree(async_data->out);
701	complete(&migf->save_comp);
/*
 * Issue LOAD_VHCA_STATE: restore device state from @buf (its mkey /
 * length), DMA-mapping the buffer first if needed. Synchronous command.
 * Caller must hold mvdev->state_mutex.
 */
705 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
706			       struct mlx5_vf_migration_file *migf,
707			       struct mlx5_vhca_data_buffer *buf)
709	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
710	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
713	lockdep_assert_held(&mvdev->state_mutex);
714	if (mvdev->mdev_detach)
718		err = mlx5vf_dma_data_buffer(buf);
723	MLX5_SET(load_vhca_state_in, in, opcode,
724		 MLX5_CMD_OP_LOAD_VHCA_STATE);
725	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
726	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
727	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
728	MLX5_SET(load_vhca_state_in, in, size, buf->length);
729	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
/*
 * Allocate a protection domain (stored in migf->pdn) for this migration
 * file's mkeys. Caller must hold the state_mutex; fails if detached.
 */
732 int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
736	lockdep_assert_held(&migf->mvdev->state_mutex);
737	if (migf->mvdev->mdev_detach)
740	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
/*
 * Release the protection domain allocated by mlx5vf_cmd_alloc_pd().
 * Skipped when the mdev has been detached.
 */
744 void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
746	lockdep_assert_held(&migf->mvdev->state_mutex);
747	if (migf->mvdev->mdev_detach)
750	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
/*
 * Free all buffers attached to a migration file: the per-chunk data and
 * header buffers, then everything on avail_list/buf_list, and finally
 * the protection domain. Caller must hold the state_mutex.
 */
753 void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
755	struct mlx5_vhca_data_buffer *entry;
758	lockdep_assert_held(&migf->mvdev->state_mutex);
759	WARN_ON(migf->mvdev->mdev_detach);
761	for (i = 0; i < MAX_NUM_CHUNKS; i++) {
763			mlx5vf_free_data_buffer(migf->buf[i]);
767		if (migf->buf_header[i]) {
768			mlx5vf_free_data_buffer(migf->buf_header[i]);
769			migf->buf_header[i] = NULL;
	/* Merge cached buffers into buf_list and drain everything */
773	list_splice(&migf->avail_list, &migf->buf_list);
775	while ((entry = list_first_entry_or_null(&migf->buf_list,
776				struct mlx5_vhca_data_buffer, buf_elm))) {
777		list_del(&entry->buf_elm);
778		mlx5vf_free_data_buffer(entry);
781	mlx5vf_cmd_dealloc_pd(migf);
/*
 * Create the firmware dirty-page tracking object. The caller-supplied
 * interval tree of IOVA ranges is combined down to the device's
 * pg_track_max_num_range limit, each range is encoded into the command,
 * and the total tracked address-space size is validated against the
 * device's min/max log_addr_space caps. On success the new object id is
 * stored in tracker->id.
 */
784 static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
785				 struct mlx5vf_pci_core_device *mvdev,
786				 struct rb_root_cached *ranges, u32 nnodes)
789		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
790	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
791	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
792	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
793	struct interval_tree_node *node = NULL;
794	u64 total_ranges_len = 0;
795	u32 num_ranges = nnodes;
796	u8 log_addr_space_size;
797	void *range_list_ptr;
	/* Too many ranges for the device - let VFIO merge adjacent ones */
805	if (num_ranges > max_num_range) {
806		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
807		num_ranges = max_num_range;
810	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
811				 record_size * num_ranges;
812	in = kzalloc(inlen, GFP_KERNEL);
816	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
817			       general_obj_in_cmd_hdr);
818	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
819		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
820	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
821		 MLX5_OBJ_TYPE_PAGE_TRACK);
822	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
823	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
824	MLX5_SET(page_track, obj_context, track_type, 1);
825	MLX5_SET(page_track, obj_context, log_page_size,
826		 ilog2(tracker->host_qp->tracked_page_size));
827	MLX5_SET(page_track, obj_context, log_msg_size,
828		 ilog2(tracker->host_qp->max_msg_size));
829	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
830	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);
832	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
833	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
834	for (i = 0; i < num_ranges; i++) {
835		void *addr_range_i_base = range_list_ptr + record_size * i;
836		unsigned long length = node->last - node->start + 1;
838		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
840		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
841		total_ranges_len += length;
842		node = interval_tree_iter_next(node, 0, ULONG_MAX);
846	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
	/* Tracked address space must fit the device's advertised limits */
847	if (log_addr_space_size <
848	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
849	    log_addr_space_size >
850	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
855	MLX5_SET(page_track, obj_context, log_addr_space_size,
856		 log_addr_space_size);
857	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
861	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
/* Destroy the page-tracking general object identified by @tracker_id. */
867 static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
870	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
871	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
873	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
874	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
875	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
877	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
/*
 * Modify a page-tracking object: set its state (@tracker_state) and the
 * range of interest [@iova, @iova + @length). modify_field_select 0x3
 * selects both fields being updated.
 */
880 static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
881				     u32 tracker_id, unsigned long iova,
882				     unsigned long length, u32 tracker_state)
884	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
885	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
889	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
890	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
891	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
892	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);
894	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
895	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
896	MLX5_SET64(page_track, obj_context, range_start_address, iova);
897	MLX5_SET64(page_track, obj_context, length, length);
898	MLX5_SET(page_track, obj_context, state, tracker_state);
900	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
/*
 * Allocate the fragmented buffer backing a CQ of @nent entries of
 * @cqe_size bytes on the device's NUMA node, and initialize the
 * frag-buffer control (fbc). 128-byte CQEs double the WQ stride
 * (log 6 -> 7).
 */
903 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
904			     struct mlx5_vhca_cq_buf *buf, int nent,
907	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
908	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
909	u8 log_wq_sz = ilog2(cqe_size);
912	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
913				       mdev->priv.numa_node);
917	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
918	buf->cqe_size = cqe_size;
/*
 * Mark every CQE in the buffer as invalid (hardware-owned) so that
 * polling sees no spurious completions. For 128-byte CQEs the 64-byte
 * CQE portion lives in the second half of the entry, hence 'cqe + 64'.
 */
923 static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
925	struct mlx5_cqe64 *cqe64;
929	for (i = 0; i < buf->nent; i++) {
930		cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
931		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
932		cqe64->op_own = MLX5_CQE_INVALID << 4;
/* Tear down a tracker CQ: core CQ object, frag buffer and doorbell. */
936 static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
937			      struct mlx5_vhca_cq *cq)
939	mlx5_core_destroy_cq(mdev, &cq->mcq);
940	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
941	mlx5_db_free(mdev, &cq->db);
/* CQ async event handler: a CQ error marks the page tracker as failed. */
944 static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
946	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
949	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
/*
 * mlx5 event notifier for the page tracker: on a WQ catastrophic/
 * access/invalid-request error affecting either the host or firmware
 * tracker QP, flag the tracker as failed. Other events are ignored.
 */
953 static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
956	struct mlx5_vhca_page_tracker *tracker =
957		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
958	struct mlx5vf_pci_core_device *mvdev = container_of(
959		tracker, struct mlx5vf_pci_core_device, tracker);
960	struct mlx5_eqe *eqe = data;
961	u8 event_type = (u8)type;
965	switch (event_type) {
966	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
967	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
968	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
969		queue_type = eqe->data.qp_srq.type;
970		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
		/* QP number is the low 24 bits of qp_srq_n */
972		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
973		if (qp_num != tracker->host_qp->qpn &&
974		    qp_num != tracker->fw_qp->qpn)
976		set_tracker_error(mvdev);
/* CQ completion handler: wake the tracker waiter (tracker_comp). */
985 static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
986			       struct mlx5_eqe *eqe)
988	struct mlx5vf_pci_core_device *mvdev =
989		container_of(mcq, struct mlx5vf_pci_core_device,
992	complete(&mvdev->tracker_comp);
/*
 * Create the completion queue used by the page tracker: allocate
 * doorbell and CQE buffer, pick an EQ on the current CPU's completion
 * vector, fill the CQ context (size, EQN, UAR, page size, doorbell
 * address, PAS array), install completion/event handlers and arm the CQ.
 * CQE size follows the cache line size (64 or 128 bytes).
 */
995 static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
996			    struct mlx5_vhca_page_tracker *tracker,
999	int cqe_size = cache_line_size() == 128 ? 128 : 64;
1000	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
1001	struct mlx5_vhca_cq *cq;
1002	int inlen, err, eqn;
1008	ncqe = roundup_pow_of_two(ncqe);
1009	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
1014	cq->mcq.set_ci_db = cq->db.db;
1015	cq->mcq.arm_db = cq->db.db + 1;
1016	cq->mcq.cqe_sz = cqe_size;
1017	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
1021	init_cq_frag_buf(&cq->buf);
1022	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
1023		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
1024		cq->buf.frag_buf.npages;
1025	in = kvzalloc(inlen, GFP_KERNEL);
	/* Spread CQs across completion vectors by current CPU */
1031	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
1032	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
1036	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
1037	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
1038	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
1039	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
1040	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
1041		 MLX5_ADAPTER_PAGE_SHIFT);
1042	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
1043	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
1044	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
1045	cq->mcq.comp = mlx5vf_cq_complete;
1046	cq->mcq.event = mlx5vf_cq_event;
1047	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	/* Request notification for the next completion */
1051	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1052		    cq->mcq.cons_index);
	/* Error unwinding: free frag buffer then doorbell */
1059	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
1061	mlx5_db_free(mdev, &cq->db);
/*
 * Create an RC QP for the page tracker. When @max_recv_wr is non-zero a
 * receive queue is allocated (wqe count rounded up to a power of two,
 * MLX5_NON_ZERO_RQ); otherwise a zero-length RQ is used (the firmware
 * side QP). The QP has no send queue (no_sq). Returns the new QP or an
 * ERR_PTR; buffers/doorbell are released on failure.
 */
1065 static struct mlx5_vhca_qp *
1066 mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
1067		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
1069	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
1070	struct mlx5_vhca_qp *qp;
1078	qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
1080		return ERR_PTR(-ENOMEM);
1082	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
1087		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
1088		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
1089		log_rq_sz = ilog2(qp->rq.wqe_cnt);
1090		err = mlx5_frag_buf_alloc_node(mdev,
1091			wq_get_byte_sz(log_rq_sz, log_rq_stride),
1092			&qp->buf, mdev->priv.numa_node);
1095		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
1098	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
1099	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
1100		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
1102	in = kvzalloc(inlen, GFP_KERNEL);
1108	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
1109	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
1110	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
1111	MLX5_SET(qpc, qpc, pd, tracker->pdn);
1112	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
1113	MLX5_SET(qpc, qpc, log_page_size,
1114		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
1115	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
1116	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
1117		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	/* Receive-only QP - no send queue */
1118	MLX5_SET(qpc, qpc, no_sq, 1);
1120		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
1121		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
1122		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
1123		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
1124		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
1125		mlx5_fill_page_frag_array(&qp->buf,
1126					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
1129		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
1132	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
1133	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
1138	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	/* Error unwinding: frag buffer, then doorbell, then the QP struct */
1143	mlx5_frag_buf_free(mdev, &qp->buf);
1145	mlx5_db_free(mdev, &qp->db);
1148	return ERR_PTR(err);
/*
 * Post one receive WQE on the tracker QP pointing at the shared recv
 * buffer (mkey + next_rq_offset), then ring the receive doorbell.
 * WARNs if the RQ is already full (producer - consumer >= wqe_cnt).
 */
1151 static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
1153	struct mlx5_wqe_data_seg *data;
1156	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
1157	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
1158	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
1159	data->byte_count = cpu_to_be32(qp->max_msg_size);
1160	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
1161	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
1163	/* Make sure that descriptors are written before doorbell record. */
1165	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
/*
 * Drive a tracker QP through the RST -> INIT -> RTR -> RTS state
 * transitions, connecting it to @remote_qpn (loopback pair with fl=1,
 * no address vector). The host QP additionally pre-posts its whole
 * receive queue before leaving INIT.
 */
1168 static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
1169			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
1172	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
1173	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
1174	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
	/* 2RST -> INIT */
1179	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
1180	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1181	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
1182	MLX5_SET(qpc, qpc, rre, 1);
1183	MLX5_SET(qpc, qpc, rwe, 1);
1184	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
1185	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
1186	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
1191		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
	/* Fill the whole RQ with receive WQEs before RTR */
1194		for (i = 0; i < qp->rq.wqe_cnt; i++) {
1195			mlx5vf_post_recv(qp);
1196			recv_buf->next_rq_offset += qp->max_msg_size;
	/* INIT -> RTR */
1201	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
1202	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1203	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
1204	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
1205	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
1206	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1207	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
1208	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
1209	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
1210	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1211	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
	/* RTR -> RTS */
1216	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
1217	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1218	MLX5_SET(qpc, qpc, retry_count, 7);
1219	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
1220	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
1221	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
1222	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1224	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
/*
 * mlx5vf_destroy_qp() - destroy the firmware QP object and release its
 * host-side resources (work-queue fragment buffer and doorbell record).
 * The command status is intentionally ignored: this is teardown.
 */
1227 static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
1228 struct mlx5_vhca_qp *qp)
1230 u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
1232 MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
1233 MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
1234 mlx5_cmd_exec_in(mdev, destroy_qp, in);
/* Free host memory only after the firmware object is gone. */
1236 mlx5_frag_buf_free(mdev, &qp->buf);
1237 mlx5_db_free(mdev, &qp->db);
/*
 * free_recv_pages() - release every page of the receive buffer and the
 * page-pointer array itself.  Mirror of alloc_recv_pages().
 */
1241 static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
1245 /* Undo alloc_pages_bulk_array() */
1246 for (i = 0; i < recv_buf->npages; i++)
1247 __free_page(recv_buf->page_list[i])
1249 kvfree(recv_buf->page_list);
/*
 * alloc_recv_pages() - allocate @npages individual pages for the tracker's
 * receive buffer, tracked through recv_buf->page_list.
 *
 * Pages are obtained with the bulk allocator; NOTE(review): the retry loop
 * that re-invokes alloc_pages_bulk_array() until all @npages are filled is
 * elided from this excerpt.  On success recv_buf->npages is recorded for
 * the later free/unmap paths.  On failure any pages already allocated are
 * returned and the list is freed.
 */
1252 static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
1253 unsigned int npages)
1255 unsigned int filled = 0, done = 0;
/* GFP_KERNEL_ACCOUNT: charge this allocation to the caller's memcg. */
1258 recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
1259 GFP_KERNEL_ACCOUNT);
1260 if (!recv_buf->page_list)
1264 filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
1266 recv_buf->page_list + done);
1275 recv_buf->npages = npages;
/* Error path: free whatever the bulk allocator did hand out. */
1279 for (i = 0; i < npages; i++) {
1280 if (recv_buf->page_list[i])
1281 __free_page(recv_buf->page_list[i]);
1284 kvfree(recv_buf->page_list);
/*
 * register_dma_recv_pages() - DMA-map every receive page for device->host
 * transfers and record the bus addresses in recv_buf->dma_addrs.
 *
 * On a mapping failure all previously mapped pages are unmapped and the
 * address array is freed.
 * NOTE(review): pages are mapped with dma_map_page() but unwound with
 * dma_unmap_single(); the offset/size (0, PAGE_SIZE) make them equivalent
 * here, but dma_unmap_page() would be the matching API — confirm upstream.
 */
1288 static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
1289 struct mlx5_vhca_recv_buf *recv_buf)
1293 recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
1294 sizeof(*recv_buf->dma_addrs),
1295 GFP_KERNEL_ACCOUNT);
1296 if (!recv_buf->dma_addrs)
1299 for (i = 0; i < recv_buf->npages; i++) {
1300 recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
1301 recv_buf->page_list[i],
1304 if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
/* Unwind: unmap only the pages mapped before the failure at index i. */
1310 for (j = 0; j < i; j++)
1311 dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
1312 PAGE_SIZE, DMA_FROM_DEVICE);
1314 kvfree(recv_buf->dma_addrs);
/*
 * unregister_dma_recv_pages() - unmap all receive-buffer pages and free
 * the DMA address array.  Mirror of register_dma_recv_pages().
 */
1318 static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
1319 struct mlx5_vhca_recv_buf *recv_buf)
1323 for (i = 0; i < recv_buf->npages; i++)
1324 dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
1325 PAGE_SIZE, DMA_FROM_DEVICE);
1327 kvfree(recv_buf->dma_addrs);
/*
 * mlx5vf_free_qp_recv_resources() - tear down the QP's receive buffer in
 * reverse order of mlx5vf_alloc_qp_recv_resources(): destroy the mkey,
 * unmap the DMA addresses, then free the pages.
 */
1330 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
1331 struct mlx5_vhca_qp *qp)
1333 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1335 mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
1336 unregister_dma_recv_pages(mdev, recv_buf);
1337 free_recv_pages(&qp->recv_buf);
/*
 * mlx5vf_alloc_qp_recv_resources() - build the receive side of a tracker
 * QP: allocate enough pages to cover @rq_size bytes, DMA-map them, and
 * create an mkey over the buffer under protection domain @pdn.
 *
 * Returns 0 on success; on error, unwinds in reverse order via the
 * labelled cleanup path (labels elided from this excerpt).
 */
1340 static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
1341 struct mlx5_vhca_qp *qp, u32 pdn,
1344 unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
1345 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1348 err = alloc_recv_pages(recv_buf, npages);
1352 err = register_dma_recv_pages(mdev, recv_buf);
1356 err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
1358 goto err_create_mkey;
/* Error unwind: undo DMA mapping, then page allocation. */
1363 unregister_dma_recv_pages(mdev, recv_buf);
1365 free_recv_pages(recv_buf);
/*
 * _mlx5vf_free_page_tracker_resources() - release everything created by
 * mlx5vf_start_page_tracker(), in reverse creation order: event notifier,
 * firmware tracker object, FW-side QP, host QP recv buffer + QP, CQ, PD,
 * and the UAR page.  No-op when tracking is not active.
 *
 * Caller must hold mvdev->state_mutex; the device must still be attached
 * (WARN otherwise), since firmware commands are issued here.
 */
1370 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
1372 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1373 struct mlx5_core_dev *mdev = mvdev->mdev;
1375 lockdep_assert_held(&mvdev->state_mutex);
1377 if (!mvdev->log_active)
1380 WARN_ON(mvdev->mdev_detach);
1382 mlx5_eq_notifier_unregister(mdev, &tracker->nb);
1383 mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
1384 mlx5vf_destroy_qp(mdev, tracker->fw_qp);
1385 mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
1386 mlx5vf_destroy_qp(mdev, tracker->host_qp);
1387 mlx5vf_destroy_cq(mdev, &tracker->cq);
1388 mlx5_core_dealloc_pd(mdev, tracker->pdn);
1389 mlx5_put_uars_page(mdev, tracker->uar);
1390 mvdev->log_active = false;
/*
 * mlx5vf_stop_page_tracker() - VFIO entry point to stop dirty-page
 * tracking.  Takes state_mutex, frees tracker resources if tracking is
 * active, and releases the lock through mlx5vf_state_mutex_unlock() (which
 * also handles deferred reset work).
 */
1393 int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1395 struct mlx5vf_pci_core_device *mvdev = container_of(
1396 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1398 mutex_lock(&mvdev->state_mutex);
1399 if (!mvdev->log_active)
1402 _mlx5vf_free_page_tracker_resources(mvdev);
1403 mvdev->log_active = false;
1405 mlx5vf_state_mutex_unlock(mvdev);
/*
 * mlx5vf_start_page_tracker() - VFIO entry point to start dirty-page
 * tracking over the IOVA @ranges.
 *
 * Builds the full tracking pipeline: UAR page, PD, CQ sized for the RQ,
 * a host-side RC QP (with receive buffer + mkey) and a firmware-side RC
 * QP, activates both against each other, then creates the firmware
 * tracker object and registers the event notifier.
 *
 * *page_size is negotiated: the requested size is clamped to the device's
 * [pg_track_log_min_page_size, pg_track_log_max_page_size] capability
 * range and written back to the caller.
 *
 * Returns 0 on success; on failure unwinds through the labelled cleanup
 * chain (labels elided from this excerpt) and returns a negative errno.
 */
1409 int mlx5vf_start_page_tracker(struct vfio_device *vdev,
1410 struct rb_root_cached *ranges, u32 nnodes,
1413 struct mlx5vf_pci_core_device *mvdev = container_of(
1414 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1415 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1416 u8 log_tracked_page = ilog2(*page_size);
1417 struct mlx5_vhca_qp *host_qp;
1418 struct mlx5_vhca_qp *fw_qp;
1419 struct mlx5_core_dev *mdev;
1420 u32 max_msg_size = PAGE_SIZE;
1421 u64 rq_size = SZ_2M;
/* Reject a detached device or double-start under state_mutex. */
1425 mutex_lock(&mvdev->state_mutex);
1426 if (mvdev->mdev_detach) {
1431 if (mvdev->log_active) {
1437 memset(tracker, 0, sizeof(*tracker));
1438 tracker->uar = mlx5_get_uars_page(mdev);
1439 if (IS_ERR(tracker->uar)) {
1440 err = PTR_ERR(tracker->uar);
1444 err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
/* One receive WQE per max_msg_size chunk of the 2MB receive queue. */
1448 max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
1449 err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
1451 goto err_dealloc_pd;
1453 host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
1454 if (IS_ERR(host_qp)) {
1455 err = PTR_ERR(host_qp);
1459 host_qp->max_msg_size = max_msg_size;
/* Clamp the requested tracking page size to device capabilities. */
1460 if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1461 pg_track_log_min_page_size)) {
1462 log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1463 pg_track_log_min_page_size);
1464 } else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1465 pg_track_log_max_page_size)) {
1466 log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1467 pg_track_log_max_page_size);
1470 host_qp->tracked_page_size = (1ULL << log_tracked_page);
1471 err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
/* FW-side QP has no receive queue of its own (0 recv WRs). */
1476 fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
1477 if (IS_ERR(fw_qp)) {
1478 err = PTR_ERR(fw_qp);
1479 goto err_recv_resources;
/* Cross-connect the two QPs: host posts recvs, FW side does not. */
1482 err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
1486 err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
1490 tracker->host_qp = host_qp;
1491 tracker->fw_qp = fw_qp;
1492 err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
1496 MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
1497 mlx5_eq_notifier_register(mdev, &tracker->nb);
/* Report the negotiated page size back to the caller. */
1498 *page_size = host_qp->tracked_page_size;
1499 mvdev->log_active = true;
1500 mlx5vf_state_mutex_unlock(mvdev);
/* Error unwind, reverse creation order (labels elided from excerpt). */
1504 mlx5vf_destroy_qp(mdev, fw_qp);
1506 mlx5vf_free_qp_recv_resources(mdev, host_qp);
1508 mlx5vf_destroy_qp(mdev, host_qp);
1510 mlx5vf_destroy_cq(mdev, &tracker->cq);
1512 mlx5_core_dealloc_pd(mdev, tracker->pdn);
1514 mlx5_put_uars_page(mdev, tracker->uar);
1516 mlx5vf_state_mutex_unlock(mvdev);
/*
 * set_report_output() - decode one received dirty-page report message and
 * mark the reported addresses in the caller's IOVA bitmap.
 *
 * @size:  byte count of the message (from the CQE); divided into
 *         page_track_report_entry records.
 * @index: receive-buffer page holding the message.
 *
 * Bails out (with a WARN) on an out-of-range page index or a message
 * larger than a single receive WQE can carry — either would indicate a
 * firmware/driver inconsistency.
 * NOTE(review): the matching kunmap_local() is elided from this excerpt.
 */
1521 set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
1522 struct iova_bitmap *dirty)
1524 u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
1525 u32 nent = size / entry_size;
1531 if (WARN_ON(index >= qp->recv_buf.npages ||
1532 (nent > qp->max_msg_size / entry_size)))
1535 page = qp->recv_buf.page_list[index];
1536 buf = kmap_local_page(page);
1537 for (i = 0; i < nent; i++) {
/* Each entry splits the 64-bit dirty address into low/high 32-bit
 * fields; recombine and set one tracked_page_size-sized bit.
 */
1538 addr = MLX5_GET(page_track_report_entry, buf + i,
1540 addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
1541 dirty_address_high) << 32;
1542 iova_bitmap_set(dirty, addr, qp->tracked_page_size);
/*
 * mlx5vf_rq_cqe() - handle one receive completion carrying a dirty-page
 * report.
 *
 * The tracker state is carried in the top nibble of the CQE immediate
 * data; the payload (if any) is decoded into @dirty.  A zero-length CQE
 * while still in REPORTING state is unexpected (WARN).  Finally the
 * consumed RQ slot is reposted so the firmware can keep sending.
 */
1548 mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
1549 struct iova_bitmap *dirty, int *tracker_status)
1555 *tracker_status = be32_to_cpu(cqe->immediate) >> 28;
1556 size = be32_to_cpu(cqe->byte_cnt);
/* wqe_counter identifies which RQ slot (and hence which recv page)
 * this completion belongs to; wqe_cnt is a power of two.
 */
1557 ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);
1559 /* zero length CQE, no data */
1560 WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
1562 set_report_output(size, ix, qp, dirty);
/* Repost the just-consumed slot at its fixed buffer offset. */
1564 qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
1565 mlx5vf_post_recv(qp);
/* Return a pointer to CQE slot @n within the CQ's fragmented buffer. */
1568 static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
1570 return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
/*
 * get_sw_cqe() - return CQE @n if it is owned by software, else NULL
 * (return paths elided from this excerpt).
 *
 * For 128-byte CQEs the 64-byte CQE struct lives in the second half of
 * the slot.  Ownership is valid when the CQE opcode is not INVALID and
 * the owner bit matches the parity of the consumer index's wrap count
 * (n & ncqe), the standard mlx5 CQ ownership scheme.
 */
1573 static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
1575 void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
1576 struct mlx5_cqe64 *cqe64;
1578 cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
1580 if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
1581 !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
/*
 * mlx5vf_cq_poll_one() - poll a single CQE from the tracker CQ.
 *
 * Returns CQ_OK when a completion was consumed, CQ_EMPTY when no
 * software-owned CQE is available, or CQ_POLL_ERR on an error CQE
 * (see the CQ_* enum at the top of the file; the non-visible paths
 * are elided from this excerpt).  Receive completions with immediate
 * data are dispatched to mlx5vf_rq_cqe().
 */
1589 mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
1590 struct iova_bitmap *dirty, int *tracker_status)
1592 struct mlx5_cqe64 *cqe;
1595 cqe = get_sw_cqe(cq, cq->mcq.cons_index);
1599 ++cq->mcq.cons_index;
1601 * Make sure we read CQ entry contents after we've checked the
1605 opcode = get_cqe_opcode(cqe);
1607 case MLX5_CQE_RESP_SEND_IMM:
1608 mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
1615 int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
1616 unsigned long length,
1617 struct iova_bitmap *dirty)
1619 struct mlx5vf_pci_core_device *mvdev = container_of(
1620 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1621 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1622 struct mlx5_vhca_cq *cq = &tracker->cq;
1623 struct mlx5_core_dev *mdev;
1626 mutex_lock(&mvdev->state_mutex);
1627 if (!mvdev->log_active) {
1632 if (mvdev->mdev_detach) {
1638 err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
1639 MLX5_PAGE_TRACK_STATE_REPORTING);
1643 tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
1644 while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
1646 poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
1648 if (poll_err == CQ_EMPTY) {
1649 mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1650 cq->mcq.cons_index);
1651 poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
1652 dirty, &tracker->status);
1653 if (poll_err == CQ_EMPTY) {
1654 wait_for_completion(&mvdev->tracker_comp);
1658 if (poll_err == CQ_POLL_ERR) {
1662 mlx5_cq_set_ci(&cq->mcq);
1665 if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
1666 tracker->is_err = true;
1668 if (tracker->is_err)
1671 mlx5vf_state_mutex_unlock(mvdev);