Mention branches and keyring.
[releases.git] / mlx5 / cmd.c
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5
6 #include "cmd.h"
7
/* Poll result codes for completion-queue processing (consumer not visible in this chunk). */
enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };
9
10 static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
11 {
12         int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
13         void *query_cap = NULL, *cap;
14         int ret;
15
16         query_cap = kzalloc(query_sz, GFP_KERNEL);
17         if (!query_cap)
18                 return -ENOMEM;
19
20         ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
21                                             MLX5_CAP_GENERAL_2);
22         if (ret)
23                 goto out;
24
25         cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
26         if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
27                 ret = -EOPNOTSUPP;
28 out:
29         kfree(query_cap);
30         return ret;
31 }
32
/* Forward declarations; both are defined later in this file. */
static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
                                  u16 *vhca_id);
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
37
38 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
39 {
40         struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
41         u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
42         u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
43         int err;
44
45         lockdep_assert_held(&mvdev->state_mutex);
46         if (mvdev->mdev_detach)
47                 return -ENOTCONN;
48
49         /*
50          * In case PRE_COPY is used, saving_migf is exposed while the device is
51          * running. Make sure to run only once there is no active save command.
52          * Running both in parallel, might end-up with a failure in the save
53          * command once it will try to turn on 'tracking' on a suspended device.
54          */
55         if (migf) {
56                 err = wait_for_completion_interruptible(&migf->save_comp);
57                 if (err)
58                         return err;
59         }
60
61         MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
62         MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
63         MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
64
65         err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
66         if (migf)
67                 complete(&migf->save_comp);
68
69         return err;
70 }
71
/*
 * Issue RESUME_VHCA for this device with the given @op_mod.
 *
 * Return: 0 on success, -ENOTCONN when the mdev is detached, or the
 * error from the command execution.
 */
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);

	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
}
87
/*
 * Query the device for the size of the next migration data read.
 *
 * @state_size: set to the umem size required for the next SAVE command.
 * @total_size: if non-NULL, set to the remaining total size in chunk
 *              mode, otherwise mirrors *state_size.
 * @query_flags: MLX5VF_QUERY_INC requests an incremental query;
 *               MLX5VF_QUERY_FINAL marks a query for the final image.
 *
 * Return: 0 on success, -ENOTCONN when the mdev is detached, or a
 * negative errno from the wait or the command.
 */
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
					  size_t *state_size, u64 *total_size,
					  u8 query_flags)
{
	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
	bool inc = query_flags & MLX5VF_QUERY_INC;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel, might end-up with a failure in the
	 * incremental query command on un-tracked vhca.
	 */
	if (inc) {
		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
		if (ret)
			return ret;
		if (mvdev->saving_migf->state ==
		    MLX5_MIGF_STATE_PRE_COPY_ERROR) {
			/*
			 * In case we had a PRE_COPY error, only query full
			 * image for final image
			 */
			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
				/* Report nothing pending and release the save serialization */
				*state_size = 0;
				complete(&mvdev->saving_migf->save_comp);
				return 0;
			}
			/* Fall back to a full-image query */
			query_flags &= ~MLX5VF_QUERY_INC;
		}
	}

	MLX5_SET(query_vhca_migration_state_in, in, opcode,
		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
	MLX5_SET(query_vhca_migration_state_in, in, incremental,
		 query_flags & MLX5VF_QUERY_INC);
	MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);

	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
				  out);
	/* Release the save serialization taken above before checking ret */
	if (inc)
		complete(&mvdev->saving_migf->save_comp);

	if (ret)
		return ret;

	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
			       required_umem_size);
	if (total_size)
		*total_size = mvdev->chunk_mode ?
			MLX5_GET64(query_vhca_migration_state_out, out,
				   remaining_total_size) : *state_size;

	return 0;
}
151
/*
 * Put the page tracker into its error state and wake any waiter on
 * tracker_comp so it can observe the error.
 */
static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
{
	/* Mark the tracker under an error and wake it up if it's running */
	mvdev->tracker.is_err = true;
	complete(&mvdev->tracker_comp);
}
158
159 static int mlx5fv_vf_event(struct notifier_block *nb,
160                            unsigned long event, void *data)
161 {
162         struct mlx5vf_pci_core_device *mvdev =
163                 container_of(nb, struct mlx5vf_pci_core_device, nb);
164
165         switch (event) {
166         case MLX5_PF_NOTIFY_ENABLE_VF:
167                 mutex_lock(&mvdev->state_mutex);
168                 mvdev->mdev_detach = false;
169                 mlx5vf_state_mutex_unlock(mvdev);
170                 break;
171         case MLX5_PF_NOTIFY_DISABLE_VF:
172                 mlx5vf_cmd_close_migratable(mvdev);
173                 mutex_lock(&mvdev->state_mutex);
174                 mvdev->mdev_detach = true;
175                 mlx5vf_state_mutex_unlock(mvdev);
176                 break;
177         default:
178                 break;
179         }
180
181         return 0;
182 }
183
/*
 * Shut down migration activity on this device: flag the page tracker as
 * errored (so it stops), then tear down the migration file descriptors
 * and tracker resources under the state mutex.  No-op when the device
 * never gained migration support.
 */
void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	/* Must be done outside the lock to let it progress */
	set_tracker_error(mvdev);
	mutex_lock(&mvdev->state_mutex);
	mlx5vf_disable_fds(mvdev);
	_mlx5vf_free_page_tracker_resources(mvdev);
	mlx5vf_state_mutex_unlock(mvdev);
}
196
/*
 * Undo mlx5vf_cmd_set_migratable(): unregister the SR-IOV notifier and
 * destroy the callback workqueue.  No-op when migration support was
 * never enabled.
 */
void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
						&mvdev->nb);
	destroy_workqueue(mvdev->cb_wq);
}
206
/*
 * Probe this VF for mlx5 live-migration support and, when available,
 * initialize migration state and wire up the vfio migration ops (and
 * log ops if dirty tracking is supported).  Any failure leaves the
 * device without migration support; no error is reported to the caller.
 */
void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
			       const struct vfio_migration_ops *mig_ops,
			       const struct vfio_log_ops *log_ops)
{
	struct pci_dev *pdev = mvdev->core_device.pdev;
	int ret;

	/* Migration only applies to virtual functions */
	if (!pdev->is_virtfn)
		return;

	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
	if (!mvdev->mdev)
		return;

	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
		goto end;

	mvdev->vf_id = pci_iov_vf_id(pdev);
	if (mvdev->vf_id < 0)
		goto end;

	/* NOTE(review): function id passed is vf_id + 1 — presumably the
	 * other-function id space is offset by one from vf_id; confirm
	 * against mlx5 other_function conventions.
	 */
	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
	if (ret)
		goto end;

	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
				   &mvdev->vhca_id))
		goto end;

	/* Ordered workqueue used for save-command cleanup callbacks */
	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
	if (!mvdev->cb_wq)
		goto end;

	mutex_init(&mvdev->state_mutex);
	spin_lock_init(&mvdev->reset_lock);
	mvdev->nb.notifier_call = mlx5fv_vf_event;
	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
						    &mvdev->nb);
	if (ret) {
		destroy_workqueue(mvdev->cb_wq);
		goto end;
	}

	mvdev->migrate_cap = 1;
	mvdev->core_device.vdev.migration_flags =
		VFIO_MIGRATION_STOP_COPY |
		VFIO_MIGRATION_P2P;
	mvdev->core_device.vdev.mig_ops = mig_ops;
	init_completion(&mvdev->tracker_comp);
	/* Dirty-page tracking (log ops) requires adv_virtualization cap */
	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
		mvdev->core_device.vdev.log_ops = log_ops;

	/* PRE_COPY requires both multi-load and tracking-state caps */
	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
	    MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
		mvdev->core_device.vdev.migration_flags |=
			VFIO_MIGRATION_PRE_COPY;

	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
		mvdev->chunk_mode = 1;

end:
	mlx5_vf_put_core_dev(mvdev->mdev);
}
270
271 static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
272                                   u16 *vhca_id)
273 {
274         u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
275         int out_size;
276         void *out;
277         int ret;
278
279         out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
280         out = kzalloc(out_size, GFP_KERNEL);
281         if (!out)
282                 return -ENOMEM;
283
284         MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
285         MLX5_SET(query_hca_cap_in, in, other_function, 1);
286         MLX5_SET(query_hca_cap_in, in, function_id, function_id);
287         MLX5_SET(query_hca_cap_in, in, op_mod,
288                  MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
289                  HCA_CAP_OPMOD_GET_CUR);
290
291         ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
292         if (ret)
293                 goto err_exec;
294
295         *vhca_id = MLX5_GET(query_hca_cap_out, out,
296                             capability.cmd_hca_cap.vhca_id);
297
298 err_exec:
299         kfree(out);
300         return ret;
301 }
302
/*
 * Build and execute a CREATE_MKEY command covering either a migration
 * data buffer (@buf, via its DMA-mapped sgtable) or a page-tracker
 * receive buffer (@recv_buf, via its dma_addrs array).  Exactly one is
 * used: @buf when non-NULL, otherwise @recv_buf.  The resulting mkey is
 * returned through @mkey.
 */
static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
			struct mlx5_vhca_data_buffer *buf,
			struct mlx5_vhca_recv_buf *recv_buf,
			u32 *mkey)
{
	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
				recv_buf->npages;
	int err = 0, inlen;
	__be64 *mtt;
	void *mkc;
	u32 *in;

	/* MTT translation entries are sized in pairs (octwords) */
	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
		sizeof(*mtt) * round_up(npages, 2);

	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
		 DIV_ROUND_UP(npages, 2));
	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);

	if (buf) {
		struct sg_dma_page_iter dma_iter;

		/* Fill the MTT from the buffer's DMA-mapped page addresses */
		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
	} else {
		int i;

		for (i = 0; i < npages; i++)
			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	/* Enable local and remote read/write access */
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
	kvfree(in);
	return err;
}
354
355 static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
356 {
357         struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
358         struct mlx5_core_dev *mdev = mvdev->mdev;
359         int ret;
360
361         lockdep_assert_held(&mvdev->state_mutex);
362         if (mvdev->mdev_detach)
363                 return -ENOTCONN;
364
365         if (buf->dmaed || !buf->allocated_length)
366                 return -EINVAL;
367
368         ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
369         if (ret)
370                 return ret;
371
372         ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
373         if (ret)
374                 goto err;
375
376         buf->dmaed = true;
377
378         return 0;
379 err:
380         dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
381         return ret;
382 }
383
/*
 * Fully release a data buffer: destroy its mkey and DMA mapping (if it
 * was mapped), free every page backing its sg append table, release the
 * table itself, and free the buffer structure.
 */
void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5_vf_migration_file *migf = buf->migf;
	struct sg_page_iter sg_iter;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	if (buf->dmaed) {
		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
				  buf->dma_dir, 0);
	}

	/* Undo alloc_pages_bulk_array() */
	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
		__free_page(sg_page_iter_page(&sg_iter));
	sg_free_append_table(&buf->table);
	kfree(buf);
}
404
405 struct mlx5_vhca_data_buffer *
406 mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
407                          size_t length,
408                          enum dma_data_direction dma_dir)
409 {
410         struct mlx5_vhca_data_buffer *buf;
411         int ret;
412
413         buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
414         if (!buf)
415                 return ERR_PTR(-ENOMEM);
416
417         buf->dma_dir = dma_dir;
418         buf->migf = migf;
419         if (length) {
420                 ret = mlx5vf_add_migration_pages(buf,
421                                 DIV_ROUND_UP_ULL(length, PAGE_SIZE));
422                 if (ret)
423                         goto end;
424
425                 if (dma_dir != DMA_NONE) {
426                         ret = mlx5vf_dma_data_buffer(buf);
427                         if (ret)
428                                 goto end;
429                 }
430         }
431
432         return buf;
433 end:
434         mlx5vf_free_data_buffer(buf);
435         return ERR_PTR(ret);
436 }
437
/*
 * Return a data buffer to the migration file's avail_list for reuse,
 * clearing its chunk number.  Pairs with mlx5vf_get_data_buffer().
 */
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	spin_lock_irq(&buf->migf->list_lock);
	buf->stop_copy_chunk_num = 0;
	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
	spin_unlock_irq(&buf->migf->list_lock);
}
445
/*
 * Get a data buffer of at least @length bytes with direction @dma_dir:
 * reuse a big-enough buffer from the avail_list if one exists,
 * otherwise allocate a fresh one.  Same-direction buffers that are too
 * small are removed from the list and freed (outside the spinlock,
 * since freeing may sleep).
 *
 * Return: the buffer or an ERR_PTR (-ENOTCONN when detached, or the
 * allocation error).
 */
struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
		       size_t length, enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf, *temp_buf;
	struct list_head free_list;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return ERR_PTR(-ENOTCONN);

	INIT_LIST_HEAD(&free_list);

	spin_lock_irq(&migf->list_lock);
	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
		if (buf->dma_dir == dma_dir) {
			list_del_init(&buf->buf_elm);
			if (buf->allocated_length >= length) {
				spin_unlock_irq(&migf->list_lock);
				goto found;
			}
			/*
			 * Prevent holding redundant buffers. Put in a free
			 * list and call at the end not under the spin lock
			 * (&migf->list_lock) to mlx5vf_free_data_buffer which
			 * might sleep.
			 */
			list_add(&buf->buf_elm, &free_list);
		}
	}
	spin_unlock_irq(&migf->list_lock);
	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);

found:
	/* Free the undersized buffers collected above (may sleep) */
	while ((temp_buf = list_first_entry_or_null(&free_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&temp_buf->buf_elm);
		mlx5vf_free_data_buffer(temp_buf);
	}

	return buf;
}
488
/*
 * Common tail of save-command completion: free the command output,
 * release the save serialization, and drop the file reference taken
 * when the command was issued.
 */
static void
mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
			      struct mlx5vf_async_data *async_data)
{
	kvfree(async_data->out);
	complete(&migf->save_comp);
	fput(migf->filp);
}
497
/*
 * Workqueue handler for a failed save command (queued from the async
 * callback since the error path must not run in interrupt context):
 * return the data/header buffers for reuse, set the migration file
 * state accordingly, and wake pollers.
 */
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
	struct mlx5vf_async_data *async_data = container_of(_work,
		struct mlx5vf_async_data, work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
		struct mlx5_vf_migration_file, async_data);

	mutex_lock(&migf->lock);
	if (async_data->status) {
		mlx5vf_put_data_buffer(async_data->buf);
		if (async_data->header_buf)
			mlx5vf_put_data_buffer(async_data->header_buf);
		/*
		 * A BAD_RES_STATE error on a pre-copy save is recoverable
		 * (the final image can still be read); anything else is fatal.
		 */
		if (!async_data->stop_copy_chunk &&
		    async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
		else
			migf->state = MLX5_MIGF_STATE_ERROR;
		wake_up_interruptible(&migf->poll_wait);
	}
	mutex_unlock(&migf->lock);
	mlx5vf_save_callback_complete(migf, async_data);
}
520
/*
 * Write a migration stream header (record size + FW_DATA tag) into the
 * first page of @header_buf and queue the buffer onto the migration
 * file's buf_list at the current max_pos.
 *
 * Return: 0 on success, -EINVAL when the buffer has no first page.
 */
static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
			  size_t image_size, bool initial_pre_copy)
{
	struct mlx5_vf_migration_file *migf = header_buf->migf;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;

	header.record_size = cpu_to_le64(image_size);
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page)
		return -EINVAL;
	/* Copy the header through a temporary kernel mapping of the page */
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	kunmap_local(to_buff);
	header_buf->length = sizeof(header);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	/* Headers emitted during initial pre-copy count toward its byte total */
	if (initial_pre_copy)
		migf->pre_copy_initial_bytes += sizeof(header);
	return 0;
}
549
/*
 * Async completion callback for SAVE_VHCA_STATE.  On success: publish
 * the header (if any) and data buffer onto the migration file's
 * buf_list, advance max_pos, update chunk accounting and the migf
 * state, wake pollers, and — in chunk mode — kick off the next SAVE
 * when the device reports more data.  On any failure the work is
 * deferred to mlx5vf_mig_file_cleanup_cb, since the error path must
 * not run in interrupt context.
 */
static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5vf_async_data *async_data = container_of(context,
			struct mlx5vf_async_data, cb_work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
			struct mlx5_vf_migration_file, async_data);

	if (!status) {
		size_t next_required_umem_size = 0;
		bool stop_copy_last_chunk;
		size_t image_size;
		unsigned long flags;
		/* First pre-copy image: not yet in PRE_COPY state and not a stop-copy chunk */
		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
				!async_data->stop_copy_chunk;

		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
				      actual_image_size);
		if (async_data->buf->stop_copy_chunk_num)
			next_required_umem_size = MLX5_GET(save_vhca_state_out,
					async_data->out, next_required_umem_size);
		/* The final chunk is a stop-copy save with no further data */
		stop_copy_last_chunk = async_data->stop_copy_chunk &&
				!next_required_umem_size;
		if (async_data->header_buf) {
			status = add_buf_header(async_data->header_buf, image_size,
						initial_pre_copy);
			if (status)
				goto err;
		}
		async_data->buf->length = image_size;
		async_data->buf->start_pos = migf->max_pos;
		migf->max_pos += async_data->buf->length;
		spin_lock_irqsave(&migf->list_lock, flags);
		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
		if (async_data->buf->stop_copy_chunk_num) {
			migf->num_ready_chunks++;
			if (next_required_umem_size &&
			    migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
				/* Delay the next SAVE till one chunk be consumed */
				migf->next_required_umem_size = next_required_umem_size;
				next_required_umem_size = 0;
			}
		}
		spin_unlock_irqrestore(&migf->list_lock, flags);
		if (initial_pre_copy) {
			migf->pre_copy_initial_bytes += image_size;
			migf->state = MLX5_MIGF_STATE_PRE_COPY;
		}
		if (stop_copy_last_chunk)
			migf->state = MLX5_MIGF_STATE_COMPLETE;
		wake_up_interruptible(&migf->poll_wait);
		if (next_required_umem_size)
			mlx5vf_mig_file_set_save_work(migf,
				/* Picking up the next chunk num */
				(async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1,
				next_required_umem_size);
		mlx5vf_save_callback_complete(migf, async_data);
		return;
	}

err:
	/* The error flow can't run from an interrupt context */
	if (status == -EREMOTEIO)
		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
	async_data->status = status;
	queue_work(migf->mvdev->cb_wq, &async_data->work);
}
616
/*
 * Issue an asynchronous SAVE_VHCA_STATE into @buf.
 *
 * @inc:   request an incremental (dirty-only) save; forced off after a
 *         PRE_COPY error so the final image is read in full.
 * @track: keep dirty tracking enabled (pre-copy save).
 *
 * Serializes on migf->save_comp; the completion is released by the
 * async callback path.  A file reference is held for the lifetime of
 * the command and dropped in mlx5vf_save_callback_complete().
 *
 * Return: 0 when the command was launched, otherwise a negative errno
 * with all partially-acquired resources released.
 */
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf, bool inc,
			       bool track)
{
	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5vf_async_data *async_data;
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/* Only one save command may be in flight at a time */
	err = wait_for_completion_interruptible(&migf->save_comp);
	if (err)
		return err;

	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
		/*
		 * In case we had a PRE_COPY error, SAVE is triggered only for
		 * the final image, read device full image.
		 */
		inc = false;

	MLX5_SET(save_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_SAVE_VHCA_STATE);
	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
	MLX5_SET(save_vhca_state_in, in, incremental, inc);
	MLX5_SET(save_vhca_state_in, in, set_track, track);

	async_data = &migf->async_data;
	async_data->buf = buf;
	async_data->stop_copy_chunk = !track;
	async_data->out = kvzalloc(out_size, GFP_KERNEL);
	if (!async_data->out) {
		err = -ENOMEM;
		goto err_out;
	}

	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		if (async_data->stop_copy_chunk) {
			/* Prefer the pre-allocated header for this chunk, if any */
			u8 header_idx = buf->stop_copy_chunk_num ?
				buf->stop_copy_chunk_num - 1 : 0;

			header_buf = migf->buf_header[header_idx];
			migf->buf_header[header_idx] = NULL;
		}

		if (!header_buf) {
			header_buf = mlx5vf_get_data_buffer(migf,
				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
			if (IS_ERR(header_buf)) {
				err = PTR_ERR(header_buf);
				goto err_free;
			}
		}
	}

	if (async_data->stop_copy_chunk)
		migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;

	async_data->header_buf = header_buf;
	/* Hold a file reference across the async command; dropped on completion */
	get_file(migf->filp);
	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
			       async_data->out,
			       out_size, mlx5vf_save_callback,
			       &async_data->cb_work);
	if (err)
		goto err_exec;

	return 0;

err_exec:
	if (header_buf)
		mlx5vf_put_data_buffer(header_buf);
	fput(migf->filp);
err_free:
	kvfree(async_data->out);
err_out:
	complete(&migf->save_comp);
	return err;
}
704
/*
 * Issue LOAD_VHCA_STATE from @buf (DMA-mapping it first if it was not
 * mapped yet, e.g. buffers created with DMA_NONE direction).
 *
 * Return: 0 on success, -ENOTCONN when the mdev is detached, or an
 * error from mapping or the command execution.
 */
int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf)
{
	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	if (!buf->dmaed) {
		err = mlx5vf_dma_data_buffer(buf);
		if (err)
			return err;
	}

	MLX5_SET(load_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_LOAD_VHCA_STATE);
	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(load_vhca_state_in, in, size, buf->length);
	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
}
731
732 int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
733 {
734         int err;
735
736         lockdep_assert_held(&migf->mvdev->state_mutex);
737         if (migf->mvdev->mdev_detach)
738                 return -ENOTCONN;
739
740         err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
741         return err;
742 }
743
/*
 * Release the protection domain allocated by mlx5vf_cmd_alloc_pd().
 * Silently skipped when the mdev is detached (the PD is gone with it).
 */
void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
{
	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return;

	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
}
752
753 void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
754 {
755         struct mlx5_vhca_data_buffer *entry;
756         int i;
757
758         lockdep_assert_held(&migf->mvdev->state_mutex);
759         WARN_ON(migf->mvdev->mdev_detach);
760
761         for (i = 0; i < MAX_NUM_CHUNKS; i++) {
762                 if (migf->buf[i]) {
763                         mlx5vf_free_data_buffer(migf->buf[i]);
764                         migf->buf[i] = NULL;
765                 }
766
767                 if (migf->buf_header[i]) {
768                         mlx5vf_free_data_buffer(migf->buf_header[i]);
769                         migf->buf_header[i] = NULL;
770                 }
771         }
772
773         list_splice(&migf->avail_list, &migf->buf_list);
774
775         while ((entry = list_first_entry_or_null(&migf->buf_list,
776                                 struct mlx5_vhca_data_buffer, buf_elm))) {
777                 list_del(&entry->buf_elm);
778                 mlx5vf_free_data_buffer(entry);
779         }
780
781         mlx5vf_cmd_dealloc_pd(migf);
782 }
783
/*
 * Create the device page-tracker object that reports dirty pages for the
 * given IOVA ranges.
 *
 * When @nnodes exceeds the device's pg_track_max_num_range capability the
 * ranges are combined in place via vfio_combine_iova_ranges() to fit.
 * On success the new object id is stored in tracker->id.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -EOPNOTSUPP when the
 * total tracked address-space size is outside the device's supported
 * log_addr_space range, or a negative errno from the command interface.
 */
static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
				 struct mlx5vf_pci_core_device *mvdev,
				 struct rb_root_cached *ranges, u32 nnodes)
{
	int max_num_range =
		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	struct interval_tree_node *node = NULL;
	u64 total_ranges_len = 0;
	u32 num_ranges = nnodes;
	u8 log_addr_space_size;
	void *range_list_ptr;
	void *obj_context;
	void *cmd_hdr;
	int inlen;
	void *in;
	int err;
	int i;

	/* Collapse the interval tree down to the device's range limit. */
	if (num_ranges > max_num_range) {
		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
		num_ranges = max_num_range;
	}

	/* Command layout: fixed header followed by num_ranges records. */
	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
				 record_size * num_ranges;
	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
			       general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
	MLX5_SET(page_track, obj_context, track_type, 1);
	MLX5_SET(page_track, obj_context, log_page_size,
		 ilog2(tracker->host_qp->tracked_page_size));
	MLX5_SET(page_track, obj_context, log_msg_size,
		 ilog2(tracker->host_qp->max_msg_size));
	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);

	/* Serialize the (possibly combined) ranges into the command body. */
	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	for (i = 0; i < num_ranges; i++) {
		void *addr_range_i_base = range_list_ptr + record_size * i;
		unsigned long length = node->last - node->start + 1;

		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
			   node->start);
		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
		total_ranges_len += length;
		node = interval_tree_iter_next(node, 0, ULONG_MAX);
	}

	/* All tree nodes must have been consumed by the loop above. */
	WARN_ON(node);
	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
	if (log_addr_space_size <
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
	    log_addr_space_size >
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
		err = -EOPNOTSUPP;
		goto out;
	}

	MLX5_SET(page_track, obj_context, log_addr_space_size,
		 log_addr_space_size);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	if (err)
		goto out;

	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
out:
	kfree(in);
	return err;
}
866
867 static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
868                                       u32 tracker_id)
869 {
870         u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
871         u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
872
873         MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
874         MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
875         MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
876
877         return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
878 }
879
880 static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
881                                      u32 tracker_id, unsigned long iova,
882                                      unsigned long length, u32 tracker_state)
883 {
884         u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
885         u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
886         void *obj_context;
887         void *cmd_hdr;
888
889         cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
890         MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
891         MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
892         MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);
893
894         obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
895         MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
896         MLX5_SET64(page_track, obj_context, range_start_address, iova);
897         MLX5_SET64(page_track, obj_context, length, length);
898         MLX5_SET(page_track, obj_context, state, tracker_state);
899
900         return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
901 }
902
903 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
904                              struct mlx5_vhca_cq_buf *buf, int nent,
905                              int cqe_size)
906 {
907         struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
908         u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
909         u8 log_wq_sz = ilog2(cqe_size);
910         int err;
911
912         err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
913                                        mdev->priv.numa_node);
914         if (err)
915                 return err;
916
917         mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
918         buf->cqe_size = cqe_size;
919         buf->nent = nent;
920         return 0;
921 }
922
923 static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
924 {
925         struct mlx5_cqe64 *cqe64;
926         void *cqe;
927         int i;
928
929         for (i = 0; i < buf->nent; i++) {
930                 cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
931                 cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
932                 cqe64->op_own = MLX5_CQE_INVALID << 4;
933         }
934 }
935
936 static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
937                               struct mlx5_vhca_cq *cq)
938 {
939         mlx5_core_destroy_cq(mdev, &cq->mcq);
940         mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
941         mlx5_db_free(mdev, &cq->db);
942 }
943
944 static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
945 {
946         if (type != MLX5_EVENT_TYPE_CQ_ERROR)
947                 return;
948
949         set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
950                                        tracker.cq.mcq));
951 }
952
953 static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
954                                  void *data)
955 {
956         struct mlx5_vhca_page_tracker *tracker =
957                 mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
958         struct mlx5vf_pci_core_device *mvdev = container_of(
959                 tracker, struct mlx5vf_pci_core_device, tracker);
960         struct mlx5_eqe *eqe = data;
961         u8 event_type = (u8)type;
962         u8 queue_type;
963         int qp_num;
964
965         switch (event_type) {
966         case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
967         case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
968         case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
969                 queue_type = eqe->data.qp_srq.type;
970                 if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
971                         break;
972                 qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
973                 if (qp_num != tracker->host_qp->qpn &&
974                     qp_num != tracker->fw_qp->qpn)
975                         break;
976                 set_tracker_error(mvdev);
977                 break;
978         default:
979                 break;
980         }
981
982         return NOTIFY_OK;
983 }
984
985 static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
986                                struct mlx5_eqe *eqe)
987 {
988         struct mlx5vf_pci_core_device *mvdev =
989                 container_of(mcq, struct mlx5vf_pci_core_device,
990                              tracker.cq.mcq);
991
992         complete(&mvdev->tracker_comp);
993 }
994
/*
 * Create the completion queue used to receive dirty-page report messages.
 *
 * @ncqe: requested number of CQEs; rounded up to a power of two.
 *
 * The CQE size follows the CPU cache line size (128B CQEs on 128B cache
 * lines, 64B otherwise). The CQ is armed before returning so the first
 * completion raises an event.
 *
 * Returns 0 on success or a negative errno; partially-created resources
 * are unwound on failure.
 */
static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
			    struct mlx5_vhca_page_tracker *tracker,
			    size_t ncqe)
{
	int cqe_size = cache_line_size() == 128 ? 128 : 64;
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_vhca_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;

	cq = &tracker->cq;
	ncqe = roundup_pow_of_two(ncqe);
	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
	if (err)
		return err;

	cq->ncqe = ncqe;
	cq->mcq.set_ci_db = cq->db.db;
	cq->mcq.arm_db = cq->db.db + 1;
	cq->mcq.cqe_sz = cqe_size;
	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
	if (err)
		goto err_db_free;

	init_cq_frag_buf(&cq->buf);
	/* Input length: fixed part plus one PAS entry per buffer page. */
	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
		cq->buf.frag_buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_buff;
	}

	/* Pick a completion vector based on the current CPU. */
	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
	if (err)
		goto err_vec;

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
	cq->mcq.comp = mlx5vf_cq_complete;
	cq->mcq.event = mlx5vf_cq_event;
	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	if (err)
		goto err_vec;

	/* Arm the CQ so the next completion generates an event. */
	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
		    cq->mcq.cons_index);
	kvfree(in);
	return 0;

err_vec:
	kvfree(in);
err_buff:
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
err_db_free:
	mlx5_db_free(mdev, &cq->db);
	return err;
}
1064
/*
 * Create an RC QP for the page tracker.
 *
 * @max_recv_wr: receive queue depth; when non-zero an RQ buffer is
 *		 allocated (the host-owned receive QP), when zero the QP is
 *		 created with a zero-length RQ (the FW-side reporting QP).
 *
 * The QP is created with no send queue (no_sq) in the RESET state;
 * mlx5vf_activate_qp() later drives the state machine.
 *
 * Returns the new QP or an ERR_PTR() on failure.
 */
static struct mlx5_vhca_qp *
mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	struct mlx5_vhca_qp *qp;
	u8 log_rq_stride;
	u8 log_rq_sz;
	void *qpc;
	int inlen;
	void *in;
	int err;

	qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
	if (!qp)
		return ERR_PTR(-ENOMEM);

	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
	if (err)
		goto err_free;

	if (max_recv_wr) {
		/* Size the RQ buffer for a power-of-two WQE count. */
		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
		log_rq_sz = ilog2(qp->rq.wqe_cnt);
		err = mlx5_frag_buf_alloc_node(mdev,
			wq_get_byte_sz(log_rq_sz, log_rq_stride),
			&qp->buf, mdev->priv.numa_node);
		if (err)
			goto err_db_free;
		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
	}

	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
	/* For a zero-length RQ, qp->buf.npages is 0 (qp is kzalloc'ed). */
	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		qp->buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, pd, tracker->pdn);
	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	MLX5_SET(qpc, qpc, no_sq, 1);
	if (max_recv_wr) {
		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
		mlx5_fill_page_frag_array(&qp->buf,
					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
								 in, pas));
	} else {
		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
	}

	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	kvfree(in);
	if (err)
		goto err_in;

	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	return qp;

err_in:
	if (max_recv_wr)
		mlx5_frag_buf_free(mdev, &qp->buf);
err_db_free:
	mlx5_db_free(mdev, &qp->db);
err_free:
	kfree(qp);
	return ERR_PTR(err);
}
1150
/*
 * Post a single receive WQE pointing into the QP's preregistered receive
 * buffer (at recv_buf.next_rq_offset) and ring the RQ doorbell.
 *
 * The caller must guarantee a free RQ slot exists (pc - cc < wqe_cnt).
 */
static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
{
	struct mlx5_wqe_data_seg *data;
	unsigned int ix;

	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
	data->byte_count = cpu_to_be32(qp->max_msg_size);
	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
	qp->rq.pc++;
	/* Make sure that descriptors are written before doorbell record. */
	dma_wmb();
	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
}
1167
1168 static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
1169                               struct mlx5_vhca_qp *qp, u32 remote_qpn,
1170                               bool host_qp)
1171 {
1172         u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
1173         u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
1174         u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
1175         void *qpc;
1176         int ret;
1177
1178         /* Init */
1179         qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
1180         MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1181         MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
1182         MLX5_SET(qpc, qpc, rre, 1);
1183         MLX5_SET(qpc, qpc, rwe, 1);
1184         MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
1185         MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
1186         ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
1187         if (ret)
1188                 return ret;
1189
1190         if (host_qp) {
1191                 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1192                 int i;
1193
1194                 for (i = 0; i < qp->rq.wqe_cnt; i++) {
1195                         mlx5vf_post_recv(qp);
1196                         recv_buf->next_rq_offset += qp->max_msg_size;
1197                 }
1198         }
1199
1200         /* RTR */
1201         qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
1202         MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1203         MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
1204         MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
1205         MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
1206         MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1207         MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
1208         MLX5_SET(qpc, qpc, min_rnr_nak, 1);
1209         MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
1210         MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1211         ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
1212         if (ret || host_qp)
1213                 return ret;
1214
1215         /* RTS */
1216         qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
1217         MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1218         MLX5_SET(qpc, qpc, retry_count, 7);
1219         MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
1220         MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
1221         MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
1222         MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1223
1224         return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
1225 }
1226
1227 static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
1228                               struct mlx5_vhca_qp *qp)
1229 {
1230         u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
1231
1232         MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
1233         MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
1234         mlx5_cmd_exec_in(mdev, destroy_qp, in);
1235
1236         mlx5_frag_buf_free(mdev, &qp->buf);
1237         mlx5_db_free(mdev, &qp->db);
1238         kfree(qp);
1239 }
1240
1241 static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
1242 {
1243         int i;
1244
1245         /* Undo alloc_pages_bulk_array() */
1246         for (i = 0; i < recv_buf->npages; i++)
1247                 __free_page(recv_buf->page_list[i]);
1248
1249         kvfree(recv_buf->page_list);
1250 }
1251
1252 static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
1253                             unsigned int npages)
1254 {
1255         unsigned int filled = 0, done = 0;
1256         int i;
1257
1258         recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
1259                                        GFP_KERNEL_ACCOUNT);
1260         if (!recv_buf->page_list)
1261                 return -ENOMEM;
1262
1263         for (;;) {
1264                 filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
1265                                                 npages - done,
1266                                                 recv_buf->page_list + done);
1267                 if (!filled)
1268                         goto err;
1269
1270                 done += filled;
1271                 if (done == npages)
1272                         break;
1273         }
1274
1275         recv_buf->npages = npages;
1276         return 0;
1277
1278 err:
1279         for (i = 0; i < npages; i++) {
1280                 if (recv_buf->page_list[i])
1281                         __free_page(recv_buf->page_list[i]);
1282         }
1283
1284         kvfree(recv_buf->page_list);
1285         return -ENOMEM;
1286 }
1287
1288 static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
1289                                    struct mlx5_vhca_recv_buf *recv_buf)
1290 {
1291         int i, j;
1292
1293         recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
1294                                        sizeof(*recv_buf->dma_addrs),
1295                                        GFP_KERNEL_ACCOUNT);
1296         if (!recv_buf->dma_addrs)
1297                 return -ENOMEM;
1298
1299         for (i = 0; i < recv_buf->npages; i++) {
1300                 recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
1301                                                       recv_buf->page_list[i],
1302                                                       0, PAGE_SIZE,
1303                                                       DMA_FROM_DEVICE);
1304                 if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
1305                         goto error;
1306         }
1307         return 0;
1308
1309 error:
1310         for (j = 0; j < i; j++)
1311                 dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
1312                                  PAGE_SIZE, DMA_FROM_DEVICE);
1313
1314         kvfree(recv_buf->dma_addrs);
1315         return -ENOMEM;
1316 }
1317
1318 static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
1319                                       struct mlx5_vhca_recv_buf *recv_buf)
1320 {
1321         int i;
1322
1323         for (i = 0; i < recv_buf->npages; i++)
1324                 dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
1325                                  PAGE_SIZE, DMA_FROM_DEVICE);
1326
1327         kvfree(recv_buf->dma_addrs);
1328 }
1329
1330 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
1331                                           struct mlx5_vhca_qp *qp)
1332 {
1333         struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1334
1335         mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
1336         unregister_dma_recv_pages(mdev, recv_buf);
1337         free_recv_pages(&qp->recv_buf);
1338 }
1339
1340 static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
1341                                           struct mlx5_vhca_qp *qp, u32 pdn,
1342                                           u64 rq_size)
1343 {
1344         unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
1345         struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1346         int err;
1347
1348         err = alloc_recv_pages(recv_buf, npages);
1349         if (err < 0)
1350                 return err;
1351
1352         err = register_dma_recv_pages(mdev, recv_buf);
1353         if (err)
1354                 goto end;
1355
1356         err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
1357         if (err)
1358                 goto err_create_mkey;
1359
1360         return 0;
1361
1362 err_create_mkey:
1363         unregister_dma_recv_pages(mdev, recv_buf);
1364 end:
1365         free_recv_pages(recv_buf);
1366         return err;
1367 }
1368
/*
 * Tear down all page-tracker resources in reverse order of creation:
 * EQ notifier, tracker object, FW QP, host-QP receive resources, host QP,
 * CQ, PD and finally the UAR page. No-op when tracking is not active.
 *
 * Must be called with mvdev->state_mutex held; the mdev must be attached.
 */
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_core_dev *mdev = mvdev->mdev;

	lockdep_assert_held(&mvdev->state_mutex);

	if (!mvdev->log_active)
		return;

	WARN_ON(mvdev->mdev_detach);

	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
	mlx5vf_destroy_qp(mdev, tracker->host_qp);
	mlx5vf_destroy_cq(mdev, &tracker->cq);
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
	mlx5_put_uars_page(mdev, tracker->uar);
	mvdev->log_active = false;
}
1392
1393 int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1394 {
1395         struct mlx5vf_pci_core_device *mvdev = container_of(
1396                 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1397
1398         mutex_lock(&mvdev->state_mutex);
1399         if (!mvdev->log_active)
1400                 goto end;
1401
1402         _mlx5vf_free_page_tracker_resources(mvdev);
1403         mvdev->log_active = false;
1404 end:
1405         mlx5vf_state_mutex_unlock(mvdev);
1406         return 0;
1407 }
1408
/*
 * Start dirty-page tracking for the VF.
 *
 * Builds the full tracking pipeline: UAR, PD, CQ, host (receive) QP with
 * its DMA-able receive buffer, FW (reporting) QP, the page-tracker object
 * itself, and finally the EQ notifier. The requested *page_size is clamped
 * to the device's supported pg_track page-size range, and the value
 * actually used is returned through *page_size.
 *
 * Returns 0 on success or a negative errno; all resources are unwound in
 * reverse order on failure.
 */
int mlx5vf_start_page_tracker(struct vfio_device *vdev,
			      struct rb_root_cached *ranges, u32 nnodes,
			      u64 *page_size)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	u8 log_tracked_page = ilog2(*page_size);
	struct mlx5_vhca_qp *host_qp;
	struct mlx5_vhca_qp *fw_qp;
	struct mlx5_core_dev *mdev;
	u32 max_msg_size = PAGE_SIZE;
	u64 rq_size = SZ_2M;
	u32 max_recv_wr;
	int err;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	/* Only one active tracking session is supported. */
	if (mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	mdev = mvdev->mdev;
	memset(tracker, 0, sizeof(*tracker));
	tracker->uar = mlx5_get_uars_page(mdev);
	if (IS_ERR(tracker->uar)) {
		err = PTR_ERR(tracker->uar);
		goto end;
	}

	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
	if (err)
		goto err_uar;

	/* One RQ WQE (and CQE) per max_msg_size chunk of the buffer. */
	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
	if (err)
		goto err_dealloc_pd;

	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
	if (IS_ERR(host_qp)) {
		err = PTR_ERR(host_qp);
		goto err_cq;
	}

	host_qp->max_msg_size = max_msg_size;
	/* Clamp the tracked page size to the device's supported range. */
	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size);
	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size);
	}

	host_qp->tracked_page_size = (1ULL << log_tracked_page);
	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
					     rq_size);
	if (err)
		goto err_host_qp;

	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
	if (IS_ERR(fw_qp)) {
		err = PTR_ERR(fw_qp);
		goto err_recv_resources;
	}

	/* Connect the two loopback QPs to each other. */
	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
	if (err)
		goto err_activate;

	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
	if (err)
		goto err_activate;

	tracker->host_qp = host_qp;
	tracker->fw_qp = fw_qp;
	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
	if (err)
		goto err_activate;

	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
	mlx5_eq_notifier_register(mdev, &tracker->nb);
	*page_size = host_qp->tracked_page_size;
	mvdev->log_active = true;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;

err_activate:
	mlx5vf_destroy_qp(mdev, fw_qp);
err_recv_resources:
	mlx5vf_free_qp_recv_resources(mdev, host_qp);
err_host_qp:
	mlx5vf_destroy_qp(mdev, host_qp);
err_cq:
	mlx5vf_destroy_cq(mdev, &tracker->cq);
err_dealloc_pd:
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
err_uar:
	mlx5_put_uars_page(mdev, tracker->uar);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}
1519
1520 static void
1521 set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
1522                   struct iova_bitmap *dirty)
1523 {
1524         u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
1525         u32 nent = size / entry_size;
1526         struct page *page;
1527         u64 addr;
1528         u64 *buf;
1529         int i;
1530
1531         if (WARN_ON(index >= qp->recv_buf.npages ||
1532                     (nent > qp->max_msg_size / entry_size)))
1533                 return;
1534
1535         page = qp->recv_buf.page_list[index];
1536         buf = kmap_local_page(page);
1537         for (i = 0; i < nent; i++) {
1538                 addr = MLX5_GET(page_track_report_entry, buf + i,
1539                                 dirty_address_low);
1540                 addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
1541                                       dirty_address_high) << 32;
1542                 iova_bitmap_set(dirty, addr, qp->tracked_page_size);
1543         }
1544         kunmap_local(buf);
1545 }
1546
/*
 * Handle a receive completion carrying a dirty-page report.
 *
 * The tracker status is carried in the top nibble of the CQE immediate
 * data. Any report payload is decoded into @dirty, and the consumed RQ
 * slot is immediately re-posted for the next message.
 */
static void
mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
	      struct iova_bitmap *dirty, int *tracker_status)
{
	u32 size;
	int ix;

	qp->rq.cc++;
	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
	size = be32_to_cpu(cqe->byte_cnt);
	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);

	/* zero length CQE, no data */
	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
	if (size)
		set_report_output(size, ix, qp, dirty);

	/* Recycle the just-completed slot's buffer region. */
	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
	mlx5vf_post_recv(qp);
}
1567
1568 static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
1569 {
1570         return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
1571 }
1572
/*
 * Return the CQE at consumer index @n if it is owned by software, or NULL
 * when the slot is still hardware-owned (or was never written).
 *
 * Ownership is determined by comparing the CQE ownership bit against the
 * parity of the consumer-index wrap count (n & ncqe) — the standard mlx5
 * software-ownership test.
 */
static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
{
	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
	struct mlx5_cqe64 *cqe64;

	/* 128B CQEs keep the mlx5_cqe64 part in the second half. */
	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;

	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
		return cqe64;
	} else {
		return NULL;
	}
}
1587
/*
 * Poll one CQE off the tracker CQ.
 *
 * Returns CQ_OK when a report completion was consumed, CQ_EMPTY when no
 * software-owned CQE is available, or CQ_POLL_ERR on an unexpected opcode.
 */
static int
mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
		   struct iova_bitmap *dirty, int *tracker_status)
{
	struct mlx5_cqe64 *cqe;
	u8 opcode;

	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
	if (!cqe)
		return CQ_EMPTY;

	++cq->mcq.cons_index;
	/*
	 * Make sure we read CQ entry contents after we've checked the
	 * ownership bit.
	 */
	rmb();
	opcode = get_cqe_opcode(cqe);
	switch (opcode) {
	case MLX5_CQE_RESP_SEND_IMM:
		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
		return CQ_OK;
	default:
		return CQ_POLL_ERR;
	}
}
1614
/*
 * Read-and-clear the device's dirty-page log for [iova, iova + length) into
 * @dirty.
 *
 * Kicks the firmware tracker into REPORTING state for the requested range,
 * then drains report CQEs until the device signals it has finished (status
 * leaves REPORTING) or an error occurs.  Sleeps on tracker_comp when the CQ
 * is empty after arming it.
 *
 * Returns 0 on success, -EINVAL if logging is not active, -ENOTCONN if the
 * PF device has detached, or -EIO on a tracker/CQ error.
 */
int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
				  unsigned long length,
				  struct iova_bitmap *dirty)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_vhca_cq *cq = &tracker->cq;
	struct mlx5_core_dev *mdev;
	int poll_err, err;

	mutex_lock(&mvdev->state_mutex);
	if (!mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	mdev = mvdev->mdev;
	/* Ask firmware to start reporting dirty pages for this range. */
	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
					MLX5_PAGE_TRACK_STATE_REPORTING);
	if (err)
		goto end;

	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
	       !tracker->is_err) {
		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
					      &tracker->status);
		if (poll_err == CQ_EMPTY) {
			/* Arm the CQ, then re-poll to close the race with a
			 * completion that landed before arming took effect.
			 */
			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
				    cq->mcq.cons_index);
			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
						      dirty, &tracker->status);
			if (poll_err == CQ_EMPTY) {
				/* Truly empty: sleep until the CQ interrupt
				 * handler completes tracker_comp.
				 */
				wait_for_completion(&mvdev->tracker_comp);
				continue;
			}
		}
		if (poll_err == CQ_POLL_ERR) {
			err = -EIO;
			goto end;
		}
		/* Publish the new consumer index to hardware. */
		mlx5_cq_set_ci(&cq->mcq);
	}

	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
		tracker->is_err = true;

	/* is_err may also have been set asynchronously (e.g. by the event
	 * handler) while we were polling.
	 */
	if (tracker->is_err)
		err = -EIO;
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}