2 * Intel MIC Platform Software Stack (MPSS)
4 * Copyright(c) 2015 Intel Corporation.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License, version 2, as
8 * published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
18 #include "scif_main.h"
21 * struct scif_vma_info - Information about a remote memory mapping
22 * created via scif_mmap(..)
23 * @vma: VM area struct
24 * @list: link to list of active vmas
26 struct scif_vma_info {
	/* VMA created by scif_mmap(..) that maps a remote window */
27 struct vm_area_struct *vma;
	/* Links this entry into ep->rma_info.vma_list (protected by ep->lock) */
28 struct list_head list;
/*
 * scif_recv_munmap - node QP handler for a SCIF_MUNMAP message.
 * The peer has unmapped a window it was granted; drop the mmap reference
 * on the local registered window and, if the ref count hits zero, unlink
 * it and queue it for asynchronous destruction.
 *
 * NOTE(review): this extract is missing several original lines (function
 * braces, the non-error path after scif_query_window, returns) — verify
 * against the upstream file before relying on control flow shown here.
 */
31 void scif_recv_munmap(struct scif_dev *scifdev, struct scifmsg *msg)
33 struct scif_rma_req req;
34 struct scif_window *window = NULL;
	/* Peer passes back our window pointer in payload[0] */
35 struct scif_window *recv_window =
36 (struct scif_window *)msg->payload[0];
37 struct scif_endpt *ep;
39 ep = (struct scif_endpt *)recv_window->ep;
	/* Build a lookup request for the full window on the local reg_list */
40 req.out_window = &window;
41 req.offset = recv_window->offset;
42 req.prot = recv_window->prot;
43 req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT;
44 req.type = SCIF_WINDOW_FULL;
45 req.head = &ep->rma_info.reg_list;
	/* Any reply (e.g. UNREGISTER_ACK) must address the peer's endpoint */
46 msg->payload[0] = ep->remote_ep;
48 mutex_lock(&ep->rma_info.rma_lock);
49 /* Does a valid window exist? */
50 if (scif_query_window(&req)) {
51 dev_err(&scifdev->sdev->dev,
52 "%s %d -ENXIO\n", __func__, __LINE__);
53 msg->uop = SCIF_UNREGISTER_ACK;
	/* Drop the references the peer's mmap held on this window */
57 scif_put_window(window, window->nr_pages);
	/* Last reference gone: unlink and hand off to the cleanup thread */
59 if (!window->ref_count) {
60 atomic_inc(&ep->rma_info.tw_refcount);
	/* Flag an asynchronous deletion in progress for this endpoint */
61 ep->rma_info.async_list_del = 1;
62 list_del_init(&window->list);
63 scif_free_window_offset(ep, window, window->offset);
66 mutex_unlock(&ep->rma_info.rma_lock);
	/* Queue destruction outside the rma_lock */
67 if (window && !window->ref_count)
68 scif_queue_for_cleanup(window, &scif_info.rma);
72 * Remove valid remote memory mappings created via scif_mmap(..) from the
73 * process address space since the remote node is lost
75 static void __scif_zap_mmaps(struct scif_endpt *ep)
77 struct list_head *item;
78 struct scif_vma_info *info;
79 struct vm_area_struct *vma;
	/*
	 * NOTE(review): the matching spin_lock(&ep->lock) and the
	 * "vma = info->vma;" assignment appear to have been dropped from
	 * this extract — as shown, vma is used uninitialized at the
	 * zap_vma_ptes() call. Confirm against the upstream file.
	 */
83 list_for_each(item, &ep->rma_info.vma_list) {
84 info = list_entry(item, struct scif_vma_info, list);
86 size = vma->vm_end - vma->vm_start;
	/* Tear down the PTEs so further accesses fault instead of touching
	 * memory on the (now lost) remote node. */
87 zap_vma_ptes(vma, vma->vm_start, size);
88 dev_dbg(scif_info.mdev.this_device,
89 "%s ep %p zap vma %p size 0x%lx\n",
90 __func__, ep, info->vma, size);
92 spin_unlock(&ep->lock);
96 * Traverse the list of endpoints for a particular remote node and
97 * zap valid remote memory mappings since the remote node is lost
/*
 * _scif_zap_mmaps - zap remote mappings for every endpoint on @head that
 * is connected to @node. connlock serializes against endpoint list churn.
 */
99 static void _scif_zap_mmaps(int node, struct list_head *head)
101 struct scif_endpt *ep;
102 struct list_head *item;
104 mutex_lock(&scif_info.connlock);
105 list_for_each(item, head) {
106 ep = list_entry(item, struct scif_endpt, list);
	/* Only endpoints whose peer lives on the lost node are affected */
107 if (ep->remote_dev->node == node)
108 __scif_zap_mmaps(ep);
110 mutex_unlock(&scif_info.connlock);
114 * Wrapper for removing remote memory mappings for a particular node. This API
115 * is called by peer nodes as part of handling a lost node.
/*
 * scif_zap_mmaps - remove all remote memory mappings for @node.
 * Called as part of lost-node handling; covers both the connected and
 * the disconnected endpoint lists.
 */
117 void scif_zap_mmaps(int node)
119 _scif_zap_mmaps(node, &scif_info.connected);
120 _scif_zap_mmaps(node, &scif_info.disconnected);
124 * This API is only called while handling a lost node:
125 * a) Remote node is dead.
126 * b) Remote memory mappings have been zapped
127 * So we can traverse the remote_reg_list without any locks. Since
128 * the window has not yet been unregistered we can drop the ref count
129 * and queue it to the cleanup thread.
/*
 * __scif_cleanup_rma_for_zombies - drop leftover mmap references on every
 * window in a zombie endpoint's remote_reg_list and queue unreferenced
 * windows for cleanup. Lockless traversal is safe here per the comment
 * above: the remote node is dead and its mappings are already zapped.
 */
131 static void __scif_cleanup_rma_for_zombies(struct scif_endpt *ep)
133 struct list_head *pos, *tmp;
134 struct scif_window *window;
136 list_for_each_safe(pos, tmp, &ep->rma_info.remote_reg_list) {
137 window = list_entry(pos, struct scif_window, list);
	/* Release any references still held from a prior scif_mmap(..) */
138 if (window->ref_count)
139 scif_put_window(window, window->nr_pages);
	/* NOTE(review): the condition guarding this dev_err (non-zero
	 * ref_count after the put, presumably) is missing from this
	 * extract — confirm upstream. */
141 dev_err(scif_info.mdev.this_device,
142 "%s %d unexpected\n",
144 if (!window->ref_count) {
145 atomic_inc(&ep->rma_info.tw_refcount);
146 list_del_init(&window->list);
147 scif_queue_for_cleanup(window, &scif_info.rma);
152 /* Cleanup remote registration lists for zombie endpoints */
153 void scif_cleanup_rma_for_zombies(int node)
155 struct scif_endpt *ep;
156 struct list_head *item;
	/* eplock protects the zombie endpoint list */
158 mutex_lock(&scif_info.eplock);
159 list_for_each(item, &scif_info.zombie) {
160 ep = list_entry(item, struct scif_endpt, list);
	/* remote_dev may be NULL for never-connected zombies */
161 if (ep->remote_dev && ep->remote_dev->node == node)
162 __scif_cleanup_rma_for_zombies(ep);
164 mutex_unlock(&scif_info.eplock);
	/* Wait for queued window destruction to finish before returning */
165 flush_work(&scif_info.misc_work);
168 /* Insert the VMA into the per endpoint VMA list */
/*
 * scif_insert_vma - record @vma in @ep's list of active remote mappings.
 * Returns 0 on success; NOTE(review): the kzalloc-failure path (likely
 * returning -ENOMEM) is missing from this extract — confirm upstream.
 */
169 static int scif_insert_vma(struct scif_endpt *ep, struct vm_area_struct *vma)
171 struct scif_vma_info *info;
174 info = kzalloc(sizeof(*info), GFP_KERNEL);
	/* ep->lock guards rma_info.vma_list */
180 spin_lock(&ep->lock);
181 list_add_tail(&info->list, &ep->rma_info.vma_list);
182 spin_unlock(&ep->lock);
187 /* Delete the VMA from the per endpoint VMA list */
188 static void scif_delete_vma(struct scif_endpt *ep, struct vm_area_struct *vma)
190 struct list_head *item;
191 struct scif_vma_info *info;
193 spin_lock(&ep->lock);
	/* Linear search for the tracking entry matching this vma */
194 list_for_each(item, &ep->rma_info.vma_list) {
195 info = list_entry(item, struct scif_vma_info, list);
196 if (info->vma == vma) {
197 list_del(&info->list);
	/* NOTE(review): the kfree(info)/break for the matched entry is
	 * missing from this extract — confirm upstream. */
202 spin_unlock(&ep->lock);
/*
 * scif_get_phys - translate a (possibly card-relative) DMA address into a
 * host physical address usable by mmap. NOTE(review): the trailing
 * "return out_phys;" is missing from this extract — confirm upstream.
 */
205 static phys_addr_t scif_get_phys(phys_addr_t phys, struct scif_endpt *ep)
207 struct scif_dev *scifdev = (struct scif_dev *)ep->remote_dev;
208 struct scif_hw_dev *sdev = scifdev->sdev;
209 phys_addr_t out_phys, apt_base = 0;
212 * If the DMA address is card relative then we need to add the
213 * aperture base for mmap to work correctly
215 if (!scifdev_self(scifdev) && sdev->aper && sdev->card_rel_da)
216 apt_base = sdev->aper->pa;
217 out_phys = apt_base + phys;
/*
 * scif_get_pages - pin and return the physical pages backing a remote
 * registered window at [offset, offset + len).
 * @epd:    endpoint descriptor
 * @offset: page-aligned offset into the remote registered address space
 * @len:    page-aligned length in bytes
 * @pages:  out parameter; allocated scif_range holding per-page
 *          physical (and, on the mgmt node, virtual) addresses
 *
 * On success the window's ref count is bumped by nr_pages; the caller
 * must release via scif_put_pages(). NOTE(review): the error-unwind
 * labels and returns between the visible lines are missing from this
 * extract — confirm the cleanup ordering upstream.
 */
221 int scif_get_pages(scif_epd_t epd, off_t offset, size_t len,
222 struct scif_range **pages)
224 struct scif_endpt *ep = (struct scif_endpt *)epd;
225 struct scif_rma_req req;
226 struct scif_window *window = NULL;
227 int nr_pages, err, i;
229 dev_dbg(scif_info.mdev.this_device,
230 "SCIFAPI get_pinned_pages: ep %p offset 0x%lx len 0x%lx\n",
232 err = scif_verify_epd(ep);
	/* Reject zero length, negative/overflowing or unaligned ranges */
236 if (!len || (offset < 0) ||
237 (offset + len < offset) ||
238 (ALIGN(offset, PAGE_SIZE) != offset) ||
239 (ALIGN(len, PAGE_SIZE) != len))
242 nr_pages = len >> PAGE_SHIFT;
	/* Look up a single window covering the whole requested range */
244 req.out_window = &window;
248 req.type = SCIF_WINDOW_SINGLE;
249 req.head = &ep->rma_info.remote_reg_list;
251 mutex_lock(&ep->rma_info.rma_lock);
252 /* Does a valid window exist? */
253 err = scif_query_window(&req);
255 dev_err(&ep->remote_dev->sdev->dev,
256 "%s %d err %d\n", __func__, __LINE__, err);
260 /* Allocate scif_range */
261 *pages = kzalloc(sizeof(**pages), GFP_KERNEL);
267 /* Allocate phys addr array */
268 (*pages)->phys_addr = scif_zalloc(nr_pages * sizeof(dma_addr_t));
269 if (!((*pages)->phys_addr)) {
	/* Kernel-visible virtual addresses only exist on the mgmt node
	 * for remote (non-loopback) windows, via the aperture mapping */
274 if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev)) {
275 /* Allocate virtual address array */
276 ((*pages)->va = scif_zalloc(nr_pages * sizeof(void *)));
282 /* Populate the values */
283 (*pages)->cookie = window;
284 (*pages)->nr_pages = nr_pages;
285 (*pages)->prot_flags = window->prot;
287 for (i = 0; i < nr_pages; i++) {
288 (*pages)->phys_addr[i] =
289 __scif_off_to_dma_addr(window, offset +
	/* Convert card-relative DMA addresses to host physical */
291 (*pages)->phys_addr[i] = scif_get_phys((*pages)->phys_addr[i],
	/* Derive the aperture virtual address from the physical offset */
293 if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev))
295 ep->remote_dev->sdev->aper->va +
296 (*pages)->phys_addr[i] -
297 ep->remote_dev->sdev->aper->pa;
	/* Hold the window until scif_put_pages() releases it */
300 scif_get_window(window, nr_pages);
302 mutex_unlock(&ep->rma_info.rma_lock);
	/* Error unwind: free partially-built scif_range state */
305 scif_free((*pages)->phys_addr,
306 nr_pages * sizeof(dma_addr_t));
307 scif_free((*pages)->va,
308 nr_pages * sizeof(void *));
312 dev_err(&ep->remote_dev->sdev->dev,
313 "%s %d err %d\n", __func__, __LINE__, err);
317 EXPORT_SYMBOL_GPL(scif_get_pages);
/*
 * scif_put_pages - release pages obtained via scif_get_pages().
 * @pages: range previously returned by scif_get_pages()
 *
 * Drops the window references; if the window becomes unreferenced it is
 * unlinked, the peer is notified with SCIF_MUNMAP, and the window is
 * destroyed. NOTE(review): the early-return error paths (-EINVAL /
 * -ENOTCONN) and the scifmsg declaration are missing from this extract —
 * confirm upstream.
 */
319 int scif_put_pages(struct scif_range *pages)
321 struct scif_endpt *ep;
322 struct scif_window *window;
	/* Nothing to release for a NULL/uninitialized range */
325 if (!pages || !pages->cookie)
328 window = pages->cookie;
	/* Magic check guards against a stale or corrupted cookie */
330 if (!window || window->magic != SCIFEP_MAGIC)
333 ep = (struct scif_endpt *)window->ep;
335 * If the state is SCIFEP_CONNECTED or SCIFEP_DISCONNECTED then the
336 * callee should be allowed to release references to the pages,
337 * else the endpoint was not connected in the first place,
338 * hence the ENOTCONN.
340 if (ep->state != SCIFEP_CONNECTED && ep->state != SCIFEP_DISCONNECTED)
343 mutex_lock(&ep->rma_info.rma_lock);
345 scif_put_window(window, pages->nr_pages);
347 /* Initiate window destruction if ref count is zero */
348 if (!window->ref_count) {
349 list_del(&window->list);
350 mutex_unlock(&ep->rma_info.rma_lock);
	/* Quiesce DMA before tearing the window down */
351 scif_drain_dma_intr(ep->remote_dev->sdev,
352 ep->rma_info.dma_chan);
353 /* Inform the peer about this window being destroyed. */
354 msg.uop = SCIF_MUNMAP;
356 msg.payload[0] = window->peer_window;
357 /* No error handling for notification messages */
358 scif_nodeqp_send(ep->remote_dev, &msg);
359 /* Destroy this window from the peer's registered AS */
360 scif_destroy_remote_window(window);
362 mutex_unlock(&ep->rma_info.rma_lock);
	/* Free the bookkeeping arrays allocated by scif_get_pages() */
365 scif_free(pages->phys_addr, pages->nr_pages * sizeof(dma_addr_t));
366 scif_free(pages->va, pages->nr_pages * sizeof(void *));
370 EXPORT_SYMBOL_GPL(scif_put_pages);
373 * scif_rma_list_mmap:
375 * Traverse the remote registration list starting from start_window:
376 * 1) Create VtoP mappings via remap_pfn_range(..)
377 * 2) Once step 1) and 2) complete successfully then traverse the range of
378 * windows again and bump the reference count.
379 * RMA lock must be held.
/*
 * scif_rma_list_mmap - remap nr_pages starting at @offset, possibly
 * spanning several consecutive windows beginning at @start_window, into
 * @vma, then make a second pass to bump each window's ref count.
 * Returns 0 on success or the remap_pfn_range(..) error.
 * RMA lock must be held by the caller.
 */
381 static int scif_rma_list_mmap(struct scif_window *start_window, s64 offset,
382 int nr_pages, struct vm_area_struct *vma)
384 s64 end_offset, loop_offset = offset;
385 struct scif_window *window = start_window;
386 int loop_nr_pages, nr_pages_left = nr_pages;
387 struct scif_endpt *ep = (struct scif_endpt *)start_window->ep;
388 struct list_head *head = &ep->rma_info.remote_reg_list;
390 dma_addr_t phys_addr;
391 struct scif_window_iter src_win_iter;
392 size_t contig_bytes = 0;
	/* Pass 1: create the VtoP mappings window by window */
395 list_for_each_entry_from(window, head, list) {
396 end_offset = window->offset +
397 (window->nr_pages << PAGE_SHIFT);
	/* Pages serviced by the current window vs. pages remaining */
398 loop_nr_pages = min_t(int,
399 (end_offset - loop_offset) >> PAGE_SHIFT,
401 scif_init_window_iter(window, &src_win_iter);
402 for (i = 0; i < loop_nr_pages; i++) {
403 phys_addr = scif_off_to_dma_addr(window, loop_offset,
	/* Translate card-relative DMA address to host physical */
406 phys_addr = scif_get_phys(phys_addr, ep);
407 err = remap_pfn_range(vma,
409 loop_offset - offset,
410 phys_addr >> PAGE_SHIFT,
415 loop_offset += PAGE_SIZE;
417 nr_pages_left -= loop_nr_pages;
422 * No more failures expected. Bump up the ref count for all
423 * the windows. Another traversal from start_window required
424 * for handling errors encountered across windows during
425 * remap_pfn_range(..).
	/* Pass 2: rewind and take references on every window touched */
427 loop_offset = offset;
428 nr_pages_left = nr_pages;
429 window = start_window;
430 head = &ep->rma_info.remote_reg_list;
431 list_for_each_entry_from(window, head, list) {
432 end_offset = window->offset +
433 (window->nr_pages << PAGE_SHIFT);
434 loop_nr_pages = min_t(int,
435 (end_offset - loop_offset) >> PAGE_SHIFT,
437 scif_get_window(window, loop_nr_pages);
438 nr_pages_left -= loop_nr_pages;
439 loop_offset += (loop_nr_pages << PAGE_SHIFT);
	/* Error path: log and propagate the remap failure */
445 dev_err(scif_info.mdev.this_device,
446 "%s %d err %d\n", __func__, __LINE__, err);
451 * scif_rma_list_munmap:
453 * Traverse the remote registration list starting from window:
454 * 1) Decrement ref count.
455 * 2) If the ref count drops to zero then send a SCIF_MUNMAP message to peer.
456 * RMA lock must be held.
/*
 * scif_rma_list_munmap - undo scif_rma_list_mmap: walk the windows
 * covering [offset, offset + nr_pages pages), drop the references taken
 * at mmap time, and for each window whose ref count reaches zero notify
 * the peer (SCIF_MUNMAP) and destroy the window.
 * RMA lock must be held by the caller.
 */
458 static void scif_rma_list_munmap(struct scif_window *start_window,
459 s64 offset, int nr_pages)
462 s64 loop_offset = offset, end_offset;
463 int loop_nr_pages, nr_pages_left = nr_pages;
464 struct scif_endpt *ep = (struct scif_endpt *)start_window->ep;
465 struct list_head *head = &ep->rma_info.remote_reg_list;
	/* _safe_from: windows may be unlinked while iterating */
466 struct scif_window *window = start_window, *_window;
468 msg.uop = SCIF_MUNMAP;
470 loop_offset = offset;
471 nr_pages_left = nr_pages;
472 list_for_each_entry_safe_from(window, _window, head, list) {
473 end_offset = window->offset +
474 (window->nr_pages << PAGE_SHIFT);
	/* Pages this window covers vs. pages still to release */
475 loop_nr_pages = min_t(int,
476 (end_offset - loop_offset) >> PAGE_SHIFT,
478 scif_put_window(window, loop_nr_pages);
479 if (!window->ref_count) {
480 struct scif_dev *rdev = ep->remote_dev;
	/* Quiesce DMA before tearing the window down */
482 scif_drain_dma_intr(rdev->sdev,
483 ep->rma_info.dma_chan);
484 /* Inform the peer about this munmap */
485 msg.payload[0] = window->peer_window;
486 /* No error handling for Notification messages. */
487 scif_nodeqp_send(ep->remote_dev, &msg);
488 list_del(&window->list);
489 /* Destroy this window from the peer's registered AS */
490 scif_destroy_remote_window(window);
492 nr_pages_left -= loop_nr_pages;
493 loop_offset += (loop_nr_pages << PAGE_SHIFT);
500 * The private data field of each VMA used to mmap a remote window
501 * points to an instance of struct vma_pvt
	/* NOTE(review): the "struct vma_pvt {" opener, the kref member and
	 * the closing brace are missing from this extract — confirm the
	 * full definition upstream. */
504 struct scif_endpt *ep; /* End point for remote window */
505 s64 offset; /* offset within remote window */
506 bool valid_offset; /* offset is valid only if the original
507 * mmap request was for a single page
508 * else the offset within the vma is
	/* vma_pvt_release - kref release callback; frees the per-VMA private
	 * data once the last VMA reference is dropped. NOTE(review): the
	 * kfree(vmapvt) body line is missing from this extract. */
514 static void vma_pvt_release(struct kref *ref)
516 struct vma_pvt *vmapvt = container_of(ref, struct vma_pvt, ref);
522 * scif_vma_open - VMA open driver callback
523 * @vma: VMM memory area.
524 * The open method is called by the kernel to allow the subsystem implementing
525 * the VMA to initialize the area. This method is invoked any time a new
526 * reference to the VMA is made (when a process forks, for example).
527 * The one exception happens when the VMA is first created by mmap;
528 * in this case, the driver's mmap method is called instead.
529 * This function is also invoked when an existing VMA is split by the kernel
530 * due to a call to munmap on a subset of the VMA resulting in two VMAs.
531 * The kernel invokes this function only on one of the two VMAs.
533 static void scif_vma_open(struct vm_area_struct *vma)
535 struct vma_pvt *vmapvt = vma->vm_private_data;
537 dev_dbg(scif_info.mdev.this_device,
538 "SCIFAPI vma open: vma_start 0x%lx vma_end 0x%lx\n",
539 vma->vm_start, vma->vm_end);
	/* Track the new VMA on the endpoint and take a shared-data ref so
	 * vmapvt outlives every clone of the original mapping */
540 scif_insert_vma(vmapvt->ep, vma);
541 kref_get(&vmapvt->ref);
545 * scif_munmap - VMA close driver callback.
546 * @vma: VMM memory area.
547 * When an area is destroyed, the kernel calls its close operation.
548 * Note that there's no usage count associated with VMA's; the area
549 * is opened and closed exactly once by each process that uses it.
/*
 * scif_munmap - VMA close callback: release the window references taken
 * by scif_mmap for this VMA, drop the vma_pvt kref, and delete the VMA
 * from the endpoint's tracking list.
 * NOTE(review): the "ep = vmapvt->ep;" assignment and the offset/err
 * declarations are missing from this extract — as shown, ep is used
 * uninitialized. Confirm against the upstream file.
 */
551 static void scif_munmap(struct vm_area_struct *vma)
553 struct scif_endpt *ep;
554 struct vma_pvt *vmapvt = vma->vm_private_data;
555 int nr_pages = vma_pages(vma);
557 struct scif_rma_req req;
558 struct scif_window *window = NULL;
562 dev_dbg(scif_info.mdev.this_device,
563 "SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n",
564 vma->vm_start, vma->vm_end);
	/* For split VMAs vm_pgoff no longer equals the original mmap
	 * offset; valid_offset records the single-page special case */
566 offset = vmapvt->valid_offset ? vmapvt->offset :
567 (vma->vm_pgoff) << PAGE_SHIFT;
568 dev_dbg(scif_info.mdev.this_device,
569 "SCIFAPI munmap: ep %p nr_pages 0x%x offset 0x%llx\n",
570 ep, nr_pages, offset);
	/* Locate the (possibly partial) window range being unmapped */
571 req.out_window = &window;
573 req.nr_bytes = vma->vm_end - vma->vm_start;
574 req.prot = vma->vm_flags & (VM_READ | VM_WRITE);
575 req.type = SCIF_WINDOW_PARTIAL;
576 req.head = &ep->rma_info.remote_reg_list;
578 mutex_lock(&ep->rma_info.rma_lock);
580 err = scif_query_window(&req);
582 dev_err(scif_info.mdev.this_device,
583 "%s %d err %d\n", __func__, __LINE__, err);
585 scif_rma_list_munmap(window, offset, nr_pages);
587 mutex_unlock(&ep->rma_info.rma_lock);
589 * The kernel probably zeroes these out but we still want
590 * to clean up our own mess just in case.
593 vma->vm_private_data = NULL;
	/* May free vmapvt via vma_pvt_release on the last reference */
594 kref_put(&vmapvt->ref, vma_pvt_release);
595 scif_delete_vma(ep, vma);
/* VMA callbacks for mappings created by scif_mmap(..) */
598 static const struct vm_operations_struct scif_vm_ops = {
599 .open = scif_vma_open,
600 .close = scif_munmap,
604 * scif_mmap - Map pages in virtual address space to a remote window.
605 * @vma: VMM memory area.
606 * @epd: endpoint descriptor
608 * Return: Upon successful completion, scif_mmap() returns zero
609 * else an apt error is returned as documented in scif.h
/*
 * scif_mmap - map a range of a peer's registered address space into @vma.
 * @vma: VMA set up by the mmap(2) path; vm_pgoff selects the offset into
 *       the remote registered address space
 * @epd: endpoint descriptor
 *
 * Returns 0 on success, else a negative errno (see scif.h). On failure
 * the VMA tracking entry and vmapvt allocated here are released.
 * NOTE(review): several intermediate error-return lines are missing
 * from this extract — confirm the unwind ordering upstream.
 */
611 int scif_mmap(struct vm_area_struct *vma, scif_epd_t epd)
613 struct scif_rma_req req;
614 struct scif_window *window = NULL;
615 struct scif_endpt *ep = (struct scif_endpt *)epd;
616 s64 start_offset = vma->vm_pgoff << PAGE_SHIFT;
617 int nr_pages = vma_pages(vma);
619 struct vma_pvt *vmapvt;
621 dev_dbg(scif_info.mdev.this_device,
622 "SCIFAPI mmap: ep %p start_offset 0x%llx nr_pages 0x%x\n",
623 ep, start_offset, nr_pages);
	/* Endpoint must be connected and usable */
624 err = scif_verify_epd(ep);
	/* Track this VMA on the endpoint before touching windows */
630 err = scif_insert_vma(ep, vma);
634 vmapvt = kzalloc(sizeof(*vmapvt), GFP_KERNEL);
	/* Allocation failed: undo the VMA tracking entry */
636 scif_delete_vma(ep, vma);
641 kref_init(&vmapvt->ref);
	/* Look up the (possibly partial) window range being mapped */
643 req.out_window = &window;
644 req.offset = start_offset;
645 req.nr_bytes = vma->vm_end - vma->vm_start;
646 req.prot = vma->vm_flags & (VM_READ | VM_WRITE);
647 req.type = SCIF_WINDOW_PARTIAL;
648 req.head = &ep->rma_info.remote_reg_list;
650 mutex_lock(&ep->rma_info.rma_lock);
651 /* Does a valid window exist? */
652 err = scif_query_window(&req);
654 dev_err(&ep->remote_dev->sdev->dev,
655 "%s %d err %d\n", __func__, __LINE__, err);
659 /* Default prot for loopback */
660 if (!scifdev_self(ep->remote_dev))
661 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
664 * VM_DONTCOPY - Do not copy this vma on fork
665 * VM_DONTEXPAND - Cannot expand with mremap()
666 * VM_RESERVED - Count as reserved_vm like IO
667 * VM_PFNMAP - Page-ranges managed without "struct page"
668 * VM_IO - Memory mapped I/O or similar
670 * We do not want to copy this VMA automatically on a fork(),
671 * expand this VMA due to mremap() or swap out these pages since
672 * the VMA is actually backed by physical pages in the remote
673 * node's physical memory and not via a struct page.
675 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
677 if (!scifdev_self(ep->remote_dev))
678 vma->vm_flags |= VM_IO | VM_PFNMAP;
680 /* Map this range of windows */
681 err = scif_rma_list_mmap(window, start_offset, nr_pages, vma);
683 dev_err(&ep->remote_dev->sdev->dev,
684 "%s %d err %d\n", __func__, __LINE__, err);
687 /* Set up the driver call back */
688 vma->vm_ops = &scif_vm_ops;
689 vma->vm_private_data = vmapvt;
691 mutex_unlock(&ep->rma_info.rma_lock);
	/* Error unwind: log, then drop the VMA tracking entry */
694 dev_err(&ep->remote_dev->sdev->dev,
695 "%s %d err %d\n", __func__, __LINE__, err);
696 scif_delete_vma(ep, vma);