2 * VMware vSockets Driver
4 * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation version 2 and no later version.
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 #include <linux/types.h>
17 #include <linux/socket.h>
18 #include <linux/stddef.h>
21 #include "vmci_transport_notify.h"
23 #define PKT_FIELD(vsk, field_name) (vmci_trans(vsk)->notify.pkt.field_name)
/* Return true when the peer — which earlier flagged itself as waiting to
 * write — should now be sent a READ notification, i.e. when the free space
 * in our consume queue exceeds the computed notify_limit.
 * NOTE(review): this excerpt elides intermediate lines (locals, braces,
 * returns); comments below describe only the visible logic.
 */
25 static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
27 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
/* Peer never announced a pending write — nothing to notify. */
31 if (!PKT_FIELD(vsk, peer_waiting_write))
34 #ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
35 /* When the sender blocks, we take that as a sign that the sender is
36 * faster than the receiver. To reduce the transmit rate of the sender,
37 * we delay the sending of the read notification by decreasing the
38 * write_notify_window. The notification is delayed until the number of
39 * bytes used in the queue drops below the write_notify_window.
/* First detection of this wait: shrink the window by one page, clamped
 * below at write_notify_min_window.
 */
42 if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
43 PKT_FIELD(vsk, peer_waiting_write_detected) = true;
44 if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
45 PKT_FIELD(vsk, write_notify_window) =
46 PKT_FIELD(vsk, write_notify_min_window);
48 PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
49 if (PKT_FIELD(vsk, write_notify_window) <
50 PKT_FIELD(vsk, write_notify_min_window))
51 PKT_FIELD(vsk, write_notify_window) =
52 PKT_FIELD(vsk, write_notify_min_window);
/* Notify only once free space exceeds consume_size - window. */
56 notify_limit = vmci_trans(vsk)->consume_size -
57 PKT_FIELD(vsk, write_notify_window);
62 /* For now we ignore the wait information and just see if the free
63 * space exceeds the notify limit. Note that improving this function
64 * to be more intelligent will not require a protocol change and will
65 * retain compatibility between endpoints with mixed versions of this
68 * The notify_limit is used to delay notifications in the case where
69 * flow control is enabled. Below the test is expressed in terms of
70 * free space in the queue: if free_space > ConsumeSize -
71 * write_notify_window then notify An alternate way of expressing this
72 * is to rewrite the expression to use the data ready in the receive
73 * queue: if write_notify_window > bufferReady then notify as
74 * free_space == ConsumeSize - bufferReady.
76 retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
78 #ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
81 * Once we notify the peer, we reset the detected flag so the
82 * next wait will again cause a decrease in the window size.
85 PKT_FIELD(vsk, peer_waiting_write_detected) = false;
/* Return true when the peer — which flagged itself as waiting to read —
 * should be sent a WROTE notification, i.e. when any data is ready in the
 * produce queue for it to consume.
 */
94 static bool vmci_transport_notify_waiting_read(struct vsock_sock *vsk)
96 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
/* Peer never announced a pending read — nothing to notify. */
97 if (!PKT_FIELD(vsk, peer_waiting_read))
100 /* For now we ignore the wait information and just see if there is any
101 * data for our peer to read. Note that improving this function to be
102 * more intelligent will not require a protocol change and will retain
103 * compatibility between endpoints with mixed versions of this
106 return vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) > 0;
/* Handle an incoming WAITING_READ control packet: record the peer's wait
 * info and, if data is already available, immediately send a WROTE
 * notification (bottom-half vs. process-context send variants — the
 * selecting condition is elided in this excerpt).  On successful send the
 * peer_waiting_read flag is cleared.
 */
113 vmci_transport_handle_waiting_read(struct sock *sk,
114 struct vmci_transport_packet *pkt,
116 struct sockaddr_vm *dst,
117 struct sockaddr_vm *src)
119 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
120 struct vsock_sock *vsk;
/* Remember that the peer is blocked waiting to read, and stash its
 * wait info (offset/generation) from the packet payload.
 */
124 PKT_FIELD(vsk, peer_waiting_read) = true;
125 memcpy(&PKT_FIELD(vsk, peer_waiting_read_info), &pkt->u.wait,
126 sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
128 if (vmci_transport_notify_waiting_read(vsk)) {
132 sent = vmci_transport_send_wrote_bh(dst, src) > 0;
134 sent = vmci_transport_send_wrote(sk) > 0;
/* Notification delivered — the peer is no longer considered waiting. */
137 PKT_FIELD(vsk, peer_waiting_read) = false;
/* Handle an incoming WAITING_WRITE control packet: record the peer's wait
 * info and, if queue space already permits, immediately send a READ
 * notification (bottom-half vs. process-context send variants — the
 * selecting condition is elided in this excerpt).  On successful send the
 * peer_waiting_write flag is cleared.
 */
143 vmci_transport_handle_waiting_write(struct sock *sk,
144 struct vmci_transport_packet *pkt,
146 struct sockaddr_vm *dst,
147 struct sockaddr_vm *src)
149 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
150 struct vsock_sock *vsk;
/* Remember that the peer is blocked waiting to write, and stash its
 * wait info (offset/generation) from the packet payload.
 */
154 PKT_FIELD(vsk, peer_waiting_write) = true;
155 memcpy(&PKT_FIELD(vsk, peer_waiting_write_info), &pkt->u.wait,
156 sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));
158 if (vmci_transport_notify_waiting_write(vsk)) {
162 sent = vmci_transport_send_read_bh(dst, src) > 0;
164 sent = vmci_transport_send_read(sk) > 0;
/* Notification delivered — the peer is no longer considered waiting. */
167 PKT_FIELD(vsk, peer_waiting_write) = false;
/* Handle an incoming READ control packet: the peer has consumed data, so
 * our earlier WAITING_WRITE (if any) is satisfied — clear the
 * sent_waiting_write flag and wake any local writers via sk_write_space.
 */
173 vmci_transport_handle_read(struct sock *sk,
174 struct vmci_transport_packet *pkt,
176 struct sockaddr_vm *dst, struct sockaddr_vm *src)
178 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
179 struct vsock_sock *vsk;
180 PKT_FIELD(vsk, sent_waiting_write) = false;
/* Space opened up in the produce queue — wake blocked writers. */
185 sk->sk_write_space(sk);
/* Tell the peer we are blocked waiting for room_needed bytes to read.
 * Returns true on success (or, presumably, when the optimization is
 * compiled out — the fallback return is elided in this excerpt).
 * Grows write_notify_window by one page (capped at consume_size) so a
 * recovering receiver gradually re-enables prompt notifications, then
 * computes the absolute queue offset at which the peer's write would
 * satisfy us, accounting for wrap-around via consume_q_generation.
 */
188 static bool send_waiting_read(struct sock *sk, u64 room_needed)
190 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
191 struct vsock_sock *vsk;
192 struct vmci_transport_waiting_info waiting_info;
/* A WAITING_READ is already outstanding — don't send another. */
200 if (PKT_FIELD(vsk, sent_waiting_read))
203 if (PKT_FIELD(vsk, write_notify_window) <
204 vmci_trans(vsk)->consume_size)
205 PKT_FIELD(vsk, write_notify_window) =
206 min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
207 vmci_trans(vsk)->consume_size);
209 vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, &tail, &head);
210 room_left = vmci_trans(vsk)->consume_size - head;
/* Target offset wraps past the end of the queue: it lands in the
 * next generation.
 */
211 if (room_needed >= room_left) {
212 waiting_info.offset = room_needed - room_left;
213 waiting_info.generation =
214 PKT_FIELD(vsk, consume_q_generation) + 1;
216 waiting_info.offset = head + room_needed;
217 waiting_info.generation = PKT_FIELD(vsk, consume_q_generation);
220 ret = vmci_transport_send_waiting_read(sk, &waiting_info) > 0;
/* Mark the notification as outstanding so we don't re-send until the
 * peer's WROTE clears it.
 */
222 PKT_FIELD(vsk, sent_waiting_read) = true;
/* Tell the peer we are blocked waiting for room_needed bytes of space to
 * write.  Returns true on success (fallback return elided in this
 * excerpt).  The "+ 1" keeps the produce queue from appearing completely
 * full (tail == head is ambiguous between empty and full), and the
 * generation arithmetic mirrors send_waiting_read for wrap-around.
 */
230 static bool send_waiting_write(struct sock *sk, u64 room_needed)
232 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
233 struct vsock_sock *vsk;
234 struct vmci_transport_waiting_info waiting_info;
/* A WAITING_WRITE is already outstanding — don't send another. */
242 if (PKT_FIELD(vsk, sent_waiting_write))
245 vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, &tail, &head);
246 room_left = vmci_trans(vsk)->produce_size - tail;
247 if (room_needed + 1 >= room_left) {
248 /* Wraps around to current generation. */
249 waiting_info.offset = room_needed + 1 - room_left;
250 waiting_info.generation = PKT_FIELD(vsk, produce_q_generation);
252 waiting_info.offset = tail + room_needed + 1;
253 waiting_info.generation =
254 PKT_FIELD(vsk, produce_q_generation) - 1;
257 ret = vmci_transport_send_waiting_write(sk, &waiting_info) > 0;
/* Mark the notification as outstanding so we don't re-send until the
 * peer's READ clears it.
 */
259 PKT_FIELD(vsk, sent_waiting_write) = true;
/* Send a READ notification to a peer that is waiting to write, retrying
 * the datagram send up to VMCI_TRANSPORT_MAX_DGRAM_RESENDS times unless
 * the peer has shut down its receive side.  On success the
 * peer_waiting_write flag is cleared; persistent failure is only logged.
 * Returns the last send error (return statement elided in this excerpt).
 */
267 static int vmci_transport_send_read_notification(struct sock *sk)
269 struct vsock_sock *vsk;
271 unsigned int retries;
279 if (vmci_transport_notify_waiting_write(vsk)) {
280 /* Notify the peer that we have read, retrying the send on
281 * failure up to our maximum value. XXX For now we just log
282 * the failure, but later we should schedule a work item to
283 * handle the resend until it succeeds. That would require
284 * keeping track of work items in the vsk and cleaning them up
287 while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
289 retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
290 err = vmci_transport_send_read(sk);
/* Best-effort: retries exhausted, log and move on (see XXX above). */
297 if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS)
298 pr_err("%p unable to send read notify to peer\n", sk);
300 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
301 PKT_FIELD(vsk, peer_waiting_write) = false;
/* Handle an incoming WROTE control packet: the peer has produced data, so
 * our earlier WAITING_READ (if any) is satisfied — clear the
 * sent_waiting_read flag and wake any local readers via sk_data_ready.
 */
309 vmci_transport_handle_wrote(struct sock *sk,
310 struct vmci_transport_packet *pkt,
312 struct sockaddr_vm *dst, struct sockaddr_vm *src)
314 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
315 struct vsock_sock *vsk = vsock_sk(sk);
316 PKT_FIELD(vsk, sent_waiting_read) = false;
/* Data arrived in the consume queue — wake blocked readers. */
318 sk->sk_data_ready(sk);
/* Initialize the per-socket packet-notification state: both notify
 * windows start at one page, all waiting/sent flags are cleared, both
 * queue generations start at zero, and the saved peer wait info is
 * zeroed.
 */
321 static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
323 struct vsock_sock *vsk = vsock_sk(sk);
325 PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
326 PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
327 PKT_FIELD(vsk, peer_waiting_read) = false;
328 PKT_FIELD(vsk, peer_waiting_write) = false;
329 PKT_FIELD(vsk, peer_waiting_write_detected) = false;
330 PKT_FIELD(vsk, sent_waiting_read) = false;
331 PKT_FIELD(vsk, sent_waiting_write) = false;
332 PKT_FIELD(vsk, produce_q_generation) = 0;
333 PKT_FIELD(vsk, consume_q_generation) = 0;
335 memset(&PKT_FIELD(vsk, peer_waiting_read_info), 0,
336 sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
337 memset(&PKT_FIELD(vsk, peer_waiting_write_info), 0,
338 sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));
/* Socket teardown hook for the notify-ops table.  Body is elided in this
 * excerpt; presumably there is no dynamic notify state to free — confirm
 * against the full source.
 */
341 static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
/* poll()/POLLIN hook: report via *data_ready_now whether data can be read
 * immediately.  When the queue is empty on an established connection,
 * arm a WAITING_READ(1) so the peer will wake us with WROTE; the error
 * path for a failed send is elided in this excerpt.
 */
346 vmci_transport_notify_pkt_poll_in(struct sock *sk,
347 size_t target, bool *data_ready_now)
349 struct vsock_sock *vsk = vsock_sk(sk);
351 if (vsock_stream_has_data(vsk)) {
352 *data_ready_now = true;
354 /* We can't read right now because there is nothing in the
355 * queue. Ask for notifications when there is something to
358 if (sk->sk_state == TCP_ESTABLISHED) {
359 if (!send_waiting_read(sk, 1))
363 *data_ready_now = false;
/* poll()/POLLOUT hook: report via *space_avail_now whether data can be
 * written immediately.  A WAITING_WRITE(1) is armed only when the queue
 * is completely full (free space == 0); the negative-free-space and
 * send-failure paths are elided in this excerpt.
 */
370 vmci_transport_notify_pkt_poll_out(struct sock *sk,
371 size_t target, bool *space_avail_now)
373 s64 produce_q_free_space;
374 struct vsock_sock *vsk = vsock_sk(sk);
376 produce_q_free_space = vsock_stream_has_space(vsk);
377 if (produce_q_free_space > 0) {
378 *space_avail_now = true;
380 } else if (produce_q_free_space == 0) {
381 /* This is a connected socket but we can't currently send data.
382 * Notify the peer that we are waiting if the queue is full. We
383 * only send a waiting write if the queue is full because
384 * otherwise we end up in an infinite WAITING_WRITE, READ,
385 * WAITING_WRITE, READ, etc. loop. Treat failing to send the
386 * notification as a socket error, passing that back through
389 if (!send_waiting_write(sk, 1))
392 *space_avail_now = false;
/* Receive-path setup: zero the per-receive index snapshot and, under flow
 * control, raise write_notify_min_window to target + 1 so a sender is
 * notified before we would block on a read of `target` bytes.  If the
 * current window had to be bumped up to the new minimum, flag
 * notify_on_block so recv_pre_block re-evaluates the notification.
 */
399 vmci_transport_notify_pkt_recv_init(
402 struct vmci_transport_recv_notify_data *data)
404 struct vsock_sock *vsk = vsock_sk(sk);
406 #ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
407 data->consume_head = 0;
408 data->produce_tail = 0;
409 #ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
410 data->notify_on_block = false;
412 if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) {
413 PKT_FIELD(vsk, write_notify_min_window) = target + 1;
414 if (PKT_FIELD(vsk, write_notify_window) <
415 PKT_FIELD(vsk, write_notify_min_window)) {
416 /* If the current window is smaller than the new
417 * minimal window size, we need to reevaluate whether
418 * we need to notify the sender. If the number of ready
419 * bytes are smaller than the new window, we need to
420 * send a notification to the sender before we block.
423 PKT_FIELD(vsk, write_notify_window) =
424 PKT_FIELD(vsk, write_notify_min_window);
425 data->notify_on_block = true;
/* Called just before the receiver blocks: announce WAITING_READ(target)
 * to the peer (failure handling elided in this excerpt), and if recv_init
 * flagged notify_on_block, send the pending READ notification now so the
 * sender is not stalled while we sleep.
 */
435 vmci_transport_notify_pkt_recv_pre_block(
438 struct vmci_transport_recv_notify_data *data)
442 /* Notify our peer that we are waiting for data to read. */
443 if (!send_waiting_read(sk, target)) {
447 #ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
448 if (data->notify_on_block) {
449 err = vmci_transport_send_read_notification(sk);
/* One-shot: don't re-notify on subsequent blocks in this receive. */
453 data->notify_on_block = false;
/* Called immediately before dequeuing: snapshot the consume-queue indexes
 * so recv_post_dequeue can detect a wrap-around (safe because the socket
 * lock is held across both queue-pair operations).
 */
461 vmci_transport_notify_pkt_recv_pre_dequeue(
464 struct vmci_transport_recv_notify_data *data)
466 struct vsock_sock *vsk = vsock_sk(sk);
468 /* Now consume up to len bytes from the queue. Note that since we have
469 * the socket locked we should copy at least ready bytes.
471 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
472 vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair,
474 &data->consume_head);
/* Called after dequeuing: bump consume_q_generation when the read wrapped
 * past the end of the queue (the amount-copied comparison is partially
 * elided in this excerpt), then send a READ notification to unblock a
 * waiting sender.
 */
481 vmci_transport_notify_pkt_recv_post_dequeue(
486 struct vmci_transport_recv_notify_data *data)
488 struct vsock_sock *vsk;
495 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
496 /* Detect a wrap-around to maintain queue generation. Note
497 * that this is safe since we hold the socket lock across the
498 * two queue pair operations.
501 vmci_trans(vsk)->consume_size - data->consume_head)
502 PKT_FIELD(vsk, consume_q_generation)++;
505 err = vmci_transport_send_read_notification(sk);
/* Send-path setup: zero the per-send index snapshot used later for
 * wrap-around detection in send_post_enqueue.
 */
514 vmci_transport_notify_pkt_send_init(
516 struct vmci_transport_send_notify_data *data)
518 #ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
519 data->consume_head = 0;
520 data->produce_tail = 0;
/* Called just before the sender blocks: announce WAITING_WRITE(1) to the
 * peer; a failed send is surfaced to the caller as -EHOSTUNREACH.
 */
527 vmci_transport_notify_pkt_send_pre_block(
529 struct vmci_transport_send_notify_data *data)
531 /* Notify our peer that we are waiting for room to write. */
532 if (!send_waiting_write(sk, 1))
533 return -EHOSTUNREACH;
/* Called immediately before enqueuing: snapshot the produce-queue indexes
 * so send_post_enqueue can detect a wrap-around (the produce_tail
 * argument line is elided in this excerpt).
 */
539 vmci_transport_notify_pkt_send_pre_enqueue(
541 struct vmci_transport_send_notify_data *data)
543 struct vsock_sock *vsk = vsock_sk(sk);
545 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
546 vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair,
548 &data->consume_head);
/* Called after enqueuing `written` bytes: bump produce_q_generation when
 * the write wrapped past the end of the queue, then — if the peer flagged
 * itself waiting to read — send a WROTE notification, retrying up to
 * VMCI_TRANSPORT_MAX_DGRAM_RESENDS unless the peer shut down its receive
 * side.  Clears peer_waiting_read once notified.  Best-effort like
 * vmci_transport_send_read_notification (persistent failure only logged).
 */
555 vmci_transport_notify_pkt_send_post_enqueue(
558 struct vmci_transport_send_notify_data *data)
561 struct vsock_sock *vsk;
562 bool sent_wrote = false;
567 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
568 /* Detect a wrap-around to maintain queue generation. Note that this
569 * is safe since we hold the socket lock across the two queue pair
572 if (written >= vmci_trans(vsk)->produce_size - data->produce_tail)
573 PKT_FIELD(vsk, produce_q_generation)++;
577 if (vmci_transport_notify_waiting_read(vsk)) {
578 /* Notify the peer that we have written, retrying the send on
579 * failure up to our maximum value. See the XXX comment for the
580 * corresponding piece of code in StreamRecvmsg() for potential
583 while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
585 retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
586 err = vmci_transport_send_wrote(sk);
593 if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
594 pr_err("%p unable to send wrote notify to peer\n", sk);
597 #if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
598 PKT_FIELD(vsk, peer_waiting_read) = false;
/* Dispatch an incoming notification control packet by type (WROTE, READ,
 * WAITING_WRITE, WAITING_READ) to the matching handler, reporting via
 * *pkt_processed whether the packet was consumed here.  The switch
 * scaffolding (`switch`, `break`, `default`) and the `processed = true`
 * assignments are elided in this excerpt.
 */
606 vmci_transport_notify_pkt_handle_pkt(
608 struct vmci_transport_packet *pkt,
610 struct sockaddr_vm *dst,
611 struct sockaddr_vm *src, bool *pkt_processed)
613 bool processed = false;
616 case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
617 vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
620 case VMCI_TRANSPORT_PACKET_TYPE_READ:
621 vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
624 case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE:
625 vmci_transport_handle_waiting_write(sk, pkt, bottom_half,
630 case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ:
631 vmci_transport_handle_waiting_read(sk, pkt, bottom_half,
638 *pkt_processed = processed;
/* Connection-request hook: once the queue pair size is known, widen
 * write_notify_window to the full consume size, and clamp
 * write_notify_min_window down to consume_size if it exceeds it.
 */
641 static void vmci_transport_notify_pkt_process_request(struct sock *sk)
643 struct vsock_sock *vsk = vsock_sk(sk);
645 PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
646 if (vmci_trans(vsk)->consume_size <
647 PKT_FIELD(vsk, write_notify_min_window))
648 PKT_FIELD(vsk, write_notify_min_window) =
649 vmci_trans(vsk)->consume_size;
/* Negotiation hook: identical window adjustment to process_request —
 * widen write_notify_window to consume_size and clamp the minimum window.
 */
652 static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
654 struct vsock_sock *vsk = vsock_sk(sk);
656 PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
657 if (vmci_trans(vsk)->consume_size <
658 PKT_FIELD(vsk, write_notify_min_window))
659 PKT_FIELD(vsk, write_notify_min_window) =
660 vmci_trans(vsk)->consume_size;
663 /* Socket control packet based operations. */
/* vtable wiring the functions above into the transport's notification
 * interface; the transport core invokes these at the corresponding points
 * in the send/receive/poll paths.  (Closing brace lies past this excerpt.)
 */
664 const struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
665 .socket_init = vmci_transport_notify_pkt_socket_init,
666 .socket_destruct = vmci_transport_notify_pkt_socket_destruct,
667 .poll_in = vmci_transport_notify_pkt_poll_in,
668 .poll_out = vmci_transport_notify_pkt_poll_out,
669 .handle_notify_pkt = vmci_transport_notify_pkt_handle_pkt,
670 .recv_init = vmci_transport_notify_pkt_recv_init,
671 .recv_pre_block = vmci_transport_notify_pkt_recv_pre_block,
672 .recv_pre_dequeue = vmci_transport_notify_pkt_recv_pre_dequeue,
673 .recv_post_dequeue = vmci_transport_notify_pkt_recv_post_dequeue,
674 .send_init = vmci_transport_notify_pkt_send_init,
675 .send_pre_block = vmci_transport_notify_pkt_send_pre_block,
676 .send_pre_enqueue = vmci_transport_notify_pkt_send_pre_enqueue,
677 .send_post_enqueue = vmci_transport_notify_pkt_send_post_enqueue,
678 .process_request = vmci_transport_notify_pkt_process_request,
679 .process_negotiate = vmci_transport_notify_pkt_process_negotiate,