Index: netinet/tcp_input.c =================================================================== --- netinet/tcp_input.c (revision 218148) +++ netinet/tcp_input.c (working copy) @@ -161,6 +161,11 @@ &VNET_NAME(tcp_abc_l_var), 2, "Cap the max cwnd increment during slow-start to this number of segments"); +VNET_DEFINE(int, tcp_do_rfc5682) = 1; +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc5682, CTLFLAG_RW, + &VNET_NAME(tcp_do_rfc5682), 0, + "Enable RFC 5682 (Forward RTO-Recovery: F-RTO)"); + SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); VNET_DEFINE(int, tcp_do_ecn) = 0; @@ -254,6 +259,212 @@ } } +/* revert to the conventional RTO recovery. */ +#define TCP_FRTO_REVERT(tp) do { \ + (tp)->frto_flags = 0; \ +} while (0) + +static int +tcp_frto_send2mss(struct tcpcb *tp, struct tcphdr *th, uint16_t type) +{ + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + u_long oldcwnd; + tcp_seq onxt; + + /* + * XXXWG If the TCP sender does not have any new data to send, OR the + * advertised window prohibits new transmissions, the recommended + * action is to skip step 3 of this algorithm and continue with + * slow-start retransmissions, following the conventional RTO recovery + * algorithm. However, alternative ways of handling the window-limited + * cases that could result in better performance are discussed in + * Appendix A. + */ + if (so->so_snd.sb_cc == 0 || tp->snd_wnd == 0) + /* XXXWG skip step 3 OR do alternative ways. */ + return (1); + oldcwnd = tp->snd_cwnd; + onxt = tp->snd_nxt; + /* + * XXXWG transmit up to two new (previously unsent) segments and enter + * step 3 of this algorithm. If the TCP sender does not have enough + * unsent data, it can send only one segment. + */ + tp->snd_cwnd = 2 * tp->t_maxseg; + tp->snd_nxt = tp->snd_max; + /* + * XXXWG in addition, the TCP sender MAY override the Nagle algorithm. + */ + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + tp->snd_nxt = onxt; + tp->snd_cwnd = oldcwnd; + return (0); +} + +static void inline +tcp_frto_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) +{ + + /* SACK-Enhanced F-RTO */ + + if (tp->t_flags & TF_SACK_PERMIT) { + if ((tp->frto_flags & FRTO_INSTEP1) != 0 && + (tp->frto_flags & FRTO_CANGOSTEP2) != 0) { + /* SACK-Enhanced F-RTO - step 2 */ + if (type == CC_DUPACK) + /* + * 2) If duplicate ACKs arrive before the + * cumulative acknowledgment for retransmitted + * data, adjust the scoreboard according to + * the incoming SACK information. Stay in + * step 2 and wait for the next new + * acknowledgment. + */ + return; + KASSERT(type == CC_ACK, + ("%s: expected ACK (but %d)", __func__, type)); + + /* + * 2) When a new acknowledgment arrives, set variable + * "RecoveryPoint" to indicate the highest sequence + * number transmitted so far. + */ + tp->frto_flags |= FRTO_INSTEP2; + tp->snd_recover = tp->snd_max; + + /* + * 2a) If the Cumulative Acknowledgement field covers + * "RecoveryPoint" but not more than + * "RecoveryPoint", revert to the conventional RTO + * recovery and set the congestion window to no + * more than 2 * MSS, like a regular TCP would do. + * Do not enter step 3 of this algorithm. + */ + if (th->th_ack == tp->snd_recover) { + tp->snd_cwnd = 2 * tp->t_maxseg; + TCP_FRTO_REVERT(tp); + return; + /* + * 2b) If the Cumulative Acknowledgment field does not + * cover "RecoveryPoint" but is larger than + * SND.UNA. + */ + } else if (SEQ_LT(th->th_ack, tp->snd_recover) && + SEQ_GT(th->th_ack, tp->snd_una) && + !tcp_frto_send2mss(tp, th, type)) { + tp->frto_flags |= FRTO_CANGOSTEP3; + tp->frto_fack = tp->snd_fack; + } + } + if ((tp->frto_flags & FRTO_INSTEP2) != 0 && + (tp->frto_flags & FRTO_CANGOSTEP3) != 0) { + struct sackhole *q = TAILQ_FIRST(&tp->snd_holes); + /* the Cumulative Acknowledgment */ + tcp_seq cack = SEQ_MAX(th->th_ack, tp->snd_una); + + /* + * XXXWG 3a) If the Cumulative Acknowledgment field or + * the SACK information covers more than + * "RecoveryPoint". + */ + if (SEQ_GEQ(cack, tp->snd_recover) || + (q != NULL && SEQ_GT(q->start, tp->snd_recover))) + goto frto_revert; + /* + * XXXWG 3a) take this branch also when the + * acknowledgment is a duplicate ACK and it does not + * acknowledge any new, previously unacknowledged + * data below "RecoveryPoint" in the SACK information. + * + * FIXME At this implementation there is a limitation + * that it doesn't traverse all SACK information below + * "RecoveryPoint". So needs to implement a simple way + * to check whether SACK information is updated or not + * after receiving this ACK. + */ + if (type == CC_DUPACK && + SEQ_LEQ(tp->snd_fack, tp->snd_recover) && + SEQ_LEQ(tp->snd_fack, tp->frto_fack)) { +frto_revert: + tp->snd_cwnd = 3 * tp->t_maxseg; + TCP_FRTO_REVERT(tp); + return; + } + /* + * XXXWG 3b) If the Cumulative Acknowledgment field or a + * SACK information in the ACK does not cover more than + * "RecoveryPoint" AND it acknowledges data that was + * not acknowledged earlier, declare the timeout + * spurious. + */ + if (type == CC_ACK && + (SEQ_LEQ(cack, tp->snd_recover) || + (q != NULL && + SEQ_LT(q->start, tp->snd_recover)))) + /* SPUR_TO: the timeout is spurious */ + tp->snd_recover = tp->snd_una; + } + return; + } + + /* + * Basic F-RTO algorithm + */ + + if ((tp->frto_flags & FRTO_INSTEP1) != 0 && + (tp->frto_flags & FRTO_CANGOSTEP2) != 0) { + /* Basic F-RTO - step 2 */ + tp->frto_flags |= FRTO_INSTEP2; + tp->snd_recover = tp->snd_max; + + if (type == CC_DUPACK || + /* + * XXXWG The acknowledgment field covers "recover" but not + * more than "recover" + */ + th->th_ack == tp->snd_recover || + /* + * XXXWG acknowledgement does not acknowledge all of + * the data that was retransmitted in step 1. + */ + SEQ_LEQ(th->th_ack, tp->snd_una)) + TCP_FRTO_REVERT(tp); + else if (type == CC_ACK && + /* + * XXXWG Acknowledgement field does not cover + * "recover" + */ + SEQ_LT(th->th_ack, tp->snd_recover) && + !tcp_frto_send2mss(tp, th, type)) + tp->frto_flags |= FRTO_CANGOSTEP3; + else + TCP_FRTO_REVERT(tp); + } + if ((tp->frto_flags & FRTO_INSTEP2) != 0 && + (tp->frto_flags & FRTO_CANGOSTEP3) != 0) { + /* Basic F-RTO - step 3 */ + if (type == CC_DUPACK) { + /* + * XXXWG set the congestion window to no more + * than 3 * MSS (where MSS indicates Maximum + * Segment Size), and continue with the + * slow-start algorithm retransmitting + * unacknowledged segments. The congestion + * window can be set to 3 * MSS, because two + * round-trip times have elapsed since the TRO, + * and a conventional TCP sender would have + * increased cwnd to 3 during the same time. + */ + tp->snd_cwnd = 3 * tp->t_maxseg; + TCP_FRTO_REVERT(tp); + } else if (type == CC_ACK) + /* SPUR_TO: the timeout is spurious */ + tp->snd_recover = tp->snd_una; + } +} + /* * CC wrapper hook functions */ @@ -262,6 +473,9 @@ { INP_WLOCK_ASSERT(tp->t_inpcb); + if (V_tcp_do_rfc5682) + tcp_frto_ack_received(tp, th, type); + tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if (tp->snd_cwnd == min(tp->snd_cwnd, tp->snd_wnd)) tp->ccv->flags |= CCF_CWND_LIMITED; Index: netinet/tcp_timer.c =================================================================== --- netinet/tcp_timer.c (revision 218148) +++ netinet/tcp_timer.c (working copy) @@ -60,6 +60,7 @@ #endif #include #include +#include #include #include #include @@ -556,8 +557,64 @@ tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } + if (V_tcp_do_rfc5682) { + /* + * If the retrasmission timer expires again during + * the execution of the F-RTO algorithm, the TCP sender MUST + * re-start the algorithm processing from step 1. + */ + tp->frto_flags = FRTO_INSTEP1; + + /* + * If the sender implements some loss recovery algorithm + * other than Reno or NewReno [FHH04], the F-RTO algorithm + * SHOULD NOT be entered when earlier fast recovery is underway. + */ + if (CC_ALGO(tp) != &newreno_cc_algo && + IN_RECOVERY(tp->t_flags)) + goto no_rfc5682; + /* + * SACK-Enhanced F-RTO + * This algorithm SHOULD NOT be applied if the TCP sender is + * already in loss recovery when a retransmission timeout + * occurs. + */ + if ((tp->t_flags & TF_SACK_PERMIT) != 0 && + IN_RECOVERY(tp->t_flags)) + goto no_rfc5682; + + /* + * Basic F-RTO algorithm - step 1: if the TCP sender is already + * in RTO recovery AND "recover" is larger than or equal to + * SND.UNA (the oldest unacknowledged sequence number [Pos81]), + * do not enter step 2 of this algorithm. Instead, store the + * highest sequence number transmitted so far in variable + * "recover". + */ + if ((tp->t_flags & TF_SACK_PERMIT) == 0 && + tp->t_rxtshift > 0 && + SEQ_GEQ(tp->snd_recover, tp->snd_una)) + /* + * continue with slow-start retransmissions following + * the conventional RTO recovery algorithm. + */ + goto no_rfc5682; + /* + * SACK-Enhanced F-RTO - step 1: If "RecoveryPoint" is larger + * than or equal to SND.UNA, do not enter step 2 of this + * algorithm. + */ + else if ((tp->t_flags & TF_SACK_PERMIT) != 0 && + SEQ_GEQ(tp->snd_recover, tp->snd_una)) + goto no_rfc5682; + else + tp->frto_flags |= FRTO_CANGOSTEP2; + } else { +no_rfc5682: + tp->snd_recover = tp->snd_max; + } tp->snd_nxt = tp->snd_una; - tp->snd_recover = tp->snd_max; + /* * Force a segment to be sent. */ Index: netinet/tcp_var.h =================================================================== --- netinet/tcp_var.h (revision 218148) +++ netinet/tcp_var.h (working copy) @@ -202,6 +202,12 @@ struct cc_algo *cc_algo; /* congestion control algorithm */ struct cc_var *ccv; /* congestion control specific vars */ struct osd *osd; /* storage for Khelp module data */ + u_int frto_flags; /* step flags for RFC5682 where it's */ +#define FRTO_INSTEP1 (1 << 0) +#define FRTO_CANGOSTEP2 (1 << 1) +#define FRTO_INSTEP2 (1 << 2) +#define FRTO_CANGOSTEP3 (1 << 3) + tcp_seq frto_fack; /* previous snd_fack at step 1 */ int t_ispare; /* explicit pad for 64bit alignment */ void *t_pspare2[4]; /* 4 TBD */ @@ -602,6 +608,7 @@ VNET_DECLARE(int, ss_fltsz_local); VNET_DECLARE(int, tcp_do_rfc3465); VNET_DECLARE(int, tcp_abc_l_var); +VNET_DECLARE(int, tcp_do_rfc5682); #define V_tcb VNET(tcb) #define V_tcbinfo VNET(tcbinfo) #define V_tcpstat VNET(tcpstat) @@ -614,6 +621,7 @@ #define V_ss_fltsz_local VNET(ss_fltsz_local) #define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) +#define V_tcp_do_rfc5682 VNET(tcp_do_rfc5682) VNET_DECLARE(int, tcp_do_sack); /* SACK enabled/disabled */ VNET_DECLARE(int, tcp_sc_rst_sock_fail); /* RST on sock alloc failure */