Commit 2a91aa39, authored by Andrea Bittau, committed by David S. Miller

[DCCP] CCID2: Initial CCID2 (TCP-Like) implementation

Original work by Andrea Bittau; Arnaldo Melo cleaned up the code and fixed
several issues during the merge process.

For now CCID2 has been made the default for all SOCK_DCCP connections, but this
will be remedied soon with the merge of the feature negotiation code.
Signed-off-by: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent aa5d7df3
......@@ -314,9 +314,9 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
/* initial values for each feature */
#define DCCPF_INITIAL_SEQUENCE_WINDOW 100
/* FIXME: for now we're using CCID 3 (TFRC) */
#define DCCPF_INITIAL_CCID 3
#define DCCPF_INITIAL_SEND_ACK_VECTOR 0
/* FIXME: for now we're using CCID 2 (TCP-Like) */
#define DCCPF_INITIAL_CCID 2
#define DCCPF_INITIAL_SEND_ACK_VECTOR 1
/* FIXME: for now we default to 1, but it should really be 0 */
#define DCCPF_INITIAL_SEND_NDP_COUNT 1
......@@ -430,6 +430,8 @@ struct dccp_sock {
struct timeval dccps_timestamp_time;
__u32 dccps_timestamp_echo;
__u32 dccps_packet_size;
__u16 dccps_l_ack_ratio;
__u16 dccps_r_ack_ratio;
unsigned long dccps_ndp_count;
__u32 dccps_mss_cache;
struct dccp_options dccps_options;
......
......@@ -24,6 +24,10 @@ config INET_DCCP_DIAG
def_tristate y if (IP_DCCP = y && INET_DIAG = y)
def_tristate m
config IP_DCCP_ACKVEC
depends on IP_DCCP
def_bool N
source "net/dccp/ccids/Kconfig"
menu "DCCP Kernel Hacking"
......
menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
depends on IP_DCCP && EXPERIMENTAL
config IP_DCCP_CCID2
tristate "CCID2 (TCP) (EXPERIMENTAL)"
depends on IP_DCCP
select IP_DCCP_ACKVEC
---help---
CCID 2, TCP-like Congestion Control, denotes Additive Increase,
Multiplicative Decrease (AIMD) congestion control with behavior
modelled directly on TCP, including congestion window, slow start,
timeouts, and so forth [RFC 2581]. CCID 2 achieves maximum
bandwidth over the long term, consistent with the use of end-to-end
congestion control, but halves its congestion window in response to
each congestion event. This leads to the abrupt rate changes
typical of TCP. Applications should use CCID 2 if they prefer
maximum bandwidth utilization to steadiness of rate. This is often
the case for applications that are not playing their data directly
to the user. For example, a hypothetical application that
transferred files over DCCP, using application-level retransmissions
for lost packets, would prefer CCID 2 to CCID 3. On-line games may
also prefer CCID 2.
CCID 2 is further described in:
http://www.icir.org/kohler/dccp/draft-ietf-dccp-ccid2-10.txt
This text was extracted from:
http://www.icir.org/kohler/dccp/draft-ietf-dccp-spec-13.txt
If in doubt, say M.
config IP_DCCP_CCID3
tristate "CCID3 (TFRC) (EXPERIMENTAL)"
depends on IP_DCCP
......@@ -15,10 +43,15 @@ config IP_DCCP_CCID3
suitable than CCID 2 for applications such as streaming media where a
relatively smooth sending rate is of importance.
CCID 3 is further described in [CCID 3 PROFILE]. The TFRC
congestion control algorithms were initially described in RFC 3448.
CCID 3 is further described in:
http://www.icir.org/kohler/dccp/draft-ietf-dccp-ccid3-11.txt.
The TFRC congestion control algorithms were initially described in
RFC 3448.
This text was extracted from draft-ietf-dccp-spec-11.txt.
This text was extracted from:
http://www.icir.org/kohler/dccp/draft-ietf-dccp-spec-13.txt
If in doubt, say M.
......
......@@ -2,4 +2,8 @@ obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o
dccp_ccid3-y := ccid3.o
obj-$(CONFIG_IP_DCCP_CCID2) += dccp_ccid2.o
dccp_ccid2-y := ccid2.o
obj-y += lib/
/*
* net/dccp/ccids/ccid2.c
*
* Copyright (c) 2005, 2006 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
*
* Changes to meet Linux coding standards, and DCCP infrastructure fixes.
*
* Copyright (c) 2006 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* This implementation should follow: draft-ietf-dccp-ccid2-10.txt
*
* BUGS:
* - sequence number wrapping
* - jiffies wrapping
*/
#include <linux/config.h>
#include "../ccid.h"
#include "../dccp.h"
#include "ccid2.h"
/* Run-time switch for debug output; only tested when CCID2_DEBUG is defined. */
static int ccid2_debug;

/* Change "#if 0" to "#if 1" to compile in the debug messages. */
#if 0
#define CCID2_DEBUG
#endif

#ifdef CCID2_DEBUG
/* Print a KERN_DEBUG message prefixed with the calling function's name. */
#define ccid2_pr_debug(format, a...) \
	do { if (ccid2_debug) \
		printk(KERN_DEBUG "%s: " format, __FUNCTION__, ##a); \
	} while (0)
#else
/*
 * Expand to a real (empty) statement rather than to nothing, so that
 * "if (cond) ccid2_pr_debug(...);" does not leave an empty if-body behind
 * and the macro behaves the same syntactically in both configurations.
 */
#define ccid2_pr_debug(format, a...) do { } while (0)
#endif
/*
 * Number of entries expected in the circular list of sent-packet records;
 * the debug sanity check BUG()s if the ring length differs from this value.
 */
static const int ccid2_seq_len = 128;
static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
{
return dccp_sk(sk)->dccps_hc_tx_ccid_private;
}
static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk)
{
return dccp_sk(sk)->dccps_hc_rx_ccid_private;
}
#ifdef CCID2_DEBUG
/*
 * Debug-only consistency check of the sent-packet ring.
 *
 * Walks backwards (via ccid2s_prev) from the entry before the head
 * (ccid2hctx_seqh) to the tail (ccid2hctx_seqt) and BUG()s if:
 *  - sequence numbers are not strictly increasing towards the head,
 *  - send timestamps decrease towards the head,
 *  - the number of entries not marked acked differs from ccid2hctx_pipe.
 * It then keeps walking until it is back at the head and BUG()s unless the
 * whole ring holds exactly ccid2_seq_len entries.
 *
 * Without CCID2_DEBUG this is replaced by an empty statement.
 */
static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx)
{
/* entries visited so far */
int len = 0;
struct ccid2_seq *seqp;
/* entries seen that are not marked acked */
int pipe = 0;
seqp = hctx->ccid2hctx_seqh;
/* there is data in the chain */
if (seqp != hctx->ccid2hctx_seqt) {
/* the head slot itself is unused; the newest record is just before it */
seqp = seqp->ccid2s_prev;
len++;
if (!seqp->ccid2s_acked)
pipe++;
while (seqp != hctx->ccid2hctx_seqt) {
struct ccid2_seq *prev;
prev = seqp->ccid2s_prev;
len++;
if (!prev->ccid2s_acked)
pipe++;
/* packets are sent sequentially */
BUG_ON(seqp->ccid2s_seq <= prev->ccid2s_seq);
BUG_ON(seqp->ccid2s_sent < prev->ccid2s_sent);
BUG_ON(len > ccid2_seq_len);
seqp = prev;
}
}
BUG_ON(pipe != hctx->ccid2hctx_pipe);
ccid2_pr_debug("len of chain=%d\n", len);
/* continue around the remaining entries until we are back at the head */
do {
seqp = seqp->ccid2s_prev;
len++;
BUG_ON(len > ccid2_seq_len);
} while(seqp != hctx->ccid2hctx_seqh);
BUG_ON(len != ccid2_seq_len);
ccid2_pr_debug("total len=%d\n", len);
}
#else
/* Sanity checking is compiled out when CCID2_DEBUG is not defined. */
#define ccid2_hc_tx_check_sanity(hctx) do {} while (0)
#endif
/*
 * Decide whether @skb may be sent right now.
 *
 * Only data-carrying packets (type 0, DCCP_PKT_DATA, DCCP_PKT_DATAACK) are
 * subject to congestion control; every other packet type is always allowed.
 *
 * Returns 0 when the packet may be sent; in that case, for data packets,
 * ccid2hctx_sendwait is set. Returns 100 when the congestion window is full
 * or a previously approved packet has not been accounted for yet.
 * @len is not used.
 */
static int ccid2_hc_tx_send_packet(struct sock *sk,
				   struct sk_buff *skb, int len)
{
	struct ccid2_hc_tx_sock *hctx;
	const int pkt_type = DCCP_SKB_CB(skb)->dccpd_type;

	/*
	 * No congestion control on anything but data packets.
	 * XXX data packets from userland come through with type 0.
	 */
	if (pkt_type != 0 &&
	    pkt_type != DCCP_PKT_DATA &&
	    pkt_type != DCCP_PKT_DATAACK)
		return 0;

	hctx = ccid2_hc_tx_sk(sk);

	ccid2_pr_debug("pipe=%d cwnd=%d\n", hctx->ccid2hctx_pipe,
		       hctx->ccid2hctx_cwnd);

	/* window is full */
	if (hctx->ccid2hctx_pipe >= hctx->ccid2hctx_cwnd)
		return 100; /* XXX */

	/* the previously approved packet has not been sent off yet */
	if (hctx->ccid2hctx_sendwait)
		return 100; /* XXX */

	hctx->ccid2hctx_sendwait = 1;
	return 0;
}
/*
 * Store a new local ack ratio on the socket.
 *
 * Any value other than 2 is first capped at half the congestion window,
 * rounded up. A resulting value <= 0 triggers a warning but is still stored.
 */
static void ccid2_change_l_ack_ratio(struct sock *sk, int val)
{
	struct dccp_sock *dp = dccp_sk(sk);

	/*
	 * XXX I don't really agree with val != 2.  If cwnd is 1, ack ratio
	 * should be 1... it shouldn't be allowed to become 2.
	 * -sorbo.
	 */
	if (val != 2) {
		struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
		/* cwnd / 2, rounded up */
		int limit = hctx->ccid2hctx_cwnd / 2;

		if (hctx->ccid2hctx_cwnd & 1)
			limit += 1;

		val = (val > limit) ? limit : val;
	}

	ccid2_pr_debug("changing local ack ratio to %d\n", val);
	WARN_ON(val <= 0);
	dp->dccps_l_ack_ratio = val;
}
/*
 * Set the congestion window to @val, treating a request for 0 as 1.
 * BUG()s if the resulting window would be smaller than 1.
 */
static void ccid2_change_cwnd(struct sock *sk, int val)
{
	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
	const int new_cwnd = val ? val : 1;

	/* XXX do we need to change ack ratio? */
	ccid2_pr_debug("change cwnd to %d\n", new_cwnd);

	BUG_ON(new_cwnd < 1);
	hctx->ccid2hctx_cwnd = new_cwnd;
}
static void ccid2_start_rto_timer(struct sock *sk);
/*
 * RTO timer handler.
 *
 * @data: the struct sock the timer belongs to, cast to unsigned long.
 *
 * If the socket is currently owned by user context, the timer is simply
 * re-armed HZ / 5 jiffies from now. Otherwise the RTO is doubled (capped at
 * 60 seconds), the timer is restarted, and the sender state is reset:
 * pipe = 0, ssthresh = max(cwnd / 2, 2), cwnd = 1, the sent-packet history
 * is emptied, the ack counters and ack-ratio bookkeeping are cleared and the
 * local ack ratio is set to 1.
 *
 * The timer is armed with sk_reset_timer(), which takes a socket reference
 * each time it arms a timer that was not pending. That reference belongs to
 * the pending timer and must be released once the handler has run; the
 * re-arming done in here takes a fresh reference of its own. Without the
 * final sock_put() every expiry leaked one reference on the socket.
 */
static void ccid2_hc_tx_rto_expire(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
	long s;

	/* XXX I don't think i'm locking correctly
	 * -sorbo.
	 */
	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* socket is busy: try again a little later */
		sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
			       jiffies + HZ / 5);
		goto out;
	}

	ccid2_pr_debug("RTO_EXPIRE\n");

	ccid2_hc_tx_check_sanity(hctx);

	/* back-off timer */
	hctx->ccid2hctx_rto <<= 1;

	s = hctx->ccid2hctx_rto / HZ;
	if (s > 60)
		hctx->ccid2hctx_rto = 60 * HZ;

	ccid2_start_rto_timer(sk);

	/* adjust pipe, cwnd etc */
	hctx->ccid2hctx_pipe = 0;
	hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd >> 1;
	if (hctx->ccid2hctx_ssthresh < 2)
		hctx->ccid2hctx_ssthresh = 2;
	ccid2_change_cwnd(sk, 1);

	/* clear state about stuff we sent: tail == head means empty history */
	hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh;
	hctx->ccid2hctx_ssacks = 0;
	hctx->ccid2hctx_acks = 0;
	hctx->ccid2hctx_sent = 0;

	/* clear ack ratio state. */
	hctx->ccid2hctx_arsent = 0;
	hctx->ccid2hctx_ackloss = 0;
	hctx->ccid2hctx_rpseq = 0;
	hctx->ccid2hctx_rpdupack = -1;
	ccid2_change_l_ack_ratio(sk, 1);

	ccid2_hc_tx_check_sanity(hctx);
out:
	bh_unlock_sock(sk);
	/* drop the reference that was held on behalf of the expired timer */
	sock_put(sk);
}
/*
 * Arm the RTO timer to fire ccid2hctx_rto jiffies from now.
 * The timer must not already be pending; BUG()s otherwise.
 */
static void ccid2_start_rto_timer(struct sock *sk)
{
	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
	unsigned long expires;

	ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto);

	BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer));

	expires = jiffies + hctx->ccid2hctx_rto;
	sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, expires);
}
/*
 * Account for a packet that has just been sent.
 *
 * Clears the sendwait flag (BUG()s if it was not set), increments the pipe
 * count, records the socket's dccps_gss as the packet's sequence number
 * together with the current jiffies in the head entry of the sent-packet
 * ring and advances the head. It then updates the ack-ratio bookkeeping and
 * starts the RTO timer if it is not already pending.
 *
 * @more and @len are not used.
 */
static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, int len)
{
struct dccp_sock *dp = dccp_sk(sk);
struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
u64 seq;
ccid2_hc_tx_check_sanity(hctx);
BUG_ON(!hctx->ccid2hctx_sendwait);
hctx->ccid2hctx_sendwait = 0;
hctx->ccid2hctx_pipe++;
BUG_ON(hctx->ccid2hctx_pipe < 0);
/* There is an issue. What if another packet is sent between
 * packet_send() and packet_sent(). Then the sequence number would be
 * wrong.
 * -sorbo.
 */
seq = dp->dccps_gss;
/* fill in the head entry and move the head to the next slot */
hctx->ccid2hctx_seqh->ccid2s_seq = seq;
hctx->ccid2hctx_seqh->ccid2s_acked = 0;
hctx->ccid2hctx_seqh->ccid2s_sent = jiffies;
hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqh->ccid2s_next;
ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd,
hctx->ccid2hctx_pipe);
/* head caught up with tail: the ring is full; only a warning is issued */
if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt) {
/* XXX allocate more space */
WARN_ON(1);
}
hctx->ccid2hctx_sent++;
/* Ack Ratio. Need to maintain a concept of how many windows we sent */
hctx->ccid2hctx_arsent++;
/* We had an ack loss in this window... */
if (hctx->ccid2hctx_ackloss) {
/* a full window has been sent since: forget about the loss */
if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) {
hctx->ccid2hctx_arsent = 0;
hctx->ccid2hctx_ackloss = 0;
}
}
/* No acks lost up to now... */
else {
/* decrease ack ratio if enough packets were sent */
if (dp->dccps_l_ack_ratio > 1) {
/* XXX don't calculate denominator each time */
int denom;
/* threshold = cwnd^2 / (ratio^2 - ratio); ratio > 1 so no division by 0 */
denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
dp->dccps_l_ack_ratio;
denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom;
if (hctx->ccid2hctx_arsent >= denom) {
ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
hctx->ccid2hctx_arsent = 0;
}
}
/* ack ratio is not above 1, so it cannot be lowered any further */
else {
hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/
}
}
/* setup RTO timer */
if (!timer_pending(&hctx->ccid2hctx_rtotimer)) {
ccid2_start_rto_timer(sk);
}
#ifdef CCID2_DEBUG
ccid2_pr_debug("pipe=%d\n", hctx->ccid2hctx_pipe);
ccid2_pr_debug("Sent: seq=%llu\n", seq);
/* dump every record from tail to head */
do {
struct ccid2_seq *seqp = hctx->ccid2hctx_seqt;
while (seqp != hctx->ccid2hctx_seqh) {
ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n",
seqp->ccid2s_seq, seqp->ccid2s_acked,
seqp->ccid2s_sent);
seqp = seqp->ccid2s_next;
}
} while(0);
ccid2_pr_debug("=========\n");
ccid2_hc_tx_check_sanity(hctx);
#endif
}
/*
 * Find the next Ack Vector option in the options area of @skb.
 *
 * XXX Lame code duplication: the options were already parsed once before.
 *
 * @offset: byte offset into the options area at which to start scanning;
 *          must not be negative.
 * @vec:    set to the start of the ack vector's value bytes when found.
 * @veclen: set to the length of the value (option length minus the type
 *          and length bytes) when found.
 *
 * Returns -1 if no ack vector was found, otherwise the offset to pass to the
 * next call in order to continue after the option just returned.
 * BUG()s on a malformed option.
 */
static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset,
			   unsigned char **vec, unsigned char *veclen)
{
	const struct dccp_hdr *dh = dccp_hdr(skb);
	unsigned char *start = (unsigned char *)dh + dccp_hdr_len(skb);
	const unsigned char *end = (unsigned char *)dh +
				   (dh->dccph_doff * 4);
	unsigned char *cur;

	BUG_ON(offset < 0);
	start += offset;

	/* cur never overshoots end inside the loop, so "<" is sufficient */
	for (cur = start; cur < end; ) {
		unsigned char type = *cur++;
		unsigned char vlen = 0;
		unsigned char *payload = NULL;

		/* Anything above the reserved range carries a length byte */
		if (type > DCCPO_MAX_RESERVED) {
			if (cur == end)
				goto malformed;

			vlen = *cur++;
			if (vlen < 3)
				goto malformed;

			/* strip the type and length bytes: value size only */
			vlen -= 2;
			payload = cur;
			cur += vlen;

			if (cur > end)
				goto malformed;
		}

		if (type == DCCPO_ACK_VECTOR_0 || type == DCCPO_ACK_VECTOR_1) {
			*vec = payload;
			*veclen = vlen;
			return offset + (cur - start);
		}
	}

	return -1;

malformed:
	BUG_ON(1); /* should never happen... options were previously parsed ! */
	return -1;
}
/* Cancel the RTO timer, logging a debug message if it was actually removed. */
static void ccid2_hc_tx_kill_rto_timer(struct ccid2_hc_tx_sock *hctx)
{
	int removed = del_timer(&hctx->ccid2hctx_rtotimer);

	if (removed)
		ccid2_pr_debug("deleted RTO timer\n");
}
/*
 * Process a newly acknowledged packet.
 *
 * @seqp:    sent-packet record being acknowledged; its send time is used
 *           for the RTT sample.
 * @maxincr: number of congestion window increments still allowed during
 *           slow start; decremented each time cwnd is increased there.
 *
 * Grows the congestion window (by one for every second ack in slow start
 * while *maxincr is non-zero, or by one once at least cwnd acks have been
 * counted otherwise), takes an RTT sample and recomputes the RTO when one
 * is due, and finally restarts the RTO timer.
 */
static inline void ccid2_new_ack(struct sock *sk,
struct ccid2_seq *seqp,
unsigned int *maxincr)
{
struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
/* slow start */
if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) {
hctx->ccid2hctx_acks = 0;
/* We can increase cwnd at most maxincr [ack_ratio/2] */
if (*maxincr) {
/* increase every 2 acks */
hctx->ccid2hctx_ssacks++;
if (hctx->ccid2hctx_ssacks == 2) {
ccid2_change_cwnd(sk, hctx->ccid2hctx_cwnd + 1);
hctx->ccid2hctx_ssacks = 0;
*maxincr = *maxincr - 1;
}
}
/* increased cwnd enough for this single ack */
else {
hctx->ccid2hctx_ssacks = 0;
}
}
/* not in slow start: add one to cwnd once cwnd acks have been counted */
else {
hctx->ccid2hctx_ssacks = 0;
hctx->ccid2hctx_acks++;
if (hctx->ccid2hctx_acks >= hctx->ccid2hctx_cwnd) {
ccid2_change_cwnd(sk, hctx->ccid2hctx_cwnd + 1);
hctx->ccid2hctx_acks = 0;
}
}
/* update RTO */
/* sample when there is no estimate yet or srtt jiffies passed since the last one */
if (hctx->ccid2hctx_srtt == -1 ||
(jiffies - hctx->ccid2hctx_lastrtt) >= hctx->ccid2hctx_srtt) {
/* measured round trip time of this packet, in jiffies */
unsigned long r = jiffies - seqp->ccid2s_sent;
int s;
/* first measurement */
if (hctx->ccid2hctx_srtt == -1) {
ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
r, jiffies, seqp->ccid2s_seq);
hctx->ccid2hctx_srtt = r;
hctx->ccid2hctx_rttvar = r >> 1;
}
else {
/* RTTVAR */
/* rttvar = 3/4 * rttvar + 1/4 * |srtt - r| */
long tmp = hctx->ccid2hctx_srtt - r;
if (tmp < 0)
tmp *= -1;
tmp >>= 2;
hctx->ccid2hctx_rttvar *= 3;
hctx->ccid2hctx_rttvar >>= 2;
hctx->ccid2hctx_rttvar += tmp;
/* SRTT */
/* srtt = 7/8 * srtt + 1/8 * r */
hctx->ccid2hctx_srtt *= 7;
hctx->ccid2hctx_srtt >>= 3;
tmp = r >> 3;
hctx->ccid2hctx_srtt += tmp;
}
/* rto = srtt + max(4 * rttvar, 1) */
s = hctx->ccid2hctx_rttvar << 2;
/* clock granularity is 1 when based on jiffies */
if (!s)
s = 1;
hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s;
/* must be at least a second */
/* s now holds the RTO in whole seconds and drives both clamps below */
s = hctx->ccid2hctx_rto / HZ;
/* DCCP doesn't require this [but I like it cuz my code sux] */
#if 1
if (s < 1)
hctx->ccid2hctx_rto = HZ;
#endif
/* max 60 seconds */
if (s > 60)
hctx->ccid2hctx_rto = HZ * 60;
hctx->ccid2hctx_lastrtt = jiffies;
ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar,
hctx->ccid2hctx_rto, HZ, r);
hctx->ccid2hctx_sent = 0;
}
/* we got a new ack, so re-start RTO timer */
/* the timer must be killed first: ccid2_start_rto_timer() BUG()s on a pending timer */
ccid2_hc_tx_kill_rto_timer(hctx);
ccid2_start_rto_timer(sk);
}
/*
 * Take one packet out of the pipe count; BUG()s if the count goes negative.
 * When the count reaches zero the RTO timer is cancelled.
 */
static void ccid2_hc_tx_dec_pipe(struct ccid2_hc_tx_sock *hctx)
{
	hctx->ccid2hctx_pipe -= 1;
	BUG_ON(hctx->ccid2hctx_pipe < 0);

	if (hctx->ccid2hctx_pipe != 0)
		return;

	ccid2_hc_tx_kill_rto_timer(hctx);
}
static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
u64 ackno, seqno;
struct ccid2_seq *seqp;
unsigned char *vector;
unsigned char veclen;
int offset = 0;
int done = 0;
int loss = 0;
unsigned int maxincr = 0;
ccid2_hc_tx_check_sanity(hctx);
/* check reverse path congestion */
seqno = DCCP_SKB_CB(skb)->dccpd_seq;
/* XXX this whole "algorithm" is broken. Need to fix it to keep track
* of the seqnos of the dupacks so that rpseq and rpdupack are correct
* -sorbo.
*/
/* need to bootstrap */
if (hctx->ccid2hctx_rpdupack == -1) {
hctx->ccid2hctx_rpdupack = 0;