ppp: ppp_mp_explode() redesign

I found that the PPP subsystem does not work properly when channels
with different speeds are connected to the same bundle.

Problem Description:

As the "ppp_mp_explode" function fragments the sk_buff buffer evenly
among the PPP channels that are connected to a certain PPP unit to
make up a bundle, if we are transmitting using an upper layer protocol
that requires an Ack before sending the next packet (like TCP/IP for
example), we will have a bandwidth bottleneck on the slowest channel
of the bundle.

Let's clarify by an example. Let's consider a scenario where we have
two PPP links making up a bundle: a slow link (10KB/sec) and a fast
link (1000KB/sec) working at the best (full bandwidth). On the top we
have a TCP/IP stack sending a 1000 Bytes sk_buff buffer down to the
PPP subsystem. The "ppp_mp_explode" function will divide the buffer into
two fragments of 500B each (we are neglecting all the headers, crc,
flags etc.). Before the TCP/IP stack sends out the next buffer, it
will have to wait for the ACK response from the remote peer, so it
will have to wait for both fragments to have been sent over the two
PPP links, received by the remote peer and reconstructed. The
resulting behaviour is that, rather than having a bundle working
at 1010KB/sec (the sum of the channels' bandwidths), we'll have a bundle
working at 20KB/sec (twice the slowest channel's bandwidth).


Problem Solution:

The problem has been solved by redesigning the "ppp_mp_explode"
function in such a way to make it split the sk_buff buffer according
to the speeds of the underlying PPP channels (the speeds of the serial
interfaces respectively attached to the PPP channels). Referring to
the above example, the redesigned "ppp_mp_explode" function will now
divide the 1000 Bytes buffer into two fragments whose sizes are set
according to the speeds of the channels they are going to be
sent on (e.g. 10 Bytes on the 10KB/sec channel and 990 Bytes on the
1000KB/sec channel).  The reworked function delivers the same
performance as the original one in optimal working conditions (i.e. a
bundle made up of PPP links all working at the same speed), while
greatly improving performance on bundles made up of channels
working at different speeds.

Signed-off-by: Gabriele Paoloni <gabriele.paoloni@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Gabriele Paoloni 2009-03-13 16:09:12 -07:00 committed by David S. Miller
parent a2025b8b10
commit 9c705260fe
4 changed files with 126 additions and 91 deletions

View File

@ -157,6 +157,7 @@ ppp_asynctty_open(struct tty_struct *tty)
{ {
struct asyncppp *ap; struct asyncppp *ap;
int err; int err;
int speed;
if (tty->ops->write == NULL) if (tty->ops->write == NULL)
return -EOPNOTSUPP; return -EOPNOTSUPP;
@ -187,6 +188,8 @@ ppp_asynctty_open(struct tty_struct *tty)
ap->chan.private = ap; ap->chan.private = ap;
ap->chan.ops = &async_ops; ap->chan.ops = &async_ops;
ap->chan.mtu = PPP_MRU; ap->chan.mtu = PPP_MRU;
speed = tty_get_baud_rate(tty);
ap->chan.speed = speed;
err = ppp_register_channel(&ap->chan); err = ppp_register_channel(&ap->chan);
if (err) if (err)
goto out_free; goto out_free;

View File

@ -167,6 +167,7 @@ struct channel {
u8 avail; /* flag used in multilink stuff */ u8 avail; /* flag used in multilink stuff */
u8 had_frag; /* >= 1 fragments have been sent */ u8 had_frag; /* >= 1 fragments have been sent */
u32 lastseq; /* MP: last sequence # received */ u32 lastseq; /* MP: last sequence # received */
int speed; /* speed of the corresponding ppp channel*/
#endif /* CONFIG_PPP_MULTILINK */ #endif /* CONFIG_PPP_MULTILINK */
}; };
@ -1307,138 +1308,181 @@ ppp_push(struct ppp *ppp)
*/ */
static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb) static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
{ {
int len, fragsize; int len, totlen;
int i, bits, hdrlen, mtu; int i, bits, hdrlen, mtu;
int flen; int flen;
int navail, nfree; int navail, nfree, nzero;
int nbigger; int nbigger;
int totspeed;
int totfree;
unsigned char *p, *q; unsigned char *p, *q;
struct list_head *list; struct list_head *list;
struct channel *pch; struct channel *pch;
struct sk_buff *frag; struct sk_buff *frag;
struct ppp_channel *chan; struct ppp_channel *chan;
nfree = 0; /* # channels which have no packet already queued */ totspeed = 0; /*total bitrate of the bundle*/
nfree = 0; /* # channels which have no packet already queued */
navail = 0; /* total # of usable channels (not deregistered) */ navail = 0; /* total # of usable channels (not deregistered) */
nzero = 0; /* number of channels with zero speed associated*/
totfree = 0; /*total # of channels available and
*having no queued packets before
*starting the fragmentation*/
hdrlen = (ppp->flags & SC_MP_XSHORTSEQ)? MPHDRLEN_SSN: MPHDRLEN; hdrlen = (ppp->flags & SC_MP_XSHORTSEQ)? MPHDRLEN_SSN: MPHDRLEN;
i = 0; i = 0;
list_for_each_entry(pch, &ppp->channels, clist) { list_for_each_entry(pch, &ppp->channels, clist) {
navail += pch->avail = (pch->chan != NULL); navail += pch->avail = (pch->chan != NULL);
if (pch->avail) { pch->speed = pch->chan->speed;
if (pch->avail) {
if (skb_queue_empty(&pch->file.xq) || if (skb_queue_empty(&pch->file.xq) ||
!pch->had_frag) { !pch->had_frag) {
pch->avail = 2; if (pch->speed == 0)
++nfree; nzero++;
} else
if (!pch->had_frag && i < ppp->nxchan) totspeed += pch->speed;
ppp->nxchan = i;
pch->avail = 2;
++nfree;
++totfree;
}
if (!pch->had_frag && i < ppp->nxchan)
ppp->nxchan = i;
} }
++i; ++i;
} }
/* /*
* Don't start sending this packet unless at least half of * Don't start sending this packet unless at least half of
* the channels are free. This gives much better TCP * the channels are free. This gives much better TCP
* performance if we have a lot of channels. * performance if we have a lot of channels.
*/ */
if (nfree == 0 || nfree < navail / 2) if (nfree == 0 || nfree < navail / 2)
return 0; /* can't take now, leave it in xmit_pending */ return 0; /* can't take now, leave it in xmit_pending */
/* Do protocol field compression (XXX this should be optional) */ /* Do protocol field compression (XXX this should be optional) */
p = skb->data; p = skb->data;
len = skb->len; len = skb->len;
if (*p == 0) { if (*p == 0) {
++p; ++p;
--len; --len;
} }
/* totlen = len;
* Decide on fragment size. nbigger = len % nfree;
* We create a fragment for each free channel regardless of
* how small they are (i.e. even 0 length) in order to minimize
* the time that it will take to detect when a channel drops
* a fragment.
*/
fragsize = len;
if (nfree > 1)
fragsize = DIV_ROUND_UP(fragsize, nfree);
/* nbigger channels get fragsize bytes, the rest get fragsize-1,
except if nbigger==0, then they all get fragsize. */
nbigger = len % nfree;
/* skip to the channel after the one we last used /* skip to the channel after the one we last used
and start at that one */ and start at that one */
list = &ppp->channels; list = &ppp->channels;
for (i = 0; i < ppp->nxchan; ++i) { for (i = 0; i < ppp->nxchan; ++i) {
list = list->next; list = list->next;
if (list == &ppp->channels) { if (list == &ppp->channels) {
i = 0; i = 0;
break; break;
} }
} }
/* create a fragment for each channel */ /* create a fragment for each channel */
bits = B; bits = B;
while (nfree > 0 || len > 0) { while (nfree > 0 && len > 0) {
list = list->next; list = list->next;
if (list == &ppp->channels) { if (list == &ppp->channels) {
i = 0; i = 0;
continue; continue;
} }
pch = list_entry(list, struct channel, clist); pch = list_entry(list, struct channel, clist);
++i; ++i;
if (!pch->avail) if (!pch->avail)
continue; continue;
/* /*
* Skip this channel if it has a fragment pending already and * Skip this channel if it has a fragment pending already and
* we haven't given a fragment to all of the free channels. * we haven't given a fragment to all of the free channels.
*/ */
if (pch->avail == 1) { if (pch->avail == 1) {
if (nfree > 0) if (nfree > 0)
continue; continue;
} else { } else {
--nfree;
pch->avail = 1; pch->avail = 1;
} }
/* check the channel's mtu and whether it is still attached. */ /* check the channel's mtu and whether it is still attached. */
spin_lock_bh(&pch->downl); spin_lock_bh(&pch->downl);
if (pch->chan == NULL) { if (pch->chan == NULL) {
/* can't use this channel, it's being deregistered */ /* can't use this channel, it's being deregistered */
if (pch->speed == 0)
nzero--;
else
totspeed -= pch->speed;
spin_unlock_bh(&pch->downl); spin_unlock_bh(&pch->downl);
pch->avail = 0; pch->avail = 0;
if (--navail == 0) totlen = len;
totfree--;
nfree--;
if (--navail == 0)
break; break;
continue; continue;
} }
/* /*
* Create a fragment for this channel of *if the channel speed is not set divide
* min(max(mtu+2-hdrlen, 4), fragsize, len) bytes. *the packet evenly among the free channels;
* If mtu+2-hdrlen < 4, that is a ridiculously small *otherwise divide it according to the speed
* MTU, so we use mtu = 2 + hdrlen. *of the channel we are going to transmit on
*/
if (pch->speed == 0) {
flen = totlen/nfree ;
if (nbigger > 0) {
flen++;
nbigger--;
}
} else {
flen = (((totfree - nzero)*(totlen + hdrlen*totfree)) /
((totspeed*totfree)/pch->speed)) - hdrlen;
if (nbigger > 0) {
flen += ((totfree - nzero)*pch->speed)/totspeed;
nbigger -= ((totfree - nzero)*pch->speed)/
totspeed;
}
}
nfree--;
/*
*check if we are on the last channel or
*we exceded the lenght of the data to
*fragment
*/ */
if (fragsize > len) if ((nfree == 0) || (flen > len))
fragsize = len; flen = len;
flen = fragsize; /*
mtu = pch->chan->mtu + 2 - hdrlen; *it is not worth to tx on slow channels:
if (mtu < 4) *in that case from the resulting flen according to the
mtu = 4; *above formula will be equal or less than zero.
*Skip the channel in this case
*/
if (flen <= 0) {
pch->avail = 2;
spin_unlock_bh(&pch->downl);
continue;
}
mtu = pch->chan->mtu + 2 - hdrlen;
if (mtu < 4)
mtu = 4;
if (flen > mtu) if (flen > mtu)
flen = mtu; flen = mtu;
if (flen == len && nfree == 0) if (flen == len)
bits |= E; bits |= E;
frag = alloc_skb(flen + hdrlen + (flen == 0), GFP_ATOMIC); frag = alloc_skb(flen + hdrlen + (flen == 0), GFP_ATOMIC);
if (!frag) if (!frag)
goto noskb; goto noskb;
q = skb_put(frag, flen + hdrlen); q = skb_put(frag, flen + hdrlen);
/* make the MP header */ /* make the MP header */
q[0] = PPP_MP >> 8; q[0] = PPP_MP >> 8;
q[1] = PPP_MP; q[1] = PPP_MP;
if (ppp->flags & SC_MP_XSHORTSEQ) { if (ppp->flags & SC_MP_XSHORTSEQ) {
q[2] = bits + ((ppp->nxseq >> 8) & 0xf); q[2] = bits + ((ppp->nxseq >> 8) & 0xf);
q[3] = ppp->nxseq; q[3] = ppp->nxseq;
} else { } else {
q[2] = bits; q[2] = bits;
@ -1447,43 +1491,28 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
q[5] = ppp->nxseq; q[5] = ppp->nxseq;
} }
/* memcpy(q + hdrlen, p, flen);
* Copy the data in.
* Unfortunately there is a bug in older versions of
* the Linux PPP multilink reconstruction code where it
* drops 0-length fragments. Therefore we make sure the
* fragment has at least one byte of data. Any bytes
* we add in this situation will end up as padding on the
* end of the reconstructed packet.
*/
if (flen == 0)
*skb_put(frag, 1) = 0;
else
memcpy(q + hdrlen, p, flen);
/* try to send it down the channel */ /* try to send it down the channel */
chan = pch->chan; chan = pch->chan;
if (!skb_queue_empty(&pch->file.xq) || if (!skb_queue_empty(&pch->file.xq) ||
!chan->ops->start_xmit(chan, frag)) !chan->ops->start_xmit(chan, frag))
skb_queue_tail(&pch->file.xq, frag); skb_queue_tail(&pch->file.xq, frag);
pch->had_frag = 1; pch->had_frag = 1;
p += flen; p += flen;
len -= flen; len -= flen;
++ppp->nxseq; ++ppp->nxseq;
bits = 0; bits = 0;
spin_unlock_bh(&pch->downl); spin_unlock_bh(&pch->downl);
if (--nbigger == 0 && fragsize > 0)
--fragsize;
} }
ppp->nxchan = i; ppp->nxchan = i;
return 1; return 1;
noskb: noskb:
spin_unlock_bh(&pch->downl); spin_unlock_bh(&pch->downl);
if (ppp->debug & 1) if (ppp->debug & 1)
printk(KERN_ERR "PPP: no memory (fragment)\n"); printk(KERN_ERR "PPP: no memory (fragment)\n");
++ppp->dev->stats.tx_errors; ++ppp->dev->stats.tx_errors;
++ppp->nxseq; ++ppp->nxseq;
return 1; /* abandon the frame */ return 1; /* abandon the frame */

View File

@ -206,6 +206,7 @@ ppp_sync_open(struct tty_struct *tty)
{ {
struct syncppp *ap; struct syncppp *ap;
int err; int err;
int speed;
if (tty->ops->write == NULL) if (tty->ops->write == NULL)
return -EOPNOTSUPP; return -EOPNOTSUPP;
@ -234,6 +235,8 @@ ppp_sync_open(struct tty_struct *tty)
ap->chan.ops = &sync_ops; ap->chan.ops = &sync_ops;
ap->chan.mtu = PPP_MRU; ap->chan.mtu = PPP_MRU;
ap->chan.hdrlen = 2; /* for A/C bytes */ ap->chan.hdrlen = 2; /* for A/C bytes */
speed = tty_get_baud_rate(tty);
ap->chan.speed = speed;
err = ppp_register_channel(&ap->chan); err = ppp_register_channel(&ap->chan);
if (err) if (err)
goto out_free; goto out_free;

View File

@ -40,8 +40,8 @@ struct ppp_channel {
int mtu; /* max transmit packet size */ int mtu; /* max transmit packet size */
int hdrlen; /* amount of headroom channel needs */ int hdrlen; /* amount of headroom channel needs */
void *ppp; /* opaque to channel */ void *ppp; /* opaque to channel */
/* the following are not used at present */
int speed; /* transfer rate (bytes/second) */ int speed; /* transfer rate (bytes/second) */
/* the following is not used at present */
int latency; /* overhead time in milliseconds */ int latency; /* overhead time in milliseconds */
}; };