find -type f | xargs sed -i 's/[\t ]*$//g' # Yes, again. Note the star in the regex.
[qemu] / slirp / tcp_input.c
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
3  *      The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by the University of
16  *      California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *      @(#)tcp_input.c 8.5 (Berkeley) 4/10/94
34  * tcp_input.c,v 1.10 1994/10/13 18:36:32 wollman Exp
35  */
36
37 /*
38  * Changes and additions relating to SLiRP
39  * Copyright (c) 1995 Danny Gasparovski.
40  *
41  * Please read the file COPYRIGHT for the
42  * terms and conditions of the copyright.
43  */
44
45 #include <slirp.h>
46 #include "ip_icmp.h"
47
48 struct socket tcb;   /* socket list handed to solookup() by tcp_input; also the initial tcp_last_so */
49
50 int     tcprexmtthresh = 3;   /* NOTE(review): not referenced in the visible code; presumably the duplicate-ACK threshold for fast retransmit -- confirm */
51 struct  socket *tcp_last_so = &tcb;   /* last socket matched by tcp_input; compared first, before falling back to solookup() */
52
53 tcp_seq tcp_iss;                /* initial send sequence number: copied into tp->iss for a new connection, then advanced by TCP_ISSINCR/2 */
54
55 #define TCP_PAWS_IDLE   (24 * 24 * 60 * 60 * PR_SLOWHZ)   /* NOTE(review): 24 days if PR_SLOWHZ is ticks per second -- confirm; unused in the visible code */
56
57 /*
   * Wraparound-safe ordering of timestamps: the difference is taken
   * first and its sign (after conversion to int) decides the result.
   * Not referenced by the visible code, where timestamp handling is
   * commented out.
   */
58 #define TSTMP_LT(a,b)   ((int)((a)-(b)) < 0)
59 #define TSTMP_GEQ(a,b)  ((int)((a)-(b)) >= 0)
60
/*
 * TCP_REASS(tp, ti, m, so, flags)
 *
 * Hand the data segment described by ti (stored in mbuf m) to the
 * receive side of connection tp / socket so, and store the resulting
 * FIN indication in flags.
 *
 * Fast path: when the segment starts exactly at rcv_nxt, the reassembly
 * queue is empty (seg_next points back at tp) and the connection is
 * ESTABLISHED, the data is appended to the socket directly (through
 * tcp_emu() first when so_emu is set), rcv_nxt and the receive
 * statistics are advanced, and flags becomes (ti_flags & TH_FIN).
 * No queue insertion or removal takes place.
 *
 * Slow path: everything else goes through tcp_reass(), and an
 * immediate ACK is requested so that the peer's fast retransmit
 * can work on out-of-order data.
 *
 * ACK policy on the fast path is selected at compile time:
 *   - TCP_ACK_HACK defined: TF_ACKNOW if the segment carries TH_PUSH,
 *     TF_DELACK otherwise;
 *   - otherwise: always TF_DELACK.
 *
 * Notes for callers:
 *   - flags must be an lvalue.
 *   - Arguments are evaluated more than once; pass side-effect-free
 *     expressions only.
 *   - The expansion is a plain brace block (not do { } while (0)), as
 *     before, so existing call sites compile unchanged.
 *   - sorwakeup() is intentionally not called here.
 */
#ifdef TCP_ACK_HACK
#define TCP_REASS_ACK_INORDER(tp, ti) \
        ((tp)->t_flags |= (((ti)->ti_flags & TH_PUSH) ? TF_ACKNOW : TF_DELACK))
#else
#define TCP_REASS_ACK_INORDER(tp, ti) \
        ((tp)->t_flags |= TF_DELACK)
#endif

#define TCP_REASS(tp, ti, m, so, flags) { \
        if ((ti)->ti_seq == (tp)->rcv_nxt && \
            (tp)->seg_next == (tcpiphdrp_32)(tp) && \
            (tp)->t_state == TCPS_ESTABLISHED) { \
                TCP_REASS_ACK_INORDER(tp, ti); \
                (tp)->rcv_nxt += (ti)->ti_len; \
                (flags) = (ti)->ti_flags & TH_FIN; \
                tcpstat.tcps_rcvpack++; \
                tcpstat.tcps_rcvbyte += (ti)->ti_len; \
                if ((so)->so_emu) { \
                        if (tcp_emu((so), (m))) \
                                sbappend((so), (m)); \
                } else { \
                        sbappend((so), (m)); \
                } \
        } else { \
                (flags) = tcp_reass((tp), (ti), (m)); \
                (tp)->t_flags |= TF_ACKNOW; \
        } \
}
115
116 int
117 tcp_reass(tp, ti, m)
118         register struct tcpcb *tp;
119         register struct tcpiphdr *ti;
120         struct mbuf *m;
121 {
122         register struct tcpiphdr *q;
123         struct socket *so = tp->t_socket;
124         int flags;
125
126         /*
127          * Call with ti==0 after become established to
128          * force pre-ESTABLISHED data up to user socket.
129          */
130         if (ti == 0)
131                 goto present;
132
133         /*
134          * Find a segment which begins after this one does.
135          */
136         for (q = (struct tcpiphdr *)tp->seg_next; q != (struct tcpiphdr *)tp;
137             q = (struct tcpiphdr *)q->ti_next)
138                 if (SEQ_GT(q->ti_seq, ti->ti_seq))
139                         break;
140
141         /*
142          * If there is a preceding segment, it may provide some of
143          * our data already.  If so, drop the data from the incoming
144          * segment.  If it provides all of our data, drop us.
145          */
146         if ((struct tcpiphdr *)q->ti_prev != (struct tcpiphdr *)tp) {
147                 register int i;
148                 q = (struct tcpiphdr *)q->ti_prev;
149                 /* conversion to int (in i) handles seq wraparound */
150                 i = q->ti_seq + q->ti_len - ti->ti_seq;
151                 if (i > 0) {
152                         if (i >= ti->ti_len) {
153                                 tcpstat.tcps_rcvduppack++;
154                                 tcpstat.tcps_rcvdupbyte += ti->ti_len;
155                                 m_freem(m);
156                                 /*
157                                  * Try to present any queued data
158                                  * at the left window edge to the user.
159                                  * This is needed after the 3-WHS
160                                  * completes.
161                                  */
162                                 goto present;   /* ??? */
163                         }
164                         m_adj(m, i);
165                         ti->ti_len -= i;
166                         ti->ti_seq += i;
167                 }
168                 q = (struct tcpiphdr *)(q->ti_next);
169         }
170         tcpstat.tcps_rcvoopack++;
171         tcpstat.tcps_rcvoobyte += ti->ti_len;
172         REASS_MBUF(ti) = (mbufp_32) m;          /* XXX */
173
174         /*
175          * While we overlap succeeding segments trim them or,
176          * if they are completely covered, dequeue them.
177          */
178         while (q != (struct tcpiphdr *)tp) {
179                 register int i = (ti->ti_seq + ti->ti_len) - q->ti_seq;
180                 if (i <= 0)
181                         break;
182                 if (i < q->ti_len) {
183                         q->ti_seq += i;
184                         q->ti_len -= i;
185                         m_adj((struct mbuf *) REASS_MBUF(q), i);
186                         break;
187                 }
188                 q = (struct tcpiphdr *)q->ti_next;
189                 m = (struct mbuf *) REASS_MBUF((struct tcpiphdr *)q->ti_prev);
190                 remque_32((void *)(q->ti_prev));
191                 m_freem(m);
192         }
193
194         /*
195          * Stick new segment in its place.
196          */
197         insque_32(ti, (void *)(q->ti_prev));
198
199 present:
200         /*
201          * Present data to user, advancing rcv_nxt through
202          * completed sequence space.
203          */
204         if (!TCPS_HAVEESTABLISHED(tp->t_state))
205                 return (0);
206         ti = (struct tcpiphdr *) tp->seg_next;
207         if (ti == (struct tcpiphdr *)tp || ti->ti_seq != tp->rcv_nxt)
208                 return (0);
209         if (tp->t_state == TCPS_SYN_RECEIVED && ti->ti_len)
210                 return (0);
211         do {
212                 tp->rcv_nxt += ti->ti_len;
213                 flags = ti->ti_flags & TH_FIN;
214                 remque_32(ti);
215                 m = (struct mbuf *) REASS_MBUF(ti); /* XXX */
216                 ti = (struct tcpiphdr *)ti->ti_next;
217 /*              if (so->so_state & SS_FCANTRCVMORE) */
218                 if (so->so_state & SS_FCANTSENDMORE)
219                         m_freem(m);
220                 else {
221                         if (so->so_emu) {
222                                 if (tcp_emu(so,m)) sbappend(so, m);
223                         } else
224                                 sbappend(so, m);
225                 }
226         } while (ti != (struct tcpiphdr *)tp && ti->ti_seq == tp->rcv_nxt);
227 /*      sorwakeup(so); */
228         return (flags);
229 }
230
231 /*
232  * TCP input routine, follows pages 65-76 of the
233  * protocol specification dated September, 1981 very closely.
234  */
235 void
236 tcp_input(m, iphlen, inso)
237         register struct mbuf *m;
238         int iphlen;
239         struct socket *inso;
240 {
241         struct ip save_ip, *ip;
242         register struct tcpiphdr *ti;
243         caddr_t optp = NULL;
244         int optlen = 0;
245         int len, tlen, off;
246         register struct tcpcb *tp = 0;
247         register int tiflags;
248         struct socket *so = 0;
249         int todrop, acked, ourfinisacked, needoutput = 0;
250 /*      int dropsocket = 0; */
251         int iss = 0;
252         u_long tiwin;
253         int ret;
254 /*      int ts_present = 0; */
255
256         DEBUG_CALL("tcp_input");
257         DEBUG_ARGS((dfd," m = %8lx  iphlen = %2d  inso = %lx\n",
258                     (long )m, iphlen, (long )inso ));
259
260         /*
261          * If called with m == 0, then we're continuing the connect
262          */
263         if (m == NULL) {
264                 so = inso;
265
266                 /* Re-set a few variables */
267                 tp = sototcpcb(so);
268                 m = so->so_m;
269                 so->so_m = 0;
270                 ti = so->so_ti;
271                 tiwin = ti->ti_win;
272                 tiflags = ti->ti_flags;
273
274                 goto cont_conn;
275         }
276
277
278         tcpstat.tcps_rcvtotal++;
279         /*
280          * Get IP and TCP header together in first mbuf.
281          * Note: IP leaves IP header in first mbuf.
282          */
283         ti = mtod(m, struct tcpiphdr *);
284         if (iphlen > sizeof(struct ip )) {
285           ip_stripoptions(m, (struct mbuf *)0);
286           iphlen=sizeof(struct ip );
287         }
288         /* XXX Check if too short */
289
290
291         /*
292          * Save a copy of the IP header in case we want restore it
293          * for sending an ICMP error message in response.
294          */
295         ip=mtod(m, struct ip *);
296         save_ip = *ip;
297         save_ip.ip_len+= iphlen;
298
299         /*
300          * Checksum extended TCP header and data.
301          */
302         tlen = ((struct ip *)ti)->ip_len;
303         ti->ti_next = ti->ti_prev = 0;
304         ti->ti_x1 = 0;
305         ti->ti_len = htons((u_int16_t)tlen);
306         len = sizeof(struct ip ) + tlen;
307         /* keep checksum for ICMP reply
308          * ti->ti_sum = cksum(m, len);
309          * if (ti->ti_sum) { */
310         if(cksum(m, len)) {
311           tcpstat.tcps_rcvbadsum++;
312           goto drop;
313         }
314
315         /*
316          * Check that TCP offset makes sense,
317          * pull out TCP options and adjust length.              XXX
318          */
319         off = ti->ti_off << 2;
320         if (off < sizeof (struct tcphdr) || off > tlen) {
321           tcpstat.tcps_rcvbadoff++;
322           goto drop;
323         }
324         tlen -= off;
325         ti->ti_len = tlen;
326         if (off > sizeof (struct tcphdr)) {
327           optlen = off - sizeof (struct tcphdr);
328           optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
329
330                 /*
331                  * Do quick retrieval of timestamp options ("options
332                  * prediction?").  If timestamp is the only option and it's
333                  * formatted as recommended in RFC 1323 appendix A, we
334                  * quickly get the values now and not bother calling
335                  * tcp_dooptions(), etc.
336                  */
337 /*              if ((optlen == TCPOLEN_TSTAMP_APPA ||
338  *                   (optlen > TCPOLEN_TSTAMP_APPA &&
339  *                      optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
340  *                   *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
341  *                   (ti->ti_flags & TH_SYN) == 0) {
342  *                      ts_present = 1;
343  *                      ts_val = ntohl(*(u_int32_t *)(optp + 4));
344  *                      ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
345  *                      optp = NULL;   / * we've parsed the options * /
346  *              }
347  */
348         }
349         tiflags = ti->ti_flags;
350
351         /*
352          * Convert TCP protocol specific fields to host format.
353          */
354         NTOHL(ti->ti_seq);
355         NTOHL(ti->ti_ack);
356         NTOHS(ti->ti_win);
357         NTOHS(ti->ti_urp);
358
359         /*
360          * Drop TCP, IP headers and TCP options.
361          */
362         m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
363         m->m_len  -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
364
365         /*
366          * Locate pcb for segment.
367          */
368 findso:
369         so = tcp_last_so;
370         if (so->so_fport != ti->ti_dport ||
371             so->so_lport != ti->ti_sport ||
372             so->so_laddr.s_addr != ti->ti_src.s_addr ||
373             so->so_faddr.s_addr != ti->ti_dst.s_addr) {
374                 so = solookup(&tcb, ti->ti_src, ti->ti_sport,
375                                ti->ti_dst, ti->ti_dport);
376                 if (so)
377                         tcp_last_so = so;
378                 ++tcpstat.tcps_socachemiss;
379         }
380
381         /*
382          * If the state is CLOSED (i.e., TCB does not exist) then
383          * all data in the incoming segment is discarded.
384          * If the TCB exists but is in CLOSED state, it is embryonic,
385          * but should either do a listen or a connect soon.
386          *
387          * state == CLOSED means we've done socreate() but haven't
388          * attached it to a protocol yet...
389          *
390          * XXX If a TCB does not exist, and the TH_SYN flag is
391          * the only flag set, then create a session, mark it
392          * as if it was LISTENING, and continue...
393          */
394         if (so == 0) {
395           if ((tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) != TH_SYN)
396             goto dropwithreset;
397
398           if ((so = socreate()) == NULL)
399             goto dropwithreset;
400           if (tcp_attach(so) < 0) {
401             free(so); /* Not sofree (if it failed, it's not insqued) */
402             goto dropwithreset;
403           }
404
405           sbreserve(&so->so_snd, tcp_sndspace);
406           sbreserve(&so->so_rcv, tcp_rcvspace);
407
408           /*            tcp_last_so = so; */  /* XXX ? */
409           /*            tp = sototcpcb(so);    */
410
411           so->so_laddr = ti->ti_src;
412           so->so_lport = ti->ti_sport;
413           so->so_faddr = ti->ti_dst;
414           so->so_fport = ti->ti_dport;
415
416           if ((so->so_iptos = tcp_tos(so)) == 0)
417             so->so_iptos = ((struct ip *)ti)->ip_tos;
418
419           tp = sototcpcb(so);
420           tp->t_state = TCPS_LISTEN;
421         }
422
423         /*
424          * If this is a still-connecting socket, this probably
425          * a retransmit of the SYN.  Whether it's a retransmit SYN
426          * or something else, we nuke it.
427          */
428         if (so->so_state & SS_ISFCONNECTING)
429                 goto drop;
430
431         tp = sototcpcb(so);
432
433         /* XXX Should never fail */
434         if (tp == 0)
435                 goto dropwithreset;
436         if (tp->t_state == TCPS_CLOSED)
437                 goto drop;
438
439         /* Unscale the window into a 32-bit value. */
440 /*      if ((tiflags & TH_SYN) == 0)
441  *              tiwin = ti->ti_win << tp->snd_scale;
442  *      else
443  */
444                 tiwin = ti->ti_win;
445
446         /*
447          * Segment received on connection.
448          * Reset idle time and keep-alive timer.
449          */
450         tp->t_idle = 0;
451         if (so_options)
452            tp->t_timer[TCPT_KEEP] = tcp_keepintvl;
453         else
454            tp->t_timer[TCPT_KEEP] = tcp_keepidle;
455
456         /*
457          * Process options if not in LISTEN state,
458          * else do it below (after getting remote address).
459          */
460         if (optp && tp->t_state != TCPS_LISTEN)
461                 tcp_dooptions(tp, (u_char *)optp, optlen, ti);
462 /* , */
463 /*                      &ts_present, &ts_val, &ts_ecr); */
464
465         /*
466          * Header prediction: check for the two common cases
467          * of a uni-directional data xfer.  If the packet has
468          * no control flags, is in-sequence, the window didn't
469          * change and we're not retransmitting, it's a
470          * candidate.  If the length is zero and the ack moved
471          * forward, we're the sender side of the xfer.  Just
472          * free the data acked & wake any higher level process
473          * that was blocked waiting for space.  If the length
474          * is non-zero and the ack didn't move, we're the
475          * receiver side.  If we're getting packets in-order
476          * (the reassembly queue is empty), add the data to
477          * the socket buffer and note that we need a delayed ack.
478          *
479          * XXX Some of these tests are not needed
480          * eg: the tiwin == tp->snd_wnd prevents many more
481          * predictions.. with no *real* advantage..
482          */
483         if (tp->t_state == TCPS_ESTABLISHED &&
484             (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
485 /*          (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) && */
486             ti->ti_seq == tp->rcv_nxt &&
487             tiwin && tiwin == tp->snd_wnd &&
488             tp->snd_nxt == tp->snd_max) {
489                 /*
490                  * If last ACK falls within this segment's sequence numbers,
491                  *  record the timestamp.
492                  */
493 /*              if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
494  *                 SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len)) {
495  *                      tp->ts_recent_age = tcp_now;
496  *                      tp->ts_recent = ts_val;
497  *              }
498  */
499                 if (ti->ti_len == 0) {
500                         if (SEQ_GT(ti->ti_ack, tp->snd_una) &&
501                             SEQ_LEQ(ti->ti_ack, tp->snd_max) &&
502                             tp->snd_cwnd >= tp->snd_wnd) {
503                                 /*
504                                  * this is a pure ack for outstanding data.
505                                  */
506                                 ++tcpstat.tcps_predack;
507 /*                              if (ts_present)
508  *                                      tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
509  *                              else
510  */                                  if (tp->t_rtt &&
511                                             SEQ_GT(ti->ti_ack, tp->t_rtseq))
512                                         tcp_xmit_timer(tp, tp->t_rtt);
513                                 acked = ti->ti_ack - tp->snd_una;
514                                 tcpstat.tcps_rcvackpack++;
515                                 tcpstat.tcps_rcvackbyte += acked;
516                                 sbdrop(&so->so_snd, acked);
517                                 tp->snd_una = ti->ti_ack;
518                                 m_freem(m);
519
520                                 /*
521                                  * If all outstanding data are acked, stop
522                                  * retransmit timer, otherwise restart timer
523                                  * using current (possibly backed-off) value.
524                                  * If process is waiting for space,
525                                  * wakeup/selwakeup/signal.  If data
526                                  * are ready to send, let tcp_output
527                                  * decide between more output or persist.
528                                  */
529                                 if (tp->snd_una == tp->snd_max)
530                                         tp->t_timer[TCPT_REXMT] = 0;
531                                 else if (tp->t_timer[TCPT_PERSIST] == 0)
532                                         tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
533
534                                 /*
535                                  * There's room in so_snd, sowwakup will read()
536                                  * from the socket if we can
537                                  */
538 /*                              if (so->so_snd.sb_flags & SB_NOTIFY)
539  *                                      sowwakeup(so);
540  */
541                                 /*
542                                  * This is called because sowwakeup might have
543                                  * put data into so_snd.  Since we don't so sowwakeup,
544                                  * we don't need this.. XXX???
545                                  */
546                                 if (so->so_snd.sb_cc)
547                                         (void) tcp_output(tp);
548
549                                 return;
550                         }
551                 } else if (ti->ti_ack == tp->snd_una &&
552                     tp->seg_next == (tcpiphdrp_32)tp &&
553                     ti->ti_len <= sbspace(&so->so_rcv)) {
554                         /*
555                          * this is a pure, in-sequence data packet
556                          * with nothing on the reassembly queue and
557                          * we have enough buffer space to take it.
558                          */
559                         ++tcpstat.tcps_preddat;
560                         tp->rcv_nxt += ti->ti_len;
561                         tcpstat.tcps_rcvpack++;
562                         tcpstat.tcps_rcvbyte += ti->ti_len;
563                         /*
564                          * Add data to socket buffer.
565                          */
566                         if (so->so_emu) {
567                                 if (tcp_emu(so,m)) sbappend(so, m);
568                         } else
569                                 sbappend(so, m);
570
571                         /*
572                          * XXX This is called when data arrives.  Later, check
573                          * if we can actually write() to the socket
574                          * XXX Need to check? It's be NON_BLOCKING
575                          */
576 /*                      sorwakeup(so); */
577
578                         /*
579                          * If this is a short packet, then ACK now - with Nagel
580                          *      congestion avoidance sender won't send more until
581                          *      he gets an ACK.
582                          *
583                          * It is better to not delay acks at all to maximize
584                          * TCP throughput.  See RFC 2581.
585                          */
586                         tp->t_flags |= TF_ACKNOW;
587                         tcp_output(tp);
588                         return;
589                 }
590         } /* header prediction */
591         /*
592          * Calculate amount of space in receive window,
593          * and then do TCP input processing.
594          * Receive window is amount of space in rcv queue,
595          * but not less than advertised window.
596          */
597         { int win;
598           win = sbspace(&so->so_rcv);
599           if (win < 0)
600             win = 0;
601           tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt));
602         }
603
604         switch (tp->t_state) {
605
606         /*
607          * If the state is LISTEN then ignore segment if it contains an RST.
608          * If the segment contains an ACK then it is bad and send a RST.
609          * If it does not contain a SYN then it is not interesting; drop it.
610          * Don't bother responding if the destination was a broadcast.
611          * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
612          * tp->iss, and send a segment:
613          *     <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
614          * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
615          * Fill in remote peer address fields if not previously specified.
616          * Enter SYN_RECEIVED state, and process any other fields of this
617          * segment in this state.
618          */
619         case TCPS_LISTEN: {
620
621           if (tiflags & TH_RST)
622             goto drop;
623           if (tiflags & TH_ACK)
624             goto dropwithreset;
625           if ((tiflags & TH_SYN) == 0)
626             goto drop;
627
628           /*
629            * This has way too many gotos...
630            * But a bit of spaghetti code never hurt anybody :)
631            */
632
633           /*
634            * If this is destined for the control address, then flag to
635            * tcp_ctl once connected, otherwise connect
636            */
637           if ((so->so_faddr.s_addr&htonl(0xffffff00)) == special_addr.s_addr) {
638             int lastbyte=ntohl(so->so_faddr.s_addr) & 0xff;
639             if (lastbyte!=CTL_ALIAS && lastbyte!=CTL_DNS) {
640 #if 0
641               if(lastbyte==CTL_CMD || lastbyte==CTL_EXEC) {
642                 /* Command or exec adress */
643                 so->so_state |= SS_CTL;
644               } else
645 #endif
646               {
647                 /* May be an add exec */
648                 struct ex_list *ex_ptr;
649                 for(ex_ptr = exec_list; ex_ptr; ex_ptr = ex_ptr->ex_next) {
650                   if(ex_ptr->ex_fport == so->so_fport &&
651                      lastbyte == ex_ptr->ex_addr) {
652                     so->so_state |= SS_CTL;
653                     break;
654                   }
655                 }
656               }
657               if(so->so_state & SS_CTL) goto cont_input;
658             }
659             /* CTL_ALIAS: Do nothing, tcp_fconnect will be called on it */
660           }
661
662           if (so->so_emu & EMU_NOCONNECT) {
663             so->so_emu &= ~EMU_NOCONNECT;
664             goto cont_input;
665           }
666
667           if((tcp_fconnect(so) == -1) && (errno != EINPROGRESS) && (errno != EWOULDBLOCK)) {
668             u_char code=ICMP_UNREACH_NET;
669             DEBUG_MISC((dfd," tcp fconnect errno = %d-%s\n",
670                         errno,strerror(errno)));
671             if(errno == ECONNREFUSED) {
672               /* ACK the SYN, send RST to refuse the connection */
673               tcp_respond(tp, ti, m, ti->ti_seq+1, (tcp_seq)0,
674                           TH_RST|TH_ACK);
675             } else {
676               if(errno == EHOSTUNREACH) code=ICMP_UNREACH_HOST;
677               HTONL(ti->ti_seq);             /* restore tcp header */
678               HTONL(ti->ti_ack);
679               HTONS(ti->ti_win);
680               HTONS(ti->ti_urp);
681               m->m_data -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
682               m->m_len  += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
683               *ip=save_ip;
684               icmp_error(m, ICMP_UNREACH,code, 0,strerror(errno));
685             }
686             tp = tcp_close(tp);
687             m_free(m);
688           } else {
689             /*
690              * Haven't connected yet, save the current mbuf
691              * and ti, and return
692              * XXX Some OS's don't tell us whether the connect()
693              * succeeded or not.  So we must time it out.
694              */
695             so->so_m = m;
696             so->so_ti = ti;
697             tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
698             tp->t_state = TCPS_SYN_RECEIVED;
699           }
700           return;
701
702         cont_conn:
703           /* m==NULL
704            * Check if the connect succeeded
705            */
706           if (so->so_state & SS_NOFDREF) {
707             tp = tcp_close(tp);
708             goto dropwithreset;
709           }
710         cont_input:
711           tcp_template(tp);
712
713           if (optp)
714             tcp_dooptions(tp, (u_char *)optp, optlen, ti);
715           /* , */
716           /*                            &ts_present, &ts_val, &ts_ecr); */
717
718           if (iss)
719             tp->iss = iss;
720           else
721             tp->iss = tcp_iss;
722           tcp_iss += TCP_ISSINCR/2;
723           tp->irs = ti->ti_seq;
724           tcp_sendseqinit(tp);
725           tcp_rcvseqinit(tp);
726           tp->t_flags |= TF_ACKNOW;
727           tp->t_state = TCPS_SYN_RECEIVED;
728           tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
729           tcpstat.tcps_accepts++;
730           goto trimthenstep6;
731         } /* case TCPS_LISTEN */
732
733         /*
734          * If the state is SYN_SENT:
735          *      if seg contains an ACK, but not for our SYN, drop the input.
736          *      if seg contains a RST, then drop the connection.
737          *      if seg does not contain SYN, then drop it.
738          * Otherwise this is an acceptable SYN segment
739          *      initialize tp->rcv_nxt and tp->irs
740          *      if seg contains ack then advance tp->snd_una
741          *      if SYN has been acked change to ESTABLISHED else SYN_RCVD state
742          *      arrange for segment to be acked (eventually)
743          *      continue processing rest of data/controls, beginning with URG
744          */
745         case TCPS_SYN_SENT:
746                 if ((tiflags & TH_ACK) &&
747                     (SEQ_LEQ(ti->ti_ack, tp->iss) ||
748                      SEQ_GT(ti->ti_ack, tp->snd_max)))
749                         goto dropwithreset;
750
751                 if (tiflags & TH_RST) {
752                         if (tiflags & TH_ACK)
753                                 tp = tcp_drop(tp,0); /* XXX Check t_softerror! */
754                         goto drop;
755                 }
756
757                 if ((tiflags & TH_SYN) == 0)
758                         goto drop;
759                 if (tiflags & TH_ACK) {
760                         tp->snd_una = ti->ti_ack;
761                         if (SEQ_LT(tp->snd_nxt, tp->snd_una))
762                                 tp->snd_nxt = tp->snd_una;
763                 }
764
765                 tp->t_timer[TCPT_REXMT] = 0;
766                 tp->irs = ti->ti_seq;
767                 tcp_rcvseqinit(tp);
768                 tp->t_flags |= TF_ACKNOW;
769                 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
770                         tcpstat.tcps_connects++;
771                         soisfconnected(so);
772                         tp->t_state = TCPS_ESTABLISHED;
773
774                         /* Do window scaling on this connection? */
775 /*                      if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
776  *                              (TF_RCVD_SCALE|TF_REQ_SCALE)) {
777  *                              tp->snd_scale = tp->requested_s_scale;
778  *                              tp->rcv_scale = tp->request_r_scale;
779  *                      }
780  */
781                         (void) tcp_reass(tp, (struct tcpiphdr *)0,
782                                 (struct mbuf *)0);
783                         /*
784                          * if we didn't have to retransmit the SYN,
785                          * use its rtt as our initial srtt & rtt var.
786                          */
787                         if (tp->t_rtt)
788                                 tcp_xmit_timer(tp, tp->t_rtt);
789                 } else
790                         tp->t_state = TCPS_SYN_RECEIVED;
791
792 trimthenstep6:
793                 /*
794                  * Advance ti->ti_seq to correspond to first data byte.
795                  * If data, trim to stay within window,
796                  * dropping FIN if necessary.
797                  */
798                 ti->ti_seq++;
799                 if (ti->ti_len > tp->rcv_wnd) {
800                         todrop = ti->ti_len - tp->rcv_wnd;
801                         m_adj(m, -todrop);
802                         ti->ti_len = tp->rcv_wnd;
803                         tiflags &= ~TH_FIN;
804                         tcpstat.tcps_rcvpackafterwin++;
805                         tcpstat.tcps_rcvbyteafterwin += todrop;
806                 }
807                 tp->snd_wl1 = ti->ti_seq - 1;
808                 tp->rcv_up = ti->ti_seq;
809                 goto step6;
810         } /* switch tp->t_state */
811         /*
812          * States other than LISTEN or SYN_SENT.
813          * First check timestamp, if present.
814          * Then check that at least some bytes of segment are within
815          * receive window.  If segment begins before rcv_nxt,
816          * drop leading data (and SYN); if nothing left, just ack.
817          *
818          * RFC 1323 PAWS: If we have a timestamp reply on this segment
819          * and it's less than ts_recent, drop it.
820          */
821 /*      if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
822  *          TSTMP_LT(ts_val, tp->ts_recent)) {
823  *
824  */             /* Check to see if ts_recent is over 24 days old.  */
825 /*              if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
826  */                     /*
827  *                       * Invalidate ts_recent.  If this segment updates
828  *                       * ts_recent, the age will be reset later and ts_recent
829  *                       * will get a valid value.  If it does not, setting
830  *                       * ts_recent to zero will at least satisfy the
831  *                       * requirement that zero be placed in the timestamp
832  *                       * echo reply when ts_recent isn't valid.  The
833  *                       * age isn't reset until we get a valid ts_recent
834  *                       * because we don't want out-of-order segments to be
835  *                       * dropped when ts_recent is old.
836  *                       */
837 /*                      tp->ts_recent = 0;
838  *              } else {
839  *                      tcpstat.tcps_rcvduppack++;
840  *                      tcpstat.tcps_rcvdupbyte += ti->ti_len;
841  *                      tcpstat.tcps_pawsdrop++;
842  *                      goto dropafterack;
843  *              }
844  *      }
845  */
846
847         todrop = tp->rcv_nxt - ti->ti_seq;
848         if (todrop > 0) {
849                 if (tiflags & TH_SYN) {
850                         tiflags &= ~TH_SYN;
851                         ti->ti_seq++;
852                         if (ti->ti_urp > 1)
853                                 ti->ti_urp--;
854                         else
855                                 tiflags &= ~TH_URG;
856                         todrop--;
857                 }
858                 /*
859                  * Following if statement from Stevens, vol. 2, p. 960.
860                  */
861                 if (todrop > ti->ti_len
862                     || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) {
863                         /*
864                          * Any valid FIN must be to the left of the window.
865                          * At this point the FIN must be a duplicate or out
866                          * of sequence; drop it.
867                          */
868                         tiflags &= ~TH_FIN;
869
870                         /*
871                          * Send an ACK to resynchronize and drop any data.
872                          * But keep on processing for RST or ACK.
873                          */
874                         tp->t_flags |= TF_ACKNOW;
875                         todrop = ti->ti_len;
876                         tcpstat.tcps_rcvduppack++;
877                         tcpstat.tcps_rcvdupbyte += todrop;
878                 } else {
879                         tcpstat.tcps_rcvpartduppack++;
880                         tcpstat.tcps_rcvpartdupbyte += todrop;
881                 }
882                 m_adj(m, todrop);
883                 ti->ti_seq += todrop;
884                 ti->ti_len -= todrop;
885                 if (ti->ti_urp > todrop)
886                         ti->ti_urp -= todrop;
887                 else {
888                         tiflags &= ~TH_URG;
889                         ti->ti_urp = 0;
890                 }
891         }
892         /*
893          * If new data are received on a connection after the
894          * user processes are gone, then RST the other end.
895          */
896         if ((so->so_state & SS_NOFDREF) &&
897             tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) {
898                 tp = tcp_close(tp);
899                 tcpstat.tcps_rcvafterclose++;
900                 goto dropwithreset;
901         }
902
903         /*
904          * If segment ends after window, drop trailing data
905          * (and PUSH and FIN); if nothing left, just ACK.
906          */
907         todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
908         if (todrop > 0) {
909                 tcpstat.tcps_rcvpackafterwin++;
910                 if (todrop >= ti->ti_len) {
911                         tcpstat.tcps_rcvbyteafterwin += ti->ti_len;
912                         /*
913                          * If a new connection request is received
914                          * while in TIME_WAIT, drop the old connection
915                          * and start over if the sequence numbers
916                          * are above the previous ones.
917                          */
918                         if (tiflags & TH_SYN &&
919                             tp->t_state == TCPS_TIME_WAIT &&
920                             SEQ_GT(ti->ti_seq, tp->rcv_nxt)) {
921                                 iss = tp->rcv_nxt + TCP_ISSINCR;
922                                 tp = tcp_close(tp);
923                                 goto findso;
924                         }
925                         /*
926                          * If window is closed can only take segments at
927                          * window edge, and have to drop data and PUSH from
928                          * incoming segments.  Continue processing, but
929                          * remember to ack.  Otherwise, drop segment
930                          * and ack.
931                          */
932                         if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) {
933                                 tp->t_flags |= TF_ACKNOW;
934                                 tcpstat.tcps_rcvwinprobe++;
935                         } else
936                                 goto dropafterack;
937                 } else
938                         tcpstat.tcps_rcvbyteafterwin += todrop;
939                 m_adj(m, -todrop);
940                 ti->ti_len -= todrop;
941                 tiflags &= ~(TH_PUSH|TH_FIN);
942         }
943
944         /*
945          * If last ACK falls within this segment's sequence numbers,
946          * record its timestamp.
947          */
948 /*      if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) &&
949  *          SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len +
950  *                 ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
951  *              tp->ts_recent_age = tcp_now;
952  *              tp->ts_recent = ts_val;
953  *      }
954  */
955
956         /*
957          * If the RST bit is set examine the state:
958          *    SYN_RECEIVED STATE:
959          *      If passive open, return to LISTEN state.
960          *      If active open, inform user that connection was refused.
961          *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
962          *      Inform user that connection was reset, and close tcb.
963          *    CLOSING, LAST_ACK, TIME_WAIT STATES
964          *      Close the tcb.
965          */
966         if (tiflags&TH_RST) switch (tp->t_state) {
967
968         case TCPS_SYN_RECEIVED:
969 /*              so->so_error = ECONNREFUSED; */
970                 goto close;
971
972         case TCPS_ESTABLISHED:
973         case TCPS_FIN_WAIT_1:
974         case TCPS_FIN_WAIT_2:
975         case TCPS_CLOSE_WAIT:
976 /*              so->so_error = ECONNRESET; */
977         close:
978                 tp->t_state = TCPS_CLOSED;
979                 tcpstat.tcps_drops++;
980                 tp = tcp_close(tp);
981                 goto drop;
982
983         case TCPS_CLOSING:
984         case TCPS_LAST_ACK:
985         case TCPS_TIME_WAIT:
986                 tp = tcp_close(tp);
987                 goto drop;
988         }
989
990         /*
991          * If a SYN is in the window, then this is an
992          * error and we send an RST and drop the connection.
993          */
994         if (tiflags & TH_SYN) {
995                 tp = tcp_drop(tp,0);
996                 goto dropwithreset;
997         }
998
999         /*
1000          * If the ACK bit is off we drop the segment and return.
1001          */
1002         if ((tiflags & TH_ACK) == 0) goto drop;
1003
1004         /*
1005          * Ack processing.
1006          */
1007         switch (tp->t_state) {
1008         /*
1009          * In SYN_RECEIVED state if the ack ACKs our SYN then enter
1010          * ESTABLISHED state and continue processing, otherwise
1011          * send an RST.  una<=ack<=max
1012          */
1013         case TCPS_SYN_RECEIVED:
1014
1015                 if (SEQ_GT(tp->snd_una, ti->ti_ack) ||
1016                     SEQ_GT(ti->ti_ack, tp->snd_max))
1017                         goto dropwithreset;
1018                 tcpstat.tcps_connects++;
1019                 tp->t_state = TCPS_ESTABLISHED;
1020                 /*
1021                  * The sent SYN is ack'ed with our sequence number +1
1022                  * The first data byte already in the buffer will get
1023                  * lost if no correction is made.  This is only needed for
1024                  * SS_CTL since the buffer is empty otherwise.
1025                  * tp->snd_una++; or:
1026                  */
1027                 tp->snd_una=ti->ti_ack;
1028                 if (so->so_state & SS_CTL) {
1029                   /* So tcp_ctl reports the right state */
1030                   ret = tcp_ctl(so);
1031                   if (ret == 1) {
1032                     soisfconnected(so);
1033                     so->so_state &= ~SS_CTL;   /* success XXX */
1034                   } else if (ret == 2) {
1035                     so->so_state = SS_NOFDREF; /* CTL_CMD */
1036                   } else {
1037                     needoutput = 1;
1038                     tp->t_state = TCPS_FIN_WAIT_1;
1039                   }
1040                 } else {
1041                   soisfconnected(so);
1042                 }
1043
1044                 /* Do window scaling? */
1045 /*              if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1046  *                      (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1047  *                      tp->snd_scale = tp->requested_s_scale;
1048  *                      tp->rcv_scale = tp->request_r_scale;
1049  *              }
1050  */
1051                 (void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0);
1052                 tp->snd_wl1 = ti->ti_seq - 1;
1053                 /* Avoid ack processing; snd_una==ti_ack  =>  dup ack */
1054                 goto synrx_to_est;
1055                 /* not reached: the goto above skips the dup-ack test below */
1056
1057         /*
1058          * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
1059          * ACKs.  If the ack is in the range
1060          *      tp->snd_una < ti->ti_ack <= tp->snd_max
1061          * then advance tp->snd_una to ti->ti_ack and drop
1062          * data from the retransmission queue.  If this ACK reflects
1063          * more up to date window information we update our window information.
1064          */
1065         case TCPS_ESTABLISHED:
1066         case TCPS_FIN_WAIT_1:
1067         case TCPS_FIN_WAIT_2:
1068         case TCPS_CLOSE_WAIT:
1069         case TCPS_CLOSING:
1070         case TCPS_LAST_ACK:
1071         case TCPS_TIME_WAIT:
1072
1073                 if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) {
1074                         if (ti->ti_len == 0 && tiwin == tp->snd_wnd) {
1075                           tcpstat.tcps_rcvdupack++;
1076                           DEBUG_MISC((dfd," dup ack  m = %lx  so = %lx \n",
1077                                       (long )m, (long )so));
1078                                 /*
1079                                  * If we have outstanding data (other than
1080                                  * a window probe), this is a completely
1081                                  * duplicate ack (ie, window info didn't
1082                                  * change), the ack is the biggest we've
1083                                  * seen and we've seen exactly our rexmt
1084                                  * threshold of them, assume a packet
1085                                  * has been dropped and retransmit it.
1086                                  * Kludge snd_nxt & the congestion
1087                                  * window so we send only this one
1088                                  * packet.
1089                                  *
1090                                  * We know we're losing at the current
1091                                  * window size so do congestion avoidance
1092                                  * (set ssthresh to half the current window
1093                                  * and pull our congestion window back to
1094                                  * the new ssthresh).
1095                                  *
1096                                  * Dup acks mean that packets have left the
1097                                  * network (they're now cached at the receiver)
1098                                  * so bump cwnd by the amount in the receiver
1099                                  * to keep a constant cwnd packets in the
1100                                  * network.
1101                                  */
1102                                 if (tp->t_timer[TCPT_REXMT] == 0 ||
1103                                     ti->ti_ack != tp->snd_una)
1104                                         tp->t_dupacks = 0;
1105                                 else if (++tp->t_dupacks == tcprexmtthresh) {
1106                                         tcp_seq onxt = tp->snd_nxt;
1107                                         u_int win =
1108                                             min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1109                                                 tp->t_maxseg;
1110
1111                                         if (win < 2)
1112                                                 win = 2;
1113                                         tp->snd_ssthresh = win * tp->t_maxseg;
1114                                         tp->t_timer[TCPT_REXMT] = 0;
1115                                         tp->t_rtt = 0;
1116                                         tp->snd_nxt = ti->ti_ack;
1117                                         tp->snd_cwnd = tp->t_maxseg;
1118                                         (void) tcp_output(tp);
1119                                         tp->snd_cwnd = tp->snd_ssthresh +
1120                                                tp->t_maxseg * tp->t_dupacks;
1121                                         if (SEQ_GT(onxt, tp->snd_nxt))
1122                                                 tp->snd_nxt = onxt;
1123                                         goto drop;
1124                                 } else if (tp->t_dupacks > tcprexmtthresh) {
1125                                         tp->snd_cwnd += tp->t_maxseg;
1126                                         (void) tcp_output(tp);
1127                                         goto drop;
1128                                 }
1129                         } else
1130                                 tp->t_dupacks = 0;
1131                         break;
1132                 }
1133         synrx_to_est:
1134                 /*
1135                  * If the congestion window was inflated to account
1136                  * for the other side's cached packets, retract it.
1137                  */
1138                 if (tp->t_dupacks > tcprexmtthresh &&
1139                     tp->snd_cwnd > tp->snd_ssthresh)
1140                         tp->snd_cwnd = tp->snd_ssthresh;
1141                 tp->t_dupacks = 0;
1142                 if (SEQ_GT(ti->ti_ack, tp->snd_max)) {
1143                         tcpstat.tcps_rcvacktoomuch++;
1144                         goto dropafterack;
1145                 }
1146                 acked = ti->ti_ack - tp->snd_una;
1147                 tcpstat.tcps_rcvackpack++;
1148                 tcpstat.tcps_rcvackbyte += acked;
1149
1150                 /*
1151                  * If we have a timestamp reply, update smoothed
1152                  * round trip time.  If no timestamp is present but
1153                  * transmit timer is running and timed sequence
1154                  * number was acked, update smoothed round trip time.
1155                  * Since we now have an rtt measurement, cancel the
1156                  * timer backoff (cf., Phil Karn's retransmit alg.).
1157                  * Recompute the initial retransmit timer.
1158                  */
1159 /*              if (ts_present)
1160  *                      tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
1161  *              else
1162  */
1163                      if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
1164                         tcp_xmit_timer(tp,tp->t_rtt);
1165
1166                 /*
1167                  * If all outstanding data is acked, stop retransmit
1168                  * timer and remember to restart (more output or persist).
1169                  * If there is more data to be acked, restart retransmit
1170                  * timer, using current (possibly backed-off) value.
1171                  */
1172                 if (ti->ti_ack == tp->snd_max) {
1173                         tp->t_timer[TCPT_REXMT] = 0;
1174                         needoutput = 1;
1175                 } else if (tp->t_timer[TCPT_PERSIST] == 0)
1176                         tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
1177                 /*
1178                  * When new data is acked, open the congestion window.
1179                  * If the window gives us less than ssthresh packets
1180                  * in flight, open exponentially (maxseg per packet).
1181                  * Otherwise open linearly: maxseg per window
1182                  * (maxseg^2 / cwnd per packet).
1183                  */
1184                 {
1185                   register u_int cw = tp->snd_cwnd;
1186                   register u_int incr = tp->t_maxseg;
1187
1188                   if (cw > tp->snd_ssthresh)
1189                     incr = incr * incr / cw;
1190                   tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
1191                 }
1192                 if (acked > so->so_snd.sb_cc) {
1193                         tp->snd_wnd -= so->so_snd.sb_cc;
1194                         sbdrop(&so->so_snd, (int )so->so_snd.sb_cc);
1195                         ourfinisacked = 1;
1196                 } else {
1197                         sbdrop(&so->so_snd, acked);
1198                         tp->snd_wnd -= acked;
1199                         ourfinisacked = 0;
1200                 }
1201                 /*
1202                  * XXX sowwakeup is called when data is acked and there's room
1203                  * for more data... it should read() the socket
1204                  */
1205 /*              if (so->so_snd.sb_flags & SB_NOTIFY)
1206  *                      sowwakeup(so);
1207  */
1208                 tp->snd_una = ti->ti_ack;
1209                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
1210                         tp->snd_nxt = tp->snd_una;
1211
1212                 switch (tp->t_state) {
1213
1214                 /*
1215                  * In FIN_WAIT_1 STATE in addition to the processing
1216                  * for the ESTABLISHED state if our FIN is now acknowledged
1217                  * then enter FIN_WAIT_2.
1218                  */
1219                 case TCPS_FIN_WAIT_1:
1220                         if (ourfinisacked) {
1221                                 /*
1222                                  * If we can't receive any more
1223                                  * data, then closing user can proceed.
1224                                  * Starting the timer is contrary to the
1225                                  * specification, but if we don't get a FIN
1226                                  * we'll hang forever.
1227                                  */
1228                                 if (so->so_state & SS_FCANTRCVMORE) {
1229                                         soisfdisconnected(so);
1230                                         tp->t_timer[TCPT_2MSL] = tcp_maxidle;
1231                                 }
1232                                 tp->t_state = TCPS_FIN_WAIT_2;
1233                         }
1234                         break;
1235
1236                 /*
1237                  * In CLOSING STATE in addition to the processing for
1238                  * the ESTABLISHED state if the ACK acknowledges our FIN
1239                  * then enter the TIME-WAIT state, otherwise ignore
1240                  * the segment.
1241                  */
1242                 case TCPS_CLOSING:
1243                         if (ourfinisacked) {
1244                                 tp->t_state = TCPS_TIME_WAIT;
1245                                 tcp_canceltimers(tp);
1246                                 tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1247                                 soisfdisconnected(so);
1248                         }
1249                         break;
1250
1251                 /*
1252                  * In LAST_ACK, we may still be waiting for data to drain
1253                  * and/or to be acked, as well as for the ack of our FIN.
1254                  * If our FIN is now acknowledged, delete the TCB,
1255                  * enter the closed state and return.
1256                  */
1257                 case TCPS_LAST_ACK:
1258                         if (ourfinisacked) {
1259                                 tp = tcp_close(tp);
1260                                 goto drop;
1261                         }
1262                         break;
1263
1264                 /*
1265                  * In TIME_WAIT state the only thing that should arrive
1266                  * is a retransmission of the remote FIN.  Acknowledge
1267                  * it and restart the finack timer.
1268                  */
1269                 case TCPS_TIME_WAIT:
1270                         tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1271                         goto dropafterack;
1272                 }
1273         } /* switch(tp->t_state) */
1274
1275 step6:
1276         /*
1277          * Update window information.
1278          * Don't look at window if no ACK: TAC's send garbage on first SYN.
1279          */
1280         if ((tiflags & TH_ACK) &&
1281             (SEQ_LT(tp->snd_wl1, ti->ti_seq) ||
1282             (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) ||
1283             (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) {
1284                 /* keep track of pure window updates */
1285                 if (ti->ti_len == 0 &&
1286                     tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd)
1287                         tcpstat.tcps_rcvwinupd++;
1288                 tp->snd_wnd = tiwin;
1289                 tp->snd_wl1 = ti->ti_seq;
1290                 tp->snd_wl2 = ti->ti_ack;
1291                 if (tp->snd_wnd > tp->max_sndwnd)
1292                         tp->max_sndwnd = tp->snd_wnd;
1293                 needoutput = 1;
1294         }
1295
1296         /*
1297          * Process segments with URG.
1298          */
1299         if ((tiflags & TH_URG) && ti->ti_urp &&
1300             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1301                 /*
1302                  * This is a kludge, but if we receive and accept
1303                  * random urgent pointers, we'll crash in
1304                  * soreceive.  It's hard to imagine someone
1305                  * actually wanting to send this much urgent data.
1306                  */
1307                 if (ti->ti_urp + so->so_rcv.sb_cc > so->so_rcv.sb_datalen) {
1308                         ti->ti_urp = 0;
1309                         tiflags &= ~TH_URG;
1310                         goto dodata;
1311                 }
1312                 /*
1313                  * If this segment advances the known urgent pointer,
1314                  * then mark the data stream.  This should not happen
1315                  * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
1316                  * a FIN has been received from the remote side.
1317                  * In these states we ignore the URG.
1318                  *
1319                  * According to RFC961 (Assigned Protocols),
1320                  * the urgent pointer points to the last octet
1321                  * of urgent data.  We continue, however,
1322                  * to consider it to indicate the first octet
1323                  * of data past the urgent section as the original
1324                  * spec states (in one of two places).
1325                  */
1326                 if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) {
1327                         tp->rcv_up = ti->ti_seq + ti->ti_urp;
1328                         so->so_urgc =  so->so_rcv.sb_cc +
1329                                 (tp->rcv_up - tp->rcv_nxt); /* -1; */
1330                         tp->rcv_up = ti->ti_seq + ti->ti_urp;
1331
1332                 }
1333         } else
1334                 /*
1335                  * If no out of band data is expected,
1336                  * pull receive urgent pointer along
1337                  * with the receive window.
1338                  */
1339                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
1340                         tp->rcv_up = tp->rcv_nxt;
1341 dodata:
1342
1343         /*
1344          * Process the segment text, merging it into the TCP sequencing queue,
1345          * and arranging for acknowledgment of receipt if necessary.
1346          * This process logically involves adjusting tp->rcv_wnd as data
1347          * is presented to the user (this happens in tcp_usrreq.c,
1348          * case PRU_RCVD).  If a FIN has already been received on this
1349          * connection then we just ignore the text.
1350          */
1351         if ((ti->ti_len || (tiflags&TH_FIN)) &&
1352             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1353                 TCP_REASS(tp, ti, m, so, tiflags);
1354                 /*
1355                  * Note the amount of data that peer has sent into
1356                  * our window, in order to estimate the sender's
1357                  * buffer size.
1358                  */
1359                 len = so->so_rcv.sb_datalen - (tp->rcv_adv - tp->rcv_nxt);
1360         } else {
1361                 m_free(m);
1362                 tiflags &= ~TH_FIN;
1363         }
1364
1365         /*
1366          * If FIN is received ACK the FIN and let the user know
1367          * that the connection is closing.
1368          */
1369         if (tiflags & TH_FIN) {
1370                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
1371                         /*
1372                          * If we receive a FIN we can't send more data,
1373                          * set it SS_FDRAIN
1374                          * Shutdown the socket if there is no rx data in the
1375                          * buffer.
1376                          * soread() is called on completion of shutdown() and
1377                          * will go to TCPS_LAST_ACK, and use tcp_output()
1378                          * to send the FIN.
1379                          */
1380 /*                      sofcantrcvmore(so); */
1381                         sofwdrain(so);
1382
1383                         tp->t_flags |= TF_ACKNOW;
1384                         tp->rcv_nxt++;
1385                 }
1386                 switch (tp->t_state) {
1387
1388                 /*
1389                  * In SYN_RECEIVED and ESTABLISHED STATES
1390                  * enter the CLOSE_WAIT state.
1391                  */
1392                 case TCPS_SYN_RECEIVED:
1393                 case TCPS_ESTABLISHED:
1394                   if(so->so_emu == EMU_CTL)        /* no shutdown on socket */
1395                     tp->t_state = TCPS_LAST_ACK;
1396                   else
1397                     tp->t_state = TCPS_CLOSE_WAIT;
1398                   break;
1399
1400                 /*
1401                  * If still in FIN_WAIT_1 STATE FIN has not been acked so
1402                  * enter the CLOSING state.
1403                  */
1404                 case TCPS_FIN_WAIT_1:
1405                         tp->t_state = TCPS_CLOSING;
1406                         break;
1407
1408                 /*
1409                  * In FIN_WAIT_2 state enter the TIME_WAIT state,
1410                  * starting the time-wait timer, turning off the other
1411                  * standard timers.
1412                  */
1413                 case TCPS_FIN_WAIT_2:
1414                         tp->t_state = TCPS_TIME_WAIT;
1415                         tcp_canceltimers(tp);
1416                         tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1417                         soisfdisconnected(so);
1418                         break;
1419
1420                 /*
1421                  * In TIME_WAIT state restart the 2 MSL time_wait timer.
1422                  */
1423                 case TCPS_TIME_WAIT:
1424                         tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
1425                         break;
1426                 }
1427         }
1428
1429         /*
1430          * If this is a small packet, then ACK now - with Nagle
1431          *      congestion avoidance sender won't send more until
1432          *      he gets an ACK.
1433          *
1434          * See above.
1435          */
1436 /*      if (ti->ti_len && (unsigned)ti->ti_len < tp->t_maxseg) {
1437  */
1438 /*      if ((ti->ti_len && (unsigned)ti->ti_len < tp->t_maxseg &&
1439  *              (so->so_iptos & IPTOS_LOWDELAY) == 0) ||
1440  *             ((so->so_iptos & IPTOS_LOWDELAY) &&
1441  *             ((struct tcpiphdr_2 *)ti)->first_char == (char)27)) {
1442  */
1443         if (ti->ti_len && (unsigned)ti->ti_len <= 5 &&
1444             ((struct tcpiphdr_2 *)ti)->first_char == (char)27) {
1445                 tp->t_flags |= TF_ACKNOW;
1446         }
1447
1448         /*
1449          * Return any desired output.
1450          */
1451         if (needoutput || (tp->t_flags & TF_ACKNOW)) {
1452                 (void) tcp_output(tp);
1453         }
1454         return;
1455
1456 dropafterack:
1457         /*
1458          * Generate an ACK dropping incoming segment if it occupies
1459          * sequence space, where the ACK reflects our state.
1460          */
1461         if (tiflags & TH_RST)
1462                 goto drop;
1463         m_freem(m);
1464         tp->t_flags |= TF_ACKNOW;
1465         (void) tcp_output(tp);
1466         return;
1467
1468 dropwithreset:
1469         /* reuses m if m!=NULL, m_free() unnecessary */
1470         if (tiflags & TH_ACK)
1471                 tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
1472         else {
1473                 if (tiflags & TH_SYN) ti->ti_len++;
1474                 tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
1475                     TH_RST|TH_ACK);
1476         }
1477
1478         return;
1479
1480 drop:
1481         /*
1482          * Drop space held by incoming segment and return.
1483          */
1484         m_free(m);
1485
1486         return;
1487 }
1488
1489  /* , ts_present, ts_val, ts_ecr) */
1490 /*      int *ts_present;
1491  *      u_int32_t *ts_val, *ts_ecr;
1492  */
1493 void
1494 tcp_dooptions(tp, cp, cnt, ti)
1495         struct tcpcb *tp;
1496         u_char *cp;
1497         int cnt;
1498         struct tcpiphdr *ti;
1499 {
1500         u_int16_t mss;
1501         int opt, optlen;
1502
1503         DEBUG_CALL("tcp_dooptions");
1504         DEBUG_ARGS((dfd," tp = %lx  cnt=%i \n", (long )tp, cnt));
1505
1506         for (; cnt > 0; cnt -= optlen, cp += optlen) {
1507                 opt = cp[0];
1508                 if (opt == TCPOPT_EOL)
1509                         break;
1510                 if (opt == TCPOPT_NOP)
1511                         optlen = 1;
1512                 else {
1513                         optlen = cp[1];
1514                         if (optlen <= 0)
1515                                 break;
1516                 }
1517                 switch (opt) {
1518
1519                 default:
1520                         continue;
1521
1522                 case TCPOPT_MAXSEG:
1523                         if (optlen != TCPOLEN_MAXSEG)
1524                                 continue;
1525                         if (!(ti->ti_flags & TH_SYN))
1526                                 continue;
1527                         memcpy((char *) &mss, (char *) cp + 2, sizeof(mss));
1528                         NTOHS(mss);
1529                         (void) tcp_mss(tp, mss);        /* sets t_maxseg */
1530                         break;
1531
1532 /*              case TCPOPT_WINDOW:
1533  *                      if (optlen != TCPOLEN_WINDOW)
1534  *                              continue;
1535  *                      if (!(ti->ti_flags & TH_SYN))
1536  *                              continue;
1537  *                      tp->t_flags |= TF_RCVD_SCALE;
1538  *                      tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
1539  *                      break;
1540  */
1541 /*              case TCPOPT_TIMESTAMP:
1542  *                      if (optlen != TCPOLEN_TIMESTAMP)
1543  *                              continue;
1544  *                      *ts_present = 1;
1545  *                      memcpy((char *) ts_val, (char *)cp + 2, sizeof(*ts_val));
1546  *                      NTOHL(*ts_val);
1547  *                      memcpy((char *) ts_ecr, (char *)cp + 6, sizeof(*ts_ecr));
1548  *                      NTOHL(*ts_ecr);
1549  *
1550  */                     /*
1551  *                       * A timestamp received in a SYN makes
1552  *                       * it ok to send timestamp requests and replies.
1553  *                       */
1554 /*                      if (ti->ti_flags & TH_SYN) {
1555  *                              tp->t_flags |= TF_RCVD_TSTMP;
1556  *                              tp->ts_recent = *ts_val;
1557  *                              tp->ts_recent_age = tcp_now;
1558  *                      }
1559  */                     break;
1560                 }
1561         }
1562 }
1563
1564
/*
 * Pull out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
 *
 * The byte is located from the segment's urgent pointer (ti_urp),
 * saved in the connection's t_iobc, and the data following it in the
 * same mbuf is moved down by one to close the gap.
 *
 * This function is compiled out (notdef).
 */

#ifdef notdef

void
tcp_pulloutofband(so, ti, m)
        struct socket *so;
        struct tcpiphdr *ti;
        register struct mbuf *m;
{
        int cnt = ti->ti_urp - 1;

        while (cnt >= 0) {
                if (m->m_len > cnt) {
                        char *cp = mtod(m, caddr_t) + cnt;
                        struct tcpcb *tp = sototcpcb(so);

                        tp->t_iobc = *cp;
                        tp->t_oobflags |= TCPOOB_HAVEDATA;
                        /*
                         * Slide the rest of this mbuf over the removed byte.
                         * Source and destination overlap, so memmove() is
                         * required rather than memcpy().
                         */
                        memmove(cp, cp + 1, (unsigned)(m->m_len - cnt - 1));
                        m->m_len--;
                        return;
                }
                cnt -= m->m_len;
                m = m->m_next; /* XXX WRONG! Fix it! */
                if (m == 0)
                        break;
        }
        panic("tcp_pulloutofband");
}

#endif /* notdef */
1602
1603 /*
1604  * Collect new round-trip time estimate
1605  * and update averages and current timeout.
1606  */
1607
1608 void
1609 tcp_xmit_timer(tp, rtt)
1610         register struct tcpcb *tp;
1611         int rtt;
1612 {
1613         register short delta;
1614
1615         DEBUG_CALL("tcp_xmit_timer");
1616         DEBUG_ARG("tp = %lx", (long)tp);
1617         DEBUG_ARG("rtt = %d", rtt);
1618
1619         tcpstat.tcps_rttupdated++;
1620         if (tp->t_srtt != 0) {
1621                 /*
1622                  * srtt is stored as fixed point with 3 bits after the
1623                  * binary point (i.e., scaled by 8).  The following magic
1624                  * is equivalent to the smoothing algorithm in rfc793 with
1625                  * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
1626                  * point).  Adjust rtt to origin 0.
1627                  */
1628                 delta = rtt - 1 - (tp->t_srtt >> TCP_RTT_SHIFT);
1629                 if ((tp->t_srtt += delta) <= 0)
1630                         tp->t_srtt = 1;
1631                 /*
1632                  * We accumulate a smoothed rtt variance (actually, a
1633                  * smoothed mean difference), then set the retransmit
1634                  * timer to smoothed rtt + 4 times the smoothed variance.
1635                  * rttvar is stored as fixed point with 2 bits after the
1636                  * binary point (scaled by 4).  The following is
1637                  * equivalent to rfc793 smoothing with an alpha of .75
1638                  * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
1639                  * rfc793's wired-in beta.
1640                  */
1641                 if (delta < 0)
1642                         delta = -delta;
1643                 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
1644                 if ((tp->t_rttvar += delta) <= 0)
1645                         tp->t_rttvar = 1;
1646         } else {
1647                 /*
1648                  * No rtt measurement yet - use the unsmoothed rtt.
1649                  * Set the variance to half the rtt (so our first
1650                  * retransmit happens at 3*rtt).
1651                  */
1652                 tp->t_srtt = rtt << TCP_RTT_SHIFT;
1653                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
1654         }
1655         tp->t_rtt = 0;
1656         tp->t_rxtshift = 0;
1657
1658         /*
1659          * the retransmit should happen at rtt + 4 * rttvar.
1660          * Because of the way we do the smoothing, srtt and rttvar
1661          * will each average +1/2 tick of bias.  When we compute
1662          * the retransmit timer, we want 1/2 tick of rounding and
1663          * 1 extra tick because of +-1/2 tick uncertainty in the
1664          * firing of the timer.  The bias will give us exactly the
1665          * 1.5 tick we need.  But, because the bias is
1666          * statistical, we have to test that we don't drop below
1667          * the minimum feasible timer (which is 2 ticks).
1668          */
1669         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
1670             (short)tp->t_rttmin, TCPTV_REXMTMAX); /* XXX */
1671
1672         /*
1673          * We received an ack for a packet that wasn't retransmitted;
1674          * it is probably safe to discard any error indications we've
1675          * received recently.  This isn't quite right, but close enough
1676          * for now (a route might have failed after we sent a segment,
1677          * and the return path might not be symmetrical).
1678          */
1679         tp->t_softerror = 0;
1680 }
1681
1682 /*
1683  * Determine a reasonable value for maxseg size.
1684  * If the route is known, check route for mtu.
1685  * If none, use an mss that can be handled on the outgoing
1686  * interface without forcing IP to fragment; if bigger than
1687  * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
1688  * to utilize large mbufs.  If no route is found, route has no mtu,
1689  * or the destination isn't local, use a default, hopefully conservative
1690  * size (usually 512 or the default IP max size, but no more than the mtu
1691  * of the interface), as we can't discover anything about intervening
1692  * gateways or networks.  We also initialize the congestion/slow start
1693  * window to be a single segment if the destination isn't local.
1694  * While looking at the routing entry, we also initialize other path-dependent
1695  * parameters from pre-set or cached values in the routing entry.
1696  */
1697
1698 int
1699 tcp_mss(tp, offer)
1700         register struct tcpcb *tp;
1701         u_int offer;
1702 {
1703         struct socket *so = tp->t_socket;
1704         int mss;
1705
1706         DEBUG_CALL("tcp_mss");
1707         DEBUG_ARG("tp = %lx", (long)tp);
1708         DEBUG_ARG("offer = %d", offer);
1709
1710         mss = min(if_mtu, if_mru) - sizeof(struct tcpiphdr);
1711         if (offer)
1712                 mss = min(mss, offer);
1713         mss = max(mss, 32);
1714         if (mss < tp->t_maxseg || offer != 0)
1715            tp->t_maxseg = mss;
1716
1717         tp->snd_cwnd = mss;
1718
1719         sbreserve(&so->so_snd, tcp_sndspace+((tcp_sndspace%mss)?(mss-(tcp_sndspace%mss)):0));
1720         sbreserve(&so->so_rcv, tcp_rcvspace+((tcp_rcvspace%mss)?(mss-(tcp_rcvspace%mss)):0));
1721
1722         DEBUG_MISC((dfd, " returning mss = %d\n", mss));
1723
1724         return mss;
1725 }