struct tcp_sock {//在 inet_connection_sock 基础上增加了 滑动窗口 拥塞控制算法等tcp 专有 属性 /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; u16 tcp_header_len; /* Bytes of tcp header to send */ u16 gso_segs; /* Max number of segs per GSO packet */ /* * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ __be32 pred_flags;/*首部预测标志 在接收到 syn 跟新窗口 等时设置此标志 , 此标志和时间戳 序号等 用于判断执行 快速还是慢速路径*/ /* * RFC793 variables by their proper names. This means you can * read the code and the spec side by side (and laugh ...) * See RFC793 and RFC1122. The RFC writes these in capitals. */ u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived * sum(delta(rcv_nxt)), or how many bytes * were acked. */ u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn * total number of segments in. */ u32 rcv_nxt; /* What we want to receive next 等待接收的下一个序列号 */ u32 copied_seq; /* Head of yet unread data */ /* rcv_nxt on last window update sent最早接收但没有确认的序号, 也就是接收窗口的左端, 在发送ack的时候, rcv_nxt更新 因此rcv_wup 更新比rcv_nxt 滞后一些 */ u32 rcv_wup; u32 snd_nxt; /* Next sequence we send 等待发送的下一个序列号 */ u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut * The total number of segments sent. */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. 
*/ struct u64_stats_sync syncp; /* protects 64bit vars (cf tcp_get_info()) */ u32 snd_una; /* First byte we want an ack for 最早一个未被确认的序号 */ u32 snd_sml; /* Last byte of the most recently transmitted small packet 最近发送一个小于mss的最后 一个字节序列号 在成功发送, 如果报文小于mss,跟新这个字段 主要用来判断是否启用 nagle 算法*/ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) 最近一次收到ack的时间 用于 tcp 保活*/ u32 lsndtime; /* timestamp of last sent data packet (for restart window) 最近一次发送 数据包时间*/ u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ u32 tsoffset; /* timestamp offset */ struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ unsigned long tsq_flags; /* Data for direct copy to user cp 数据到用户进程的控制块 有用户缓存以及其长度 prequeue 队列 其内存*/ struct { struct sk_buff_head prequeue // tcp 段 缓冲到此队列 知道进程主动读取才真正的处理; struct task_struct *task; struct msghdr *msg; int memory;// prequeue 当前消耗的内存 int len;// 用户缓存中 当前可以使用的缓存大小 } ucopy; u32 snd_wl1; /* Sequence for window update记录跟新发送窗口的那个ack 段号 用来判断是否 需要跟新窗口 如果后续收到ack大于snd_wll 则表示需要更新 窗口*/ u32 snd_wnd; /* The window we expect to receive 接收方 提供的窗口大小 也就是发送方窗口大小 */ u32 max_window; /* Maximal window ever seen from peer 接收方通告的最大窗口 */ u32 mss_cache; /* Cached effective mss, not including SACKS 发送方当前有效的mss*/ u32 window_clamp; /* Maximal window to advertise 滑动窗口最大值 */ u32 rcv_ssthresh; /* Current window clamp 当前接收窗口的阈值 */ /* Information of the most recently (s)acked skb */ struct tcp_rack { struct skb_mstamp mstamp; /* (Re)sent time of the skb */ u8 advanced; /* mstamp advanced since last lost marking */ u8 reord; /* reordering detected */ } rack; u16 advmss; /* Advertised MSS本端能接收的 MSS 上限,在建立时用来通告对方 */ u8 unused; u8 nonagle : 4,/* Disable Nagle algorithm? 
是否 开启 ngnagle 算法 */ thin_lto : 1,/* Use linear timeouts for thin streams */ thin_dupack : 1,/* Fast retransmit on first dupack */ repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ save_syn:1, /* Save headers of SYN packet */ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ /* RTT measurement */ u32 srtt_us; /* smoothed round trip time << 3 in usecs 平滑rtt*/ u32 mdev_us; /* medium deviation rtt平均偏差 */ u32 mdev_max_us; /* maximal mdev for the last rtt period rtt平均偏差最大值 */ u32 rttvar_us; /* smoothed mdev_max */ u32 rtt_seq; /* sequence number to update rttvar 记录SND.UNA 计算rto 时比较SND.NUA是否已经给更新 如果SND.UNA 跟新,则需要同时跟新rttval*/ struct rtt_meas { u32 rtt, ts; /* RTT in usec and sampling time in jiffies. */ } rtt_min[3]; u32 packets_out; /* Packets which are "in flight"发送出去 没有被ack的数 (SND.NEXT -SND.UNA )*/ u32 retrans_out; /* Retransmitted packets out 重传还未得到确认的tcp数 重传并且还未得到确认的 TCP 段的数目*/ u32 max_packets_out; /* max packets_out in last window */ u32 max_packets_seq; /* right edge of max_packets_out flight */ u16 urg_data; /* Saved octet of OOB data and control flags 存放紧急数据以及控制标示 */ u8 ecn_flags; /* ECN status bits. */ u8 keepalive_probes; /* num of allowed keep alive probes 保活探测次数上限 */ u32 reordering; /* Packet reordering metric. tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;三个重复ACK报文时,触发快速重传 */ u32 snd_up; /* Urgent pointer 紧急数据指针 带外数据的序号 */ /* * Options received (usually on last packet, some only on SYN packets). 
*/ struct tcp_options_received rx_opt; /* * Slow start and congestion control (see also Nagle, and Karn & Partridge) */ u32 snd_ssthresh; /* Slow start size threshold 拥塞控制 满启动阈值 */ u32 snd_cwnd; /* Sending congestion window 当前拥塞窗口大小 ---发送的拥塞窗口 */ u32 snd_cwnd_cnt; /* Linear increase counter 自从上次调整拥塞窗口后 到目前位置接收到的 总ack段数 如果该字段为0 表示调整拥塞窗口但是没有收到ack,调整拥塞窗口之后 收到ack段就回让 snd_cwnd_cnt 加1 */ u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this snd_cwnd 的最大值*/ u32 snd_cwnd_used;//记录已经从队列发送而没有被ack的段数 u32 snd_cwnd_stamp;//记录最近一次检验cwnd 的时间; 拥塞期间 每次会检验cwnd而调节拥塞窗口 , //在非拥塞期间,为了防止应用层序造成拥塞窗口失效 因此在发送后 有必要检测cwnd u32 prior_cwnd; /* Congestion window at start of Recovery.在进入 Recovery 状态时的拥塞窗口 */ u32 prr_delivered; /* Number of newly delivered packets to在恢复阶段给接收者新发送包的数量 * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery.在恢复阶段一共发送的包的数量 */ u32 rcv_wnd; /* Current receiver window 当前接收窗口的大小 */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer 已加入发送队列中的最后一个字节序号*/ u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lost_out; /* Lost packets丢失的数据报 */ u32 sacked_out; /* SACK'd packets启用 SACK 时,通过 SACK 的 TCP 选项标识已接收到的段的数量。 不启用 SACK 时,标识接收到的重复确认的次数,该值在接收到确认新数据段时被清除。 */ u32 fackets_out; /* FACK'd packets FACK'd packets 记录 SND.UNA 与 (SACK 选项中目前接收方收到的段中最高序号段) 之间的段数。FACK 用 SACK 选项来计算丢失在网络中上的段数 lost_out=fackets_out-sacked_out left_out=fackets_out fackets_out = sack_out + lost_out
*/ /* from STCP, retrans queue hinting */ struct sk_buff* lost_skb_hint; /*在重传队列中, 缓存下次要标志的段*/ struct sk_buff *retransmit_skb_hint;/* 表示将要重传的起始包*/ /* OOO segments go in this list. Note that socket lock must be held, * as we do not use sk_buff_head lock. */ struct sk_buff_head out_of_order_queue; /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ struct tcp_sack_block recv_sack_cache[4]; struct sk_buff *highest_sack; /* skb just after the highest最大sack序列号 * skb with SACKed bit set * (validity guaranteed only if * sacked_out > 0) */ int lost_cnt_hint;/* 已经标志了多少个段 */ u32 retransmit_high; /* L-bits may be on up to this seqno 表示将要重传的起始包 */ u32 prior_ssthresh; /* ssthresh saved at recovery start表示前一个snd_ssthresh得大小 */ u32 high_seq; /* snd_nxt at onset of congestion拥塞开始时,snd_nxt的大----开始拥塞的时候下一个要发送的序号字节*/ u32 retrans_stamp; /* Timestamp of the last retransmit, * also used in SYN-SENT to remember stamp of * the first SYN. */ u32 undo_marker; /* snd_una upon a new recovery episode. 在使用 F-RTO 算法进行发送超时处理,或进入 Recovery 进行重传, 或进入 Loss 开始慢启动时,记录当时 SND.UNA, 标记重传起始点。它是检测是否可以进行拥塞控制撤销的条件之一,一般在完成 拥塞撤销操作或进入拥塞控制 Loss 状态后会清零。*/ int undo_retrans; /* number of undoable retransmissions. 在恢复拥塞控制之前可进行撤销的重传段数。 在进入 FTRO 算法或 拥塞状态 Loss 时,清零,在重传时计数,是检测是否可以进行拥塞撤销的条件之一。*/ u32 total_retrans; /* Total retransmits for entire connection */ u32 urg_seq; /* Seq of received urgent pointer 紧急数据的序号 所在段的序号和紧急指针相加获得*/ unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; /* Receiver side RTT estimation */ struct { u32 rtt; u32 seq; u32 time; } rcv_rtt_est; /* Receiver queue space */ struct { int space; u32 seq; u32 time; } rcvq_space; /* TCP-specific MTU probe information. 
*/ struct { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ const struct tcp_sock_af_ops *af_specific; /* TCP MD5 Signature Option information */ struct tcp_md5sig_info __rcu *md5sig_info; #endif /* TCP fastopen related information */ struct tcp_fastopen_request *fastopen_req; /* fastopen_rsk points to request_sock that resulted in this big * socket. Used to retransmit SYNACKs etc. */ struct request_sock *fastopen_rsk; u32 *saved_syn; };
发送端在检测数据包乱序是否超过乱序阈值(默认为3,可通过 /proc/sys/net/ipv4/tcp_reordering 配置)时会用到 fackets_out 和 sacked_out。这两个字段的含义如下图所示:fackets_out 表示从 snd_una 到已被 SACK 的最高序号段之间的段数,sacked_out 表示已被接收方 SACK 确认的段数。
发送端在检测数据包乱序是否超过乱序阈值(默认为3,可通过 /proc/sys/net/ipv4/tcp_reordering 配置)时会用到 fackets_out 和 sacked_out。这两个字段的含义如下图所示:fackets_out 表示从 snd_una 到已被 SACK 的最高序号段之间的段数,sacked_out 表示已被接收方 SACK 确认的段数。
/* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. * We also store the host-order sequence numbers in here too. * This is 44 bytes if IPV6 is enabled. * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately. */ struct tcp_skb_cb { __u32 seq; /* Starting sequence number *起始序号 */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { /* Note : tcp_tw_isn is used in input path only * (isn chosen by tcp_timewait_state_process()) * * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ __u32 tcp_tw_isn; struct { u16 tcp_gso_segs; u16 tcp_gso_size; }; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK/FACK. */ #define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block SKB 被确认了 ----被 SACK 块 ACK'd 也就是SACK块已经给出了skb数据缓冲区中的段回答信息 */ #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted 数据段被重传 */ #define TCPCB_LOST 0x04 /* SKB is lost 数据段已经丢失 */ #define TCPCB_TAGBITS 0x07 /* All tag bits TCPCB_TAGBITS = TCPCB_SACKED_ACKED | TCPCB_SACKED_RESTRANS | TCPCB_LOST */ #define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp) */ #define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame 指明数据段以前是否重传过 */ #define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \ TCPCB_REPAIRED) __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ /* 1 byte hole */ __u32 ack_seq; /* Sequence number ACK'd ACK 的序号 */ union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif
2
/** inet_connection_sock - INET connection oriented sock * * @icsk_accept_queue: FIFO of established children * @icsk_bind_hash: Bind node * @icsk_timeout: Timeout * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout * @icsk_pmtu_cookie Last pmtu seen by socket * @icsk_ca_ops Pluggable congestion control hook * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event * @icsk_backoff: Backoff * @icsk_syn_retries: Number of allowed SYN (or equivalent) retries * @icsk_probes_out: unanswered 0 window probes * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) * @icsk_ack: Delayed ACK control data * @icsk_mtup; MTU probing control data */ struct inet_connection_sock { /* inet_sock has to be the first member! */ struct inet_sock icsk_inet; //inet_connection_sock common struct struct request_sock_queue icsk_accept_queue; //tcp newsk 存放新的链接sock 等待accept 读取 struct inet_bind_bucket *icsk_bind_hash;//指向与之bind的信息 unsigned long icsk_timeout;//数据包超时时间-- 重传 tv_off --通常为 jiffies+ icsk_rto 后 进行重传 struct timer_list icsk_retransmit_timer; // 通过icsk_pengding 来区分重传定时器和持续定时器 struct timer_list icsk_delack_timer;// 延时发送ack 定时器 __u32 icsk_rto;// 重传超时时间 初始值为 TCP_TIMEOUT_INIT 根据网络情况动态计算 __u32 icsk_pmtu_cookie; //最后一次更新的路径MTU const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:6,// 拥塞状态 icsk_ca_setsockopt:1, icsk_ca_dst_locked:1; __u8 icsk_retransmits;// 超时重传的次数 __u8 icsk_pending; //标志定时器事件 ICSK_TIME_EARLY_RETRANS 等可选值 表示 重传定时 持续定时器 保活定时器等 __u8 icsk_backoff;// 计算持续定时器 下一个设定值的指数 退避算法指数 __u8 icsk_syn_retries; // 建立tcp 允许 重传 syn syn+ack的次数 __u8 icsk_probes_out;// 持续定时等 周期性发出未被确认的tcp seg 数目 __u16 icsk_ext_hdr_len; struct { __u8 pending; /* ACK is pending 标示 需要确认发送的 紧急程度 和状态 */ __u8 quick; /* Scheduled 
number of quick acks在快速发送确认模式中 */ __u8 pingpong; /* The session is interactive 启用禁用 快速确认模式 1 ---标示延时发送ack 0 标示快速发送ack */ __u8 blocked; /* Delayed ACK was blocked by socket lock 软中断 用户进程 不能同时own sk 如果sk 被 user 拥有, 延时ack 定时器被触发,此时不应该发送ack, blocked 为1;标示 如果有机会就需要立即发送,所以当接收的数据被cp到user 后 就可以立即发送ack */ __u32 ato; /* Predicted tick of soft clock 延时确认的估值 */ unsigned long timeout; /* Currently scheduled timeout当前延时确认时间 超时后立即发送ack */ __u32 lrcvtime; /* timestamp of last received data packet 最近一次接收到数据包时间*/ __u16 last_seg_size; /* Size of last incoming segment最后一个接收到段的长度 用来计算rcv_mss */ __u16 rcv_mss; /* MSS used for delayed ACK decisions 由最近接收的段计算出MSS */ } icsk_ack; // 延时确认控制块 struct { int enabled;// 是否开启路径MTU /* Range of MTUs to search */ int search_high; int search_low; /* Information on the current probe当前mtu 探测的长度 用于判断mtu是否完成 初始值为0. */ int probe_size; u32 probe_timestamp; } icsk_mtup; u32 icsk_user_timeout; u64 icsk_ca_priv[64 / sizeof(u64)]; #define ICSK_CA_PRIV_SIZE (8 * sizeof(u64)) };