C/C++教程

聊一聊tcp 拥塞控制 相关数据结构

本文主要介绍 TCP 拥塞控制相关的数据结构,对大家解决编程问题具有一定的参考价值,有需要的程序员们跟着小编一起来学习吧!
struct tcp_sock {//在 inet_connection_sock  基础上增加了 滑动窗口 拥塞控制算法等tcp 专有 属性
    /* inet_connection_sock has to be the first member of tcp_sock */
    struct inet_connection_sock    inet_conn;
    u16    tcp_header_len;    /* Bytes of tcp header to send        */
    u16    gso_segs;    /* Max number of segs per GSO packet    */

/*
 *    Header prediction flags
 *    0x5?10 << 16 + snd_wnd in net byte order
 */
    __be32    pred_flags;/*首部预测标志 在接收到 syn 跟新窗口 等时设置此标志 ,
    此标志和时间戳 序号等 用于判断执行 快速还是慢速路径*/
        
/*
 *    RFC793 variables by their proper names. This means you can
 *    read the code and the spec side by side (and laugh ...)
 *    See RFC793 and RFC1122. The RFC writes these in capitals.
 */
    u64    bytes_received;    /* RFC4898 tcpEStatsAppHCThruOctetsReceived
                 * sum(delta(rcv_nxt)), or how many bytes
                 * were acked.
                 */
    u32    segs_in;    /* RFC4898 tcpEStatsPerfSegsIn
                 * total number of segments in.
                 */
     u32    rcv_nxt;    /* What we want to receive next  等待接收的下一个序列号    */
    u32    copied_seq;    /* Head of yet unread data        */

/* rcv_nxt on last window update sent最早接收但没有确认的序号, 也就是接收窗口的左端,
        在发送ack的时候, rcv_nxt更新 因此rcv_wup 更新比rcv_nxt 滞后一些  */
    u32    rcv_wup;    

    u32    snd_nxt;    /* Next sequence we send 等待发送的下一个序列号        */
    u32    segs_out;    /* RFC4898 tcpEStatsPerfSegsOut
                 * The total number of segments sent.
                 */
    u64    bytes_acked;    /* RFC4898 tcpEStatsAppHCThruOctetsAcked
                 * sum(delta(snd_una)), or how many bytes
                 * were acked.
                 */
    struct u64_stats_sync syncp; /* protects 64bit vars (cf tcp_get_info()) */

     u32    snd_una;    /* First byte we want an ack for  最早一个未被确认的序号    */
     u32    snd_sml;    /* Last byte of the most recently transmitted small packet  最近发送一个小于mss的最后 一个字节序列号
    在成功发送, 如果报文小于mss,跟新这个字段 主要用来判断是否启用 nagle 算法*/
    u32    rcv_tstamp;    /* timestamp of last received ACK (for keepalives)  最近一次收到ack的时间 用于 tcp 保活*/
    u32    lsndtime;    /* timestamp of last sent data packet (for restart window) 最近一次发送 数据包时间*/
    u32    last_oow_ack_time;  /* timestamp of last out-of-window ACK */

    u32    tsoffset;    /* timestamp offset */

    struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
    unsigned long    tsq_flags;

    /* Data for direct copy to user cp 数据到用户进程的控制块 有用户缓存以及其长度 prequeue 队列 其内存*/
    struct {
        struct sk_buff_head    prequeue // tcp 段 缓冲到此队列 知道进程主动读取才真正的处理;
        struct task_struct    *task;
        struct msghdr        *msg;
        int            memory;// prequeue 当前消耗的内存
        int            len;// 用户缓存中 当前可以使用的缓存大小 
    } ucopy;

    u32    snd_wl1;    /* Sequence for window update记录跟新发送窗口的那个ack 段号 用来判断是否 需要跟新窗口
    如果后续收到ack大于snd_wll 则表示需要更新 窗口*/
    u32    snd_wnd;    /* The window we expect to receive 接收方 提供的窗口大小 也就是发送方窗口大小    */
    u32    max_window;    /* Maximal window ever seen from peer 接收方通告的最大窗口    */
    u32    mss_cache;    /* Cached effective mss, not including SACKS  发送方当前有效的mss*/

    u32    window_clamp;    /* Maximal window to advertise 滑动窗口最大值        */
    u32    rcv_ssthresh;    /* Current window clamp  当前接收窗口的阈值            */

    /* Information of the most recently (s)acked skb */
    struct tcp_rack {
        struct skb_mstamp mstamp; /* (Re)sent time of the skb */
        u8 advanced; /* mstamp advanced since last lost marking */
        u8 reord;    /* reordering detected */
    } rack;
    u16    advmss;        /* Advertised MSS本端能接收的 MSS 上限,在建立时用来通告对方            */
    u8    unused;
    u8    nonagle     : 4,/* Disable Nagle algorithm?  是否  开启 ngnagle 算法           */
        thin_lto    : 1,/* Use linear timeouts for thin streams */
        thin_dupack : 1,/* Fast retransmit on first dupack      */
        repair      : 1,
        frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
    u8    repair_queue;
    u8    do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
        syn_data:1,    /* SYN includes data */
        syn_fastopen:1,    /* SYN includes Fast Open option */
        syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
        syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
        save_syn:1,    /* Save headers of SYN packet */
        is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
    u32    tlp_high_seq;    /* snd_nxt at the time of TLP retransmit. */

/* RTT measurement */
    u32    srtt_us;    /* smoothed round trip time << 3 in usecs  平滑rtt*/
    u32    mdev_us;    /* medium deviation    rtt平均偏差        */
    u32    mdev_max_us;    /* maximal mdev for the last rtt period   rtt平均偏差最大值     */
    u32    rttvar_us;    /* smoothed mdev_max            */
    u32    rtt_seq;    /* sequence number to update rttvar  记录SND.UNA 计算rto 时比较SND.NUA是否已经给更新
    如果SND.UNA 跟新,则需要同时跟新rttval*/
    struct rtt_meas {
        u32 rtt, ts;    /* RTT in usec and sampling time in jiffies. */
    } rtt_min[3];

    u32    packets_out;    /* Packets which are "in flight"发送出去 没有被ack的数 (SND.NEXT -SND.UNA )*/
    u32    retrans_out;    /* Retransmitted packets out 重传还未得到确认的tcp数        重传并且还未得到确认的 TCP 段的数目*/
    u32    max_packets_out;  /* max packets_out in last window */
    u32    max_packets_seq;  /* right edge of max_packets_out flight */

    u16    urg_data;    /* Saved octet of OOB data and control flags 存放紧急数据以及控制标示 */
    u8    ecn_flags;    /* ECN status bits.            */
    u8    keepalive_probes; /* num of allowed keep alive probes 保活探测次数上限    */
    u32    reordering;    /* Packet reordering metric. tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;三个重复ACK报文时,触发快速重传 */
    u32    snd_up;        /* Urgent pointer 紧急数据指针 带外数据的序号        */

/*
 *      Options received (usually on last packet, some only on SYN packets).
 */
    struct tcp_options_received rx_opt;

/*
 *    Slow start and congestion control (see also Nagle, and Karn & Partridge)
 */
     u32    snd_ssthresh;    /* Slow start size threshold 拥塞控制 满启动阈值        */
     u32    snd_cwnd;    /* Sending congestion window    当前拥塞窗口大小  ---发送的拥塞窗口    */
    u32    snd_cwnd_cnt;    /* Linear increase counter    自从上次调整拥塞窗口后 到目前位置接收到的
    总ack段数 如果该字段为0  表示调整拥塞窗口但是没有收到ack,调整拥塞窗口之后 收到ack段就回让
    snd_cwnd_cnt 加1 */
    u32    snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this  snd_cwnd  的最大值*/
    u32    snd_cwnd_used;//记录已经从队列发送而没有被ack的段数
    u32    snd_cwnd_stamp;//记录最近一次检验cwnd 的时间;     拥塞期间 每次会检验cwnd而调节拥塞窗口 ,
    //在非拥塞期间,为了防止应用层序造成拥塞窗口失效  因此在发送后 有必要检测cwnd
    u32    prior_cwnd;    /* Congestion window at start of Recovery.在进入 Recovery 状态时的拥塞窗口 */
    u32    prr_delivered;    /* Number of newly delivered packets to在恢复阶段给接收者新发送包的数量
                 * receiver in Recovery. */
    u32    prr_out;    /* Total number of pkts sent during Recovery.在恢复阶段一共发送的包的数量 */

     u32    rcv_wnd;    /* Current receiver window 当前接收窗口的大小        */
    u32    write_seq;    /* Tail(+1) of data held in tcp send buffer   已加入发送队列中的最后一个字节序号*/
    u32    notsent_lowat;    /* TCP_NOTSENT_LOWAT */
    u32    pushed_seq;    /* Last pushed seq, required to talk to windows */
    u32    lost_out;    /* Lost packets丢失的数据报            */
    u32    sacked_out;    /* SACK'd packets启用 SACK 时,通过 SACK 的 TCP 选项标识已接收到的段的数量。
                 不启用 SACK 时,标识接收到的重复确认的次数,该值在接收到确认新数据段时被清除。            */
    u32    fackets_out;    /* FACK'd packets    FACK'd packets 记录 SND.UNA 与 (SACK 选项中目前接收方收到的段中最高序号段) 之间的段数。FACK
            用 SACK 选项来计算丢失在网络中上的段数  lost_out=fackets_out-sacked_out  left_out=fackets_out  fackets_out = sack_out + lost_out
*/

    /* from STCP, retrans queue hinting */
    struct sk_buff* lost_skb_hint; /*在重传队列中, 缓存下次要标志的段*/
    struct sk_buff *retransmit_skb_hint;/* 表示将要重传的起始包*/

    /* OOO segments go in this list. Note that socket lock must be held,
     * as we do not use sk_buff_head lock.
     */
    struct sk_buff_head    out_of_order_queue;

    /* SACKs data, these 2 need to be together (see tcp_options_write) */
    struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
    struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/

    struct tcp_sack_block recv_sack_cache[4];

    struct sk_buff *highest_sack;   /* skb just after the highest最大sack序列号
                     * skb with SACKed bit set
                     * (validity guaranteed only if
                     * sacked_out > 0)
                     */

    int     lost_cnt_hint;/* 已经标志了多少个段 */
    u32     retransmit_high;    /* L-bits may be on up to this seqno  表示将要重传的起始包 */

    u32    prior_ssthresh; /* ssthresh saved at recovery start表示前一个snd_ssthresh得大小    */
    u32    high_seq;    /* snd_nxt at onset of congestion拥塞开始时,snd_nxt的大----开始拥塞的时候下一个要发送的序号字节*/

    u32    retrans_stamp;    /* Timestamp of the last retransmit,
                 * also used in SYN-SENT to remember stamp of
                 * the first SYN. */
    u32    undo_marker;    /* snd_una upon a new recovery episode. 在使用 F-RTO 算法进行发送超时处理,或进入 Recovery 进行重传,
                    或进入 Loss 开始慢启动时,记录当时 SND.UNA, 标记重传起始点。它是检测是否可以进行拥塞控制撤销的条件之一,一般在完成
                    拥塞撤销操作或进入拥塞控制 Loss 状态后会清零。*/
    int    undo_retrans;    /* number of undoable retransmissions. 在恢复拥塞控制之前可进行撤销的重传段数。
                    在进入 FTRO 算法或 拥塞状态 Loss 时,清零,在重传时计数,是检测是否可以进行拥塞撤销的条件之一。*/
    u32    total_retrans;    /* Total retransmits for entire connection */

    u32    urg_seq;    /* Seq of received urgent pointer  紧急数据的序号 所在段的序号和紧急指针相加获得*/
    unsigned int        keepalive_time;      /* time before keep alive takes place */
    unsigned int        keepalive_intvl;  /* time interval between keep alive probes */

    int            linger2;

/* Receiver side RTT estimation */
    struct {
        u32    rtt;
        u32    seq;
        u32    time;
    } rcv_rtt_est;

/* Receiver queue space */
    struct {
        int    space;
        u32    seq;
        u32    time;
    } rcvq_space;

/* TCP-specific MTU probe information. */
    struct {
        u32          probe_seq_start;
        u32          probe_seq_end;
    } mtu_probe;
    u32    mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
               * while socket was owned by user.
               */

#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
    const struct tcp_sock_af_ops    *af_specific;

/* TCP MD5 Signature Option information */
    struct tcp_md5sig_info    __rcu *md5sig_info;
#endif

/* TCP fastopen related information */
    struct tcp_fastopen_request *fastopen_req;
    /* fastopen_rsk points to request_sock that resulted in this big
     * socket. Used to retransmit SYNACKs etc.
     */
    struct request_sock *fastopen_rsk;
    u32    *saved_syn;
};

发送端在检测数据包乱序是否超过乱序阈值(默认为 3,可通过 /proc/sys/net/ipv4/tcp_reordering 配置)时会用到 fackets_out 和 sacked_out。这两个字段的含义如下图所示:fackets_out 表示从 snd_una 到已收到的最大 SACK 序号之间的段数,sacked_out 表示已被接收方 SACK 确认的包个数。

 

/* This is what the send packet queuing engine uses to pass
 * TCP per-packet control information to the transmission code.
 * We also store the host-order sequence numbers in here too.
 * This is 44 bytes if IPV6 is enabled.
 * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
 */
struct tcp_skb_cb {
    __u32        seq;        /* Starting sequence number    */
    __u32        end_seq;    /* SEQ + FIN + SYN + datalen    */
    union {
        /* Note : tcp_tw_isn is used in input path only
         *      (isn chosen by tcp_timewait_state_process())
         *
         *       tcp_gso_segs/size are used in write queue only,
         *      cf tcp_skb_pcount()/tcp_skb_mss()
         */
        __u32        tcp_tw_isn;
        struct {
            u16    tcp_gso_segs;
            u16    tcp_gso_size;
        };
    };
    __u8        tcp_flags;    /* TCP header flags. (tcp[13])    */

    __u8        sacked;        /* State flags for SACK/FACK.    */
#define TCPCB_SACKED_ACKED    0x01    /* SKB ACK'd by a SACK block: a SACK block has acknowledged the segment data held in this skb    */
#define TCPCB_SACKED_RETRANS    0x02    /* SKB retransmitted        */
#define TCPCB_LOST        0x04    /* SKB is lost            */
#define TCPCB_TAGBITS        0x07    /* All tag bits: TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS | TCPCB_LOST    */
#define TCPCB_REPAIRED        0x10    /* SKB repaired (no skb_mstamp)    */
#define TCPCB_EVER_RETRANS    0x80    /* Ever retransmitted frame: records whether the segment was retransmitted before    */
#define TCPCB_RETRANS        (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
                TCPCB_REPAIRED)

    __u8        ip_dsfield;    /* IPv4 tos or IPv6 dsfield    */
    /* 1 byte hole */
    __u32        ack_seq;    /* Sequence number ACK'd    */
    union {
        struct inet_skb_parm    h4;
#if IS_ENABLED(CONFIG_IPV6)
        struct inet6_skb_parm    h6;
#endif
/* NOTE(review): the definition is cut off at this point in this excerpt;
 * the end of this union and of struct tcp_skb_cb is not shown. Restore it
 * from the original kernel header before compiling.
 */

 

 2

 

/** inet_connection_sock - INET connection oriented sock
 *
 * @icsk_accept_queue:       FIFO of established children 
 * @icsk_bind_hash:       Bind node
 * @icsk_timeout:       Timeout
 * @icsk_retransmit_timer: Resend (no ack)
 * @icsk_rto:           Retransmit timeout
 * @icsk_pmtu_cookie:       Last pmtu seen by socket
 * @icsk_ca_ops:           Pluggable congestion control hook
 * @icsk_af_ops:           Operations which are AF_INET{4,6} specific
 * @icsk_ca_state:       Congestion control state
 * @icsk_retransmits:       Number of unrecovered [RTO] timeouts
 * @icsk_pending:       Scheduled timer event
 * @icsk_backoff:       Backoff
 * @icsk_syn_retries:      Number of allowed SYN (or equivalent) retries
 * @icsk_probes_out:       unanswered 0 window probes
 * @icsk_ext_hdr_len:       Network protocol overhead (IP/IPv6 options)
 * @icsk_ack:           Delayed ACK control data
 * @icsk_mtup:           MTU probing control data
 */
struct inet_connection_sock {
    /* inet_sock has to be the first member! */
    struct inet_sock      icsk_inet; // embedded inet_sock (common part)
    struct request_sock_queue icsk_accept_queue;  // socks of newly established connections wait here until accept() picks them up
    struct inet_bind_bucket      *icsk_bind_hash;// points to the bind information for this sock
    unsigned long          icsk_timeout;// packet timeout (retransmit); usually jiffies + icsk_rto, after which retransmission takes place
     struct timer_list      icsk_retransmit_timer; // icsk_pending tells the retransmit timer apart from the persist timer
     struct timer_list      icsk_delack_timer;// delayed-ACK timer
    __u32              icsk_rto;// retransmission timeout; starts at TCP_TIMEOUT_INIT and is then computed dynamically from network conditions
    __u32              icsk_pmtu_cookie; // path MTU as of the last update
    const struct tcp_congestion_ops *icsk_ca_ops;
    const struct inet_connection_sock_af_ops *icsk_af_ops;
    unsigned int          (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
    __u8              icsk_ca_state:6,// congestion state
                  icsk_ca_setsockopt:1,
                  icsk_ca_dst_locked:1;
    __u8              icsk_retransmits;// number of timeout retransmissions
    __u8              icsk_pending; // pending timer event; values such as ICSK_TIME_EARLY_RETRANS select the retransmit, persist, keepalive timer, etc.
    __u8              icsk_backoff;// exponential backoff exponent used to compute the next persist-timer value
    __u8              icsk_syn_retries; // number of SYN / SYN+ACK retransmissions allowed while establishing a connection
    __u8              icsk_probes_out;// unacknowledged TCP segments sent periodically by the persist timer and similar probes
    __u16              icsk_ext_hdr_len;
    struct {
        __u8          pending;     /* ACK is pending: urgency and state of the ACK that has to be sent       */
        __u8          quick;     /* Scheduled number of quick acks (quick-ack mode)       */
        __u8          pingpong;     /* The session is interactive: switches quick-ack mode; 1 = delay ACKs, 0 = send ACKs quickly       */
        __u8          blocked;     /* Delayed ACK was blocked by socket lock. Softirq context and the
        user process cannot own the sk at the same time: if the delayed-ACK timer fires while the sk is
        owned by the user, the ACK must not be sent at that point and blocked is set to 1. That marks the
        ACK to be sent at the first opportunity, e.g. right after received data has been copied to the user. */
        __u32          ato;         /* Predicted tick of soft clock: estimate for the delayed ACK       */
        unsigned long      timeout;     /* Currently scheduled timeout; when it expires the ACK is sent immediately       */
        __u32          lrcvtime;     /* timestamp of last received data packet */
        __u16          last_seg_size; /* Size of last incoming segment; used to compute rcv_mss       */
        __u16          rcv_mss;     /* MSS used for delayed ACK decisions; computed from recently received segments       */ 
    } icsk_ack; // delayed-ACK control block
    struct {
        int          enabled;// whether path MTU probing is enabled

        /* Range of MTUs to search  */
        int          search_high;
        int          search_low;

        /* Information on the current probe: size of the current MTU probe, used to tell whether probing has finished; initial value 0. */
        int          probe_size;

        u32          probe_timestamp;
    } icsk_mtup;
    u32              icsk_user_timeout;

    /* 64 / sizeof(u64) elements of u64; presumably private scratch space
     * for the congestion control module behind icsk_ca_ops (TODO confirm).
     * ICSK_CA_PRIV_SIZE below is 8 * sizeof(u64) bytes.
     */
    u64              icsk_ca_priv[64 / sizeof(u64)];
#define ICSK_CA_PRIV_SIZE      (8 * sizeof(u64))
};

 

这篇关于 TCP 拥塞控制相关数据结构的文章就介绍到这儿,希望我们推荐的文章对大家有所帮助,也希望大家多多支持为之网!