Linux下的网络编程离不开socket,中文翻译为“套接字”。任何网络通信都必须先建立socket,再通过socket与对方收发数据!接收连接的demo代码如下:
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

#define SET_PORT 3490

/*
 * Minimal TCP server demo: create a socket, bind it to SET_PORT on all
 * local addresses, listen, and accept a single connection.
 *
 * Returns 0 on success, 1 if any socket call fails (the failing call is
 * reported through perror()).
 */
int main(void)
{
	int sockfd;
	int new_fd;
	int rc = 1;
	struct sockaddr_in my_addr;
	struct sockaddr_in their_addr;
	socklen_t sin_size; /* accept() takes a socklen_t *, not an int * */

	sockfd = socket(PF_INET, SOCK_STREAM, 0);
	if (sockfd < 0) {
		perror("socket");
		return 1;
	}

	/* Zero the whole structure (this also clears sin_zero). */
	memset(&my_addr, 0, sizeof(my_addr));
	my_addr.sin_family = AF_INET;
	my_addr.sin_port = htons(SET_PORT);
	my_addr.sin_addr.s_addr = htonl(INADDR_ANY);

	/* Bind the socket to the local address. */
	if (bind(sockfd, (struct sockaddr *)&my_addr, sizeof(my_addr)) < 0) {
		perror("bind");
		goto out_close;
	}

	/* Start listening; at most 10 pending connections are queued. */
	if (listen(sockfd, 10) < 0) {
		perror("listen");
		goto out_close;
	}

	/* Accept one incoming connection. */
	sin_size = sizeof(their_addr);
	new_fd = accept(sockfd, (struct sockaddr *)&their_addr, &sin_size);
	if (new_fd < 0) {
		perror("accept");
		goto out_close;
	}

	close(new_fd);
	rc = 0;

out_close:
	close(sockfd);
	return rc;
}
可以看出,需要先调用socket函数建立socket,再绑定套接字,最后监听并接受连接。 这个socket到底是啥?Linux在内核中又是怎么使用它的呢?
1、(1)socket是个结构体,字段不多,但是嵌套了其他结构体,各种嵌套关系表示如下:
(2)socket结构体有了,接下来就是创建和初始化了!linux内核创建socket的函数是__sock_create,核心代码如下:
/*
 * Excerpt of __sock_create(); the dotted runs mark code that the article
 * elided. Only the two steps discussed in the text are shown: allocating
 * the struct socket and calling the protocol family's create hook.
 */
int __sock_create(struct net *net, int family, int type, int protocol,
		  struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	.........

	/*
	 * Allocate the socket and allow the family to set things up. if
	 * the protocol is 0, the family is instructed to select an appropriate
	 * default.
	 *
	 * Author's note: in essence this creates the struct socket, which
	 * lives alongside an inode and is looked up and managed through the
	 * superblock.
	 */
	sock = sock_alloc();

	.........

	/*
	 * Author's note: the protocol-specific part of the socket is created
	 * here. For PF_INET the hook is inet_create, as registered in
	 * af_inet.c:
	 *
	 *   static const struct net_proto_family inet_family_ops = {
	 *           .family = PF_INET,
	 *           .create = inet_create,
	 *           .owner  = THIS_MODULE,
	 *   };
	 */
	err = pf->create(net, sock, protocol, kern);

	..................
}
创建socket的核心函数就2个:sock_alloc,还有pf->create!先看第一个sock_alloc,代码如下:
/** * sock_alloc - allocate a socket * * Allocate a new inode and socket object. The two are bound together * and initialised. The socket is then returned. If we are out of inodes * NULL is returned. 明明是申请socket,底层却分配inode,这是为啥了? 1、socket也需要管理,放在inode后通过super_bloc统一检索和管理 2、socket的属性字段自然也存放在inode节点了 3、符合万物皆文件的理念 */ struct socket *sock_alloc(void) { struct inode *inode; struct socket *sock; //从超级块里分配一个inode inode = new_inode_pseudo(sock_mnt->mnt_sb); if (!inode) return NULL; /*把inode和socket绑定在一起,通过inode寻址socket,便于管理*/ sock = SOCKET_I(inode); kmemcheck_annotate_bitfield(sock, type);//标记shadow memory来表示这块内存已经使用了 inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_op = &sockfs_inode_ops; this_cpu_add(sockets_in_use, 1); return sock; }
本质上就是分配一个inode,然后和socket结构体绑定,通过inode寻址socket结构体!socket结构体有了,接下来就是在socket内部嵌套的sock结构体了!其生成和初始化的工作都是在inet_create内部完成的,代码如下:
/*
 * Create the PF_INET part of a socket: look up the (type, protocol) entry
 * in inetsw[], allocate a struct sock for it, initialise the sock and its
 * inet_sock view, and attach it to @sock.
 *
 * @net:      network namespace the socket belongs to
 * @sock:     the already allocated struct socket to set up
 * @protocol: requested protocol; IPPROTO_IP acts as a wildcard
 * @kern:     non-zero for kernel-internal sockets (skips the CAP_NET_RAW
 *            check for SOCK_RAW)
 *
 * Returns 0 on success or a negative errno value (-EINVAL,
 * -ESOCKTNOSUPPORT, -EPROTONOSUPPORT, -EPERM, -ENOBUFS, or whatever the
 * protocol's hash()/init() hooks return).
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	int try_loading_module = 0;
	int err;

	if (protocol < 0 || protocol >= IPPROTO_MAX)
		return -EINVAL;

	sock->state = SS_UNCONNECTED; /* a new socket starts out unconnected */

	/* Look for the requested type/protocol pair. */
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
	}

	if (unlikely(err)) {
		/* No match: try to load a module, at most twice, then retry. */
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
					       PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
					       PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (sock->type == SOCK_RAW && !kern &&
	    !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out_rcu_unlock;

	/* Copy what is needed from the entry before leaving the RCU section. */
	sock->ops = answer->ops;
	answer_prot = answer->prot;
	answer_flags = answer->flags;
	rcu_read_unlock();

	WARN_ON(!answer_prot->slab);

	err = -ENOBUFS;
	/*
	 * Allocate and initialise the struct sock instance (author's note:
	 * the memory comes from a per-CPU cache or the heap).
	 */
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
	if (!sk)
		goto out;

	err = 0;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = SK_CAN_REUSE;

	/*
	 * Author's notes:
	 * 1. view the sock as an inet_sock so initialisation can continue;
	 * 2. inet and sk still refer to the same memory, so both pointers
	 *    can be used side by side.
	 */
	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	inet->nodefrag = 0;

	if (SOCK_RAW == sock->type) {
		inet->inet_num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (net->ipv4.sysctl_ip_no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->inet_id = 0;

	/*
	 * sock_init_data():
	 * 1. initialises the sk_buff receive, write and error queues;
	 * 2. links the socket and sock instances together;
	 * 3. installs the sock's default callbacks;
	 * 4. initialises the remaining sock fields.
	 */
	sock_init_data(sock, sk);

	sk->sk_destruct	   = inet_sock_destruct; /* callback run on destruction */
	sk->sk_protocol	   = protocol; /* protocol number */
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	/* sk and inet are used alternately for the rest of the setup. */
	inet->uc_ttl	= -1;
	inet->mc_loop	= 1;
	inet->mc_ttl	= 1;
	inet->mc_all	= 1;
	inet->mc_index	= 0;
	inet->mc_list	= NULL;
	inet->rcv_tos	= 0;

	/*
	 * Author's note: "reference count +1".
	 * NOTE(review): this looks like a debug socket counter rather than
	 * sk_refcnt itself -- confirm against sk_refcnt_debug_inc().
	 */
	sk_refcnt_debug_inc(sk);

	if (inet->inet_num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->inet_sport = htons(inet->inet_num);
		/* Add to protocol hash chains. */
		err = sk->sk_prot->hash(sk);
		if (err) {
			sk_common_release(sk);
			goto out;
		}
	}

	/* Protocol-specific initialisation hook, if the protocol has one. */
	if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}
整个逻辑并不复杂,先是调用sk_alloc函数生成sock实例,再调用sock_init_data初始化sock实例,并和socket实例关联,所以我个人认为sock_init_data是最核心的函数,如下:
/* 1、初始化sk_buff的读、写、错误队列 2、关联socket和sock的实例 3、定义sock的回调函数 4、初始化其他sock字段 */ void sock_init_data(struct socket *sock, struct sock *sk) { /*初始化sk_buff的读写、错误队列*/ skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); sk->sk_send_head = NULL; //初始化定时器 init_timer(&sk->sk_timer); sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = sysctl_rmem_default; sk->sk_sndbuf = sysctl_wmem_default; sk->sk_state = TCP_CLOSE; //这里终于把socket和sock实例关联起来了 sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); if (sock) { sk->sk_type = sock->type; sk->sk_wq = sock->wq; sock->sk = sk; } else sk->sk_wq = NULL; rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); sk->sk_state_change = sock_def_wakeup;//状态改变后的回调函数 sk->sk_data_ready = sock_def_readable;//有数据可读的回调函数 sk->sk_write_space = sock_def_write_space;//有缓存可写的回调函数 sk->sk_error_report = sock_def_error_report;//发生io错误时的回调函数 sk->sk_destruct = sock_def_destruct; sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; sk->sk_peek_off = -1; sk->sk_peer_pid = NULL; sk->sk_peer_cred = NULL; sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = ktime_set(-1L, 0); #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; sk->sk_ll_usec = sysctl_net_busy_read; #endif sk->sk_max_pacing_rate = ~0U; sk->sk_pacing_rate = ~0U; sk->sk_incoming_cpu = -1; /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) */ smp_wmb(); atomic_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_drops, 0); }
上面注册了几个回调函数,它们的实现逻辑和代码结构基本一致:
/* * Default Socket Callbacks 当sock的状态发生改变时,会调用此函数来进行处理 */ static void sock_def_wakeup(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq))//有进程阻塞在这个socket //唤醒所有在等待这个socket的进程,核心就是执行进程唤醒的回调函数 wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } /*sock有输入数据可读时,会调用此函数来处理*/ static void sock_def_readable(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) /* 唤醒等待数据的进程,核心还是执行回调函数 */ wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND); /* 异步通知队列的处理。 * 检查应用程序是否通过recv()类调用来等待接收数据,如果没有就发送SIGIO信号, * 告知它有数据可读。 * how为函数的处理方式,band为用来告知的IO类型。 */ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); }
当有可读数据的时候,需要第一时间通知相应的进程来读取数据:sock_def_readable先唤醒阻塞在等待队列上的进程,再通过sk_wake_async做异步通知;而sk_wake_async最终调用了kill_fasync_rcu,向异步通知链表中登记的进程发出SIGIO信号,通知这些进程来取数据!异步的好处在这里就凸显了:进程不用在这里空转等数据,而是可以释放cpu去执行其他进程的代码;等socket有数据后再通过类似中断的形式通知等待的进程来取数据!
/* * rcu_read_lock() is held 函数名有kill,但实际是向队列的进程发送SIGIO信号 */ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct *fown; unsigned long flags; if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } spin_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { fown = &fa->fa_file->f_owner; /* Don't send SIGURG to processes which have not set a queued signum: SIGURG has its own default signalling mechanism. */ if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } spin_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } }