
Linux Kernel Protocol Stack (Appendix 4): What is the local port of the new socket returned by accept()?

2016-04-01 21:19
Background: a couple of days ago someone argued with me about which port the new connection socket obtained from accept() uses. I said it is the original listening port, because I often inspect connections with netstat: take ssh, which listens on port 22, and every connection established between a client and the server shows port 22 on the server side. My colleague believed a random free port is picked instead, on the grounds that if the port were always the same, how would the connections be told apart? My mind went blank and I could not refute him. It really was one of those moments: you are wrong, yet I cannot find the argument to prove it, simply because I had never looked closely. Let's sort it out now.

There are two possibilities:

1. The new socket shares the port of the listening socket.

2. A random free system port is picked.

The way to get the answer is to study the kernel protocol stack.

Before looking at the code, let's review the TCP three-way handshake:

Step 1: the client sends a SYN packet and waits for the server's reply.

Step 2: the server receives the SYN and replies with a SYN+ACK packet.

Step 3: the client receives the SYN+ACK and replies with an ACK packet; the connection is established.

In the Linux kernel protocol stack this looks as follows.

On the server side:

The socket() call creates and initialises the sock object; bind() binds the local address and listening port; listen() allocates the listen queues for the sock object and inserts the sock into the listening hash table (one of TCP's three global hash tables). When a connection request arrives, the corresponding sock object is looked up in the listening hash table and handles the request.
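
Before diving into the kernel source, here is a quick user-space sketch of the sequence just described (my own illustration, not part of the original analysis; port 12345 is an arbitrary choice and error handling is omitted). It prints the local port of the listening socket and of the accepted socket with getsockname(), which is an easy way to check the answer empirically:

/* Minimal illustrative server: compare the local port of the listening
 * socket with the local port of the socket returned by accept().
 * This is a verification sketch, not kernel code. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static void print_local_port(const char *tag, int fd)
{
	struct sockaddr_in addr;
	socklen_t len = sizeof(addr);

	if (getsockname(fd, (struct sockaddr *)&addr, &len) == 0)
		printf("%s local port: %u\n", tag, ntohs(addr.sin_port));
}

int main(void)
{
	struct sockaddr_in srv;
	int lfd = socket(AF_INET, SOCK_STREAM, 0);
	int cfd;

	memset(&srv, 0, sizeof(srv));
	srv.sin_family = AF_INET;
	srv.sin_addr.s_addr = htonl(INADDR_ANY);
	srv.sin_port = htons(12345);            /* arbitrary listening port */

	bind(lfd, (struct sockaddr *)&srv, sizeof(srv));
	listen(lfd, 128);
	print_local_port("listening socket", lfd);

	cfd = accept(lfd, NULL, NULL);          /* blocks until a client connects */
	print_local_port("accepted socket", cfd);

	close(cfd);
	close(lfd);
	return 0;
}

Connecting a client (for example with nc 127.0.0.1 12345) and comparing the two printed ports gives the empirical answer; the rest of this article derives the same result from the kernel source.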

Handling a request is just the three-way handshake:

Step 1: the client sends a SYN packet to request a connection.

Step 2: the server receives the SYN. Using the destination address and destination port of the packet, it looks up the corresponding sock object in the listening hash table and then searches that sock's request queue for a matching request (for the first SYN there is of course no request_sock object yet). Since none is found, a request_sock object representing the request is created in the queue, and a SYN+ACK packet is sent back to the client.

Step 3: the client receives the SYN+ACK and replies with an ACK. When the ACK reaches the server, the server again looks up the sock object in the listening hash table, finds the request's request_sock object in that sock's request queue (this time it is found, because it was created while handling the SYN), builds a new sock object describing the connection, and appends the request_sock object, together with the newly built sock object, to the tail of the accept queue icsk_accept_queue.

When a user-space process then calls accept(), the request_sock object and the new sock object are taken off the accept queue, and the new sock object is returned to user space. This is the sock object behind the new socket that accept() returns; let's call it the child sock. So what is the port of this child sock: a randomly chosen one, or the same as the original sock's?

Let's first look at inet_sock, which "inherits" from the sock object and is the structure that represents a connection's sock object in the INET protocol family:

/** struct inet_sock - representation of INET sockets
*
* @sk - ancestor class
* @pinet6 - pointer to IPv6 control block
* @daddr - Foreign IPv4 addr
* @rcv_saddr - Bound local IPv4 addr
* @dport - Destination port
* @num - Local port
* @saddr - Sending source
* @uc_ttl - Unicast TTL
* @sport - Source port
* @id - ID counter for DF pkts
* @tos - TOS
* @mc_ttl - Multicasting TTL
* @is_icsk - is this an inet_connection_sock?
* @mc_index - Multicast device index
* @mc_list - Group array
* @cork - info to build ip hdr on each ip frag while socket is corked
*/
struct inet_sock {
/* sk and pinet6 has to be the first two members of inet_sock */
struct sock		sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct ipv6_pinfo	*pinet6;
#endif
/* Socket demultiplex comparisons on incoming packets. */
__be32			daddr;
__be32			rcv_saddr;
__be16			dport;
__u16			num;	/* local port, host byte order */
__be32			saddr;
__s16			uc_ttl;
__u16			cmsg_flags;
struct ip_options	*opt;
__be16			sport;	/* local port, network byte order */
__u16			id;
__u8			tos;
__u8			mc_ttl;
__u8			pmtudisc;
__u8			recverr:1,
is_icsk:1,
freebind:1,
hdrincl:1,
mc_loop:1,
transparent:1,
mc_all:1;
int			mc_index;
__be32			mc_addr;
struct ip_mc_socklist	*mc_list;
struct {
unsigned int		flags;
unsigned int		fragsize;
struct ip_options	*opt;
struct dst_entry	*dst;
int			length; /* Total length of all frames */
__be32			addr;
struct flowi		fl;
} cork;
};

You can see it has two fields representing the local port: num in host byte order and sport in network byte order.
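
A small aside (my own snippet, not from the kernel source): the two fields carry the same port value, only in different byte orders, which htons()/ntohs() make obvious:

/* Illustration of num (host byte order) vs. sport (network byte order)
 * for a port value such as 22. */
#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
	unsigned short num = 22;            /* host byte order, like inet_sock.num */
	unsigned short sport = htons(22);   /* network byte order, like inet_sock.sport */

	printf("num   = 0x%04x (%u)\n", num, num);
	printf("sport = 0x%04x (%u)\n", sport, ntohs(sport));
	return 0;
}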

Next, let's analyse the three-way handshake and the accept() path together with the protocol-stack source.

When a packet arrives at the NIC, a TCP segment roughly goes through the following call chain:

NIC driver --> netif_receive_skb() --> ip_rcv() --> ip_local_deliver_finish() --> tcp_v4_rcv()

Let's look at tcp_v4_rcv() first:

int tcp_v4_rcv(struct sk_buff *skb)
{
...
/*
 * Look up the sock object by source port, destination port and receiving
 * interface: first in the established-connections hash table, and, if
 * nothing is found there, in the listening hash table.
 */
sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
if (!sk)
goto no_tcp_socket;
...
if (!tcp_prequeue(sk, skb))
ret = tcp_v4_do_rcv(sk, skb); /* process the skb with the sock object we found */
...
}

In essence, the sock object that will handle the packet is looked up by the packet's addresses and ports (ending up in the listen hash table for a new connection), and tcp_v4_do_rcv() then processes the packet with that sock object.
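
It helps to picture that lookup. The following toy model (my own illustration with made-up structures and data, not the real kernel code) captures the idea behind __inet_lookup_skb(): established connections are matched on the full 4-tuple of remote address, remote port, local address and local port, and only when that fails is the listening sock matched on the local side alone:

/* Toy model of TCP demultiplexing; the real logic lives in
 * __inet_lookup_skb() and friends. */
#include <stdint.h>
#include <stdio.h>

struct toy_sock {
	uint32_t saddr, daddr;   /* remote and local IPv4 addresses */
	uint16_t sport, dport;   /* remote and local ports */
	const char *name;
};

/* Two established connections and one listener, all on local port 22. */
static struct toy_sock established[] = {
	{ 0x0a000002, 0x0a000001, 40001, 22, "connection from client A" },
	{ 0x0a000003, 0x0a000001, 40002, 22, "connection from client B" },
};
static struct toy_sock listener = { 0, 0x0a000001, 0, 22, "listening sock" };

static const struct toy_sock *demux(uint32_t saddr, uint16_t sport,
				    uint32_t daddr, uint16_t dport)
{
	size_t i;

	/* 1. Exact match on the 4-tuple: this is how several connections
	 *    can share the same local port. */
	for (i = 0; i < sizeof(established) / sizeof(established[0]); i++)
		if (established[i].saddr == saddr && established[i].sport == sport &&
		    established[i].daddr == daddr && established[i].dport == dport)
			return &established[i];

	/* 2. Fall back to the listener, matched on the local side only. */
	if (listener.daddr == daddr && listener.dport == dport)
		return &listener;
	return NULL;
}

int main(void)
{
	/* A segment from client A and a brand-new SYN from client C both
	 * carry local port 22, yet land on different socks. */
	printf("%s\n", demux(0x0a000002, 40001, 0x0a000001, 22)->name);
	printf("%s\n", demux(0x0a000004, 40003, 0x0a000001, 22)->name);
	return 0;
}

This is also, in advance, the answer to the objection from the beginning of the article: connections that share the same local port are told apart by the client's address and port.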

/* The socket must have it's spinlock held when we get
* here.
*
* We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme.
* This is because we cannot sleep with the original spinlock
* held.
*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
/*
* We really want to reject the packet as early as possible
* if:
*  o We're expecting an MD5'd packet and this is no MD5 tcp option
*  o There is an MD5 option and we're not expecting one
*/
if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard;
#endif

if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
TCP_CHECK_TIMER(sk);
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
TCP_CHECK_TIMER(sk);
return 0;
}
/*****************************************************************/
if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
goto csum_err;

if (sk->sk_state == TCP_LISTEN) { /* LISTEN state: handle the handshake */

/*
 * Look up the request_sock in the listen queues. For the first SYN no
 * request_sock (and no child sock) has been built yet, so the listening
 * sk passed in is simply returned.
 */
struct sock *nsk = tcp_v4_hnd_req(sk, skb);
if (!nsk)
goto discard;

/* For the first SYN, nsk is just sk, so the if block below is skipped and we continue past it. */
if (nsk != sk) {
if (tcp_child_process(sk, nsk, skb)) { /* runs tcp_rcv_state_process(), the TCP state machine */
rsk = nsk;
goto reset;
}
return 0;
}
}

TCP_CHECK_TIMER(sk);
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { /* run the state machine */
rsk = sk;
goto reset;
}
TCP_CHECK_TIMER(sk);
return 0;

reset:
tcp_v4_send_reset(rsk, skb);
discard:
kfree_skb(skb);
/* Be careful here. If this function gets more complicated and
* gcc suffers from register pressure on the x86, sk (in %ebx)
* might be destroyed here. This current version compiles correctly,
* but you have been warned.
*/
return 0;

csum_err:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}


Before explaining this function, let's look at the accept queue inside the sock object:

struct inet_connection_sock {
/* inet_sock has to be the first member! */
struct inet_sock	  icsk_inet;
struct request_sock_queue icsk_accept_queue; /* the accept queue */
struct inet_bind_bucket	  *icsk_bind_hash;
unsigned long		  icsk_timeout;
struct timer_list	  icsk_retransmit_timer;
struct timer_list	  icsk_delack_timer;
...
}

icsk_accept_queue is a member of the connection sock (inet_connection_sock); the queue structure itself is:

struct request_sock_queue {
struct request_sock	*rskq_accept_head;
struct request_sock	*rskq_accept_tail;
rwlock_t		syn_wait_lock;
u8			rskq_defer_accept;
/* 3 bytes hole, try to pack */
struct listen_sock	*listen_opt;
};

There are the head and tail of the queue of request objects; now look at listen_sock:

/** struct listen_sock - listen state
*
* @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs
*/
struct listen_sock {
u8			max_qlen_log;
/* 3 bytes hole, try to use */
int			qlen;
int			qlen_young;
int			clock_hand;
u32			hash_rnd;
u32			nr_table_entries;
struct request_sock	*syn_table[0];
};


Good. With these three data structures covered, let's continue with tcp_v4_do_rcv(), which calls tcp_v4_hnd_req(), and explain how the kernel handles the three-way handshake.

Step 1: a SYN requesting a connection arrives, and the sock object handling it is looked up. That sock's request queue is searched for a matching request_sock; since this is the first SYN of the request, no request_sock exists yet and nothing is found. The established-connections hash table is searched next, where a match is even less likely, so the listening sk itself is returned.

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = tcp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
struct sock *nsk;
struct request_sock **prev;
/* Find possible connection requests. */
struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, /* search the sock's request queue for the request object */
iph->saddr, iph->daddr);

/* A request_sock was found in the queue: the SYN has already been received and a SYN+ACK sent. The packet being handled now is the ACK of the third handshake step, so go build the new sock object. */
if (req)
return tcp_check_req(sk, skb, req, prev);

nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, /* not found: the connection may already be established and its sock moved to the established hash table, so search there */
th->source, iph->daddr, th->dest, inet_iif(skb));

if (nsk) {
if (nsk->sk_state != TCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
inet_twsk_put(inet_twsk(nsk));
return NULL;
}

#ifdef CONFIG_SYN_COOKIES
if (!th->rst && !th->syn && th->ack)
sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
return sk;
}

Back in tcp_v4_do_rcv(): as analysed above, what is returned is sk itself, so the if condition does not hold, the code inside the if block is not executed, and execution continues right after it.

/* For the first SYN, nsk is just sk, so the if block below is skipped and we continue past it. */
if (nsk != sk) {
if (tcp_child_process(sk, nsk, skb)) { /* runs tcp_rcv_state_process(), the TCP state machine */
rsk = nsk;
goto reset;
}
return 0;
}

Now step into tcp_rcv_state_process(). The sock is in the LISTEN state and the packet received is a SYN:

int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
int queued = 0;
int res;

tp->rx_opt.saw_tstamp = 0;

switch (sk->sk_state) {
case TCP_CLOSE:
goto discard;

case TCP_LISTEN:
if (th->ack)
return 1;

if (th->rst)
goto discard;

/* in the LISTEN state only SYN packets are handled */
if (th->syn) {
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)  /* tcp_v4_init_sock -> ipv4_specific -> tcp_v4_conn_request */
return 1;

/* Now we have several options: In theory there is
* nothing else in the frame. KA9Q has an option to
* send data with the syn, BSD accepts data with the
* syn up to the [to be] advertised window and
* Solaris 2.1 gives you a protocol error. For now
* we just ignore it, that fits the spec precisely
* and avoids incompatibilities. It would be nice in
* future to drop through and process the data.
*
* Now that TTCP is starting to be used we ought to
* queue this data.
* But, this leaves one open to an easy denial of
* service attack, and SYN cookies can't defend
* against this problem. So, we drop the data
* in the interest of security over speed unless
* it's still in use.
*/
kfree_skb(skb);
return 0;
}
goto discard;

case TCP_SYN_SENT:
queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
if (queued >= 0)
return queued;
...
}

So the following block is executed:

/* in the LISTEN state only SYN packets are handled */
if (th->syn) {
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)  /* tcp_v4_init_sock -> ipv4_specific -> tcp_v4_conn_request */

The function actually called is tcp_v4_conn_request(). What exactly does it do? It builds and initialises a request_sock object, hangs the request on the listening sock's request queue, and replies with a SYN+ACK packet; in other words, it performs the second step of the handshake. One thing everyone should note: it also initialises the request object, including its ports. The port is exactly what we care about; we won't analyse it yet, but we will come back to it, so keep it in mind.

The key lines of this function:

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
...

req = inet_reqsk_alloc(&tcp_request_sock_ops); /* build the request_sock object; for TCP it is actually a tcp_request_sock (see tcp_prot / proto_register) */
if (!req)
goto drop;
...

tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

/* initialise the ireq: ports and so on */
tcp_openreq_init(req, &tmp_opt, skb);

ireq = inet_rsk(req);
ireq->loc_addr = daddr;
ireq->rmt_addr = saddr;
ireq->no_srccheck = inet_sk(sk)->transparent;
ireq->opt = tcp_v4_save_options(sk, skb);
...
/* reply with the SYN+ACK packet */
if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
goto drop_and_free;

inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); /* hang the request on the listening sock's request queue */
...
}


With the SYN+ACK sent, the second step of the handshake is done, and the server now waits for the client's ACK.

The client then receives the SYN+ACK and sends back an ACK, performing the third step of the handshake. When this ACK reaches the server, it again goes through tcp_v4_rcv(), the sock object handling the packet is found again, and we enter tcp_v4_do_rcv().

tcp_v4_hnd_req() is called again and searches the sock's queues for the corresponding request_sock. Because the request_sock was built and queued when the first SYN was received, this time the lookup definitely succeeds:

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = tcp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
struct sock *nsk;
struct request_sock **prev;
/* Find possible connection requests. */
struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, /* search the sock's request queue for the request object */
iph->saddr, iph->daddr);

/* A request_sock was found in the queue: the SYN has already been received and a SYN+ACK sent. The packet being handled now is the ACK of the third handshake step, so go build the new sock object. */
if (req)
return tcp_check_req(sk, skb, req, prev);

Looking at tcp_v4_hnd_req(), this means the function inside the if block, tcp_check_req(), is executed. What does it do? Mainly two things:

1) It builds the child sock corresponding to the request object (note: this sock is exactly the one we obtain when calling accept()) and attaches it to the request_sock.

2) It moves the request_sock from the request queue to the accept queue and returns the child sock object.

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct request_sock **prev)
{
...
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); /* tcp_v4_init_sock -> ipv4_specific -> tcp_v4_syn_recv_sock */
if (child == NULL)
goto listen_overflow;

inet_csk_reqsk_queue_unlink(sk, req, prev);
inet_csk_reqsk_queue_removed(sk, req);

inet_csk_reqsk_queue_add(sk, req, child); /* add the request_sock, together with its child sock, to the accept queue */
return child;
}


So the new connection now has its child sock object. When accept() is called in user space, the request object is taken from the accept queue and the corresponding child sock is returned. Back to our question: what is the port of the new sock? Is a free port picked at random, or is it the port of the original listening sock?
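
The article does not quote the kernel-side accept path itself, so here is a paraphrased sketch of what it does (simplified stand-in types, not actual kernel source; the real function is inet_csk_accept() and its details vary between kernel versions). The point is that accept() only dequeues the head request from the accept queue and returns the child sock already attached to it; it never allocates a port of its own:

#include <stddef.h>

/* Stand-ins for struct sock, struct request_sock and request_sock_queue. */
struct toy_child;

struct toy_request {
	struct toy_request *next;
	struct toy_child   *child;     /* attached when the third handshake step completed */
};

struct toy_accept_queue {
	struct toy_request *head, *tail;
};

/* Sketch of the kernel-side accept path. */
struct toy_child *toy_accept(struct toy_accept_queue *q)
{
	struct toy_request *req = q->head;
	struct toy_child *child;

	if (!req)
		return NULL;           /* a real accept() would block or return EAGAIN here */

	q->head = req->next;           /* unlink the request from the accept queue */
	if (!q->head)
		q->tail = NULL;

	child = req->child;            /* the child sock built by tcp_check_req() */
	/* the real kernel also frees the request_sock here */
	return child;                  /* this sock is what the new file descriptor refers to */
}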

Let's see how the new child sock is built. Back in tcp_check_req() above, the construction of the new sock object happens in the following code:

/* OK, ACK is valid, create big socket and
* feed this segment to it. It will repeat all
* the tests. THIS SEGMENT MUST MOVE SOCKET TO
* ESTABLISHED STATE. If it will be dropped after
* socket is created, wait for troubles.
*
* Build the new sock object.
*
*/
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); /* tcp_v4_init_sock -> ipv4_specific -> tcp_v4_syn_recv_sock */
if (child == NULL)
goto listen_overflow;

What is actually called is tcp_v4_syn_recv_sock(); let's look at it:

/*
* The three way handshake has completed - we got a valid synack -
* now create the new socket.
*
* The third handshake step has succeeded, so build the new sock object.
* (Called from tcp_check_req().)
*/
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst)
{
struct inet_request_sock *ireq;
struct inet_sock *newinet;
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *key;
#endif

if (sk_acceptq_is_full(sk))
goto exit_overflow;

if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
goto exit;

newsk = tcp_create_openreq_child(sk, req, skb); /* build the new sock object */
if (!newsk)
goto exit;

newsk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(newsk, dst);
..
}

Scanning this whole function, there is nothing about ports, so let's step into the constructor tcp_create_openreq_child():

/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
*
* Actually, we could lots of memory writes here. tp of listening
* socket contains all necessary default parameters.
*/
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); /* clone the listening sock object */

if (newsk != NULL) {
const struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk = inet_csk(newsk);
struct tcp_sock *newtp;

/* Now setup tcp_sock */
newtp = tcp_sk(newsk);
newtp->pred_flags = 0;
newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;
newtp->snd_up = treq->snt_isn + 1;

tcp_prequeue_init(newtp);
...

Reading through this function's code, there is still nothing obviously related to ports, but we do see that the new sock object comes from cloning, in inet_csk_clone():

struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
const gfp_t priority)
{
struct sock *newsk = sk_clone(sk, priority);

if (newsk != NULL) {
struct inet_connection_sock *newicsk = inet_csk(newsk);

newsk->sk_state = TCP_SYN_RECV;
newicsk->icsk_bind_hash = NULL;

inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port); /* local port, host byte order */
inet_sk(newsk)->sport = inet_rsk(req)->loc_port; /* local port, network byte order */
newsk->sk_write_space = sk_stream_write_space;

newicsk->icsk_retransmits = 0;
newicsk->icsk_backoff	  = 0;
newicsk->icsk_probes_out  = 0;

/* Deinitialize accept_queue to trap illegal accesses. */
memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));

security_inet_csk_clone(newsk, req);
}
return newsk;
}

At last, here is the port information, and it is taken from the request object. Remember the earlier reminder about the initialisation of the request object? Let's now look at how that initialisation is done.

Here is the code that builds the request object again:

/* initialise the ireq: ports and so on */
tcp_openreq_init(req, &tmp_opt, skb);

ireq = inet_rsk(req);
ireq->loc_addr = daddr;
ireq->rmt_addr = saddr;
ireq->no_srccheck = inet_sk(sk)->transparent;
ireq->opt = tcp_v4_save_options(sk, skb);

Go straight into the function:

static inline void tcp_openreq_init(struct request_sock *req,
struct tcp_options_received *rx_opt,
struct sk_buff *skb)
{
struct inet_request_sock *ireq = inet_rsk(req);

req->rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
req->cookie_ts = 0;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;
ireq->sack_ok = rx_opt->sack_ok;
ireq->snd_wscale = rx_opt->snd_wscale;
ireq->wscale_ok = rx_opt->wscale_ok;
ireq->acked = 0;
ireq->ecn_ok = 0;
ireq->rmt_port = tcp_hdr(skb)->source; /* the client's (remote) port */
ireq->loc_port = tcp_hdr(skb)->dest; /* the local port, i.e. the packet's destination port */
}


Combining the code above, we get the following chains of assignments:

inet_sk(newsk)->num   = ntohs(inet_rsk(req)->loc_port) = ntohs(tcp_hdr(skb)->dest)
inet_sk(newsk)->sport = inet_rsk(req)->loc_port        = tcp_hdr(skb)->dest

The destination port! When the client sends its SYN request, isn't the destination port precisely the listening port? At this point everything is clear: there is no such thing as randomly picking a free system port; the new socket uses the listening sock's port. And how are multiple socks sharing the same local port distinguished? By the client's address and port, of course.
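
To see the same thing from user space, here is a small illustrative helper (my own, not from the article): call it on each file descriptor returned by accept() in a server such as the sketch shown earlier, and every connection prints the same local port, distinguished only by the peer's address and port.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

/* Print the local port and the peer address:port of an accepted socket. */
void print_connection(int cfd)
{
	struct sockaddr_in local, peer;
	socklen_t llen = sizeof(local), plen = sizeof(peer);

	if (getsockname(cfd, (struct sockaddr *)&local, &llen) == 0 &&
	    getpeername(cfd, (struct sockaddr *)&peer, &plen) == 0)
		printf("local port %u <- peer %s:%u\n",
		       ntohs(local.sin_port),
		       inet_ntoa(peer.sin_addr), ntohs(peer.sin_port));
}

Both the kernel walk-through and this quick check agree: the local port of the accepted socket is the listening port.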