您的位置:首页 > 运维架构 > Linux

【Linux4.1.12源码分析】IP层报文发送之ip_output

2016-09-26 22:01 615 查看
上一篇提到ip_local_out函数最终会调用ip_output完成报文发送,本篇分析ip_output的处理过程。

1、ip_output函数

/*
 * ip_output - hand an IPv4 packet to the POST_ROUTING hook on its way out.
 * @sk:  socket passed through unchanged to the hook and to ip_finish_output
 * @skb: packet to send; its dst entry supplies the output device
 *
 * Accounts the packet in the IPSTATS_MIB_OUT counters, stamps the output
 * device and the IPv4 protocol id on the skb, then invokes NF_HOOK_COND for
 * NF_INET_POST_ROUTING with ip_finish_output as the continuation.
 *
 * The final NF_HOOK_COND argument is the hook condition; it is false when
 * IPSKB_REROUTED is already set in the skb control block.
 * NOTE(review): a false condition is expected to bypass the hook and call
 * ip_finish_output directly -- confirm against the NF_HOOK_COND definition.
 *
 * Returns whatever NF_HOOK_COND returns.
 */
int ip_output(struct sock *sk, struct sk_buff *skb)
{
	struct net_device *out_dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(out_dev), IPSTATS_MIB_OUT, skb->len);

	/* Tag the packet as IPv4 leaving through the routed device. */
	skb->protocol = htons(ETH_P_IP);
	skb->dev = out_dev;

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, sk, skb,
			    NULL, out_dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
2、ip_finish_output函数

/*
 * ip_finish_output - choose how a packet leaves after the POST_ROUTING hook.
 * @sk:  socket forwarded to whichever output helper is selected
 * @skb: packet to send
 *
 * Dispatch order:
 *   1. (CONFIG_NETFILTER && CONFIG_XFRM only) the dst carries an xfrm state:
 *      set IPSKB_REROUTED and run the dst output path again;
 *   2. GSO packet: ip_finish_output_gso();
 *   3. packet longer than the dst MTU: ip_fragment(), with
 *      ip_finish_output2() as the per-fragment output function;
 *   4. otherwise: ip_finish_output2() directly.
 *
 * Returns the result of the helper that was selected.
 */
static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		/*
		 * ip_output() tests IPSKB_REROUTED as its hook condition, so
		 * mark the packet before sending it through the dst again.
		 * NOTE(review): dst_output_sk() presumably dispatches to the
		 * xfrm output handler of this dst -- confirm.
		 */
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output_sk(sk, skb);
	}
#endif
	if (!skb_is_gso(skb)) {
		/* Non-GSO: fragment only when the packet exceeds the MTU. */
		if (skb->len <= ip_skb_dst_mtu(skb))
			return ip_finish_output2(sk, skb);

		return ip_fragment(sk, skb, ip_finish_output2);
	}

	return ip_finish_output_gso(sk, skb);
}
3、ip_finish_output_gso函数

/*
 * ip_finish_output_gso - send a GSO packet, segmenting in software if needed.
 * @sk:  socket forwarded to the output helpers
 * @skb: GSO packet to send
 *
 * Fast path: when IPSKB_FORWARDED is not set, or the per-segment network
 * length fits the dst MTU, the skb goes straight to ip_finish_output2().
 * Otherwise the skb is segmented with the GSO feature bits masked off and
 * every resulting segment is pushed through ip_fragment().
 *
 * Returns -ENOMEM when segmentation yields an error or NULL; otherwise the
 * first non-zero ip_fragment() result, or 0 if every segment succeeded.
 */
static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
{
	netdev_features_t features;
	struct sk_buff *seg_list;
	struct sk_buff *seg;
	struct sk_buff *next;
	int ret = 0;

	/* common case: locally created skb or seglen is <= mtu */
	if ((IPCB(skb)->flags & IPSKB_FORWARDED) == 0)
		return ip_finish_output2(sk, skb);
	if (skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
		return ip_finish_output2(sk, skb);

	/* Slowpath -  GSO segment length is exceeding the dst MTU.
	 *
	 * This can happen in two cases:
	 * 1) TCP GRO packet, DF bit not set
	 * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
	 * from host network stack.
	 */
	features = netif_skb_features(skb);
	seg_list = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(seg_list)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	/* The segments replace the original skb, so release it. */
	consume_skb(skb);

	/* seg_list is non-NULL here, so the loop runs at least once. */
	for (seg = seg_list; seg; seg = next) {
		int err;

		/* Unlink the segment before handing it on. */
		next = seg->next;
		seg->next = NULL;

		err = ip_fragment(sk, seg, ip_finish_output2);

		/* Keep only the first failure; later segments are still sent. */
		if (ret == 0 && err)
			ret = err;
	}

	return ret;
}
4、ip_finish_output2函数

/*
 * ip_finish_output2 - resolve the next-hop neighbour and pass the packet on.
 * @sk:  socket (not referenced in this function body)
 * @skb: packet to send; its dst entry supplies the route and output device
 *
 * Steps:
 *   - bump the multicast/broadcast output counters according to the route
 *     type;
 *   - if the skb has less headroom than LL_RESERVED_SPACE(dev) and the device
 *     has header_ops, reallocate it with enough headroom, carrying over the
 *     owning socket;
 *   - under rcu_read_lock_bh(), look up the neighbour entry for the next hop
 *     (creating one in arp_tbl on a miss) and call dst_neigh_output().
 *
 * Returns the dst_neigh_output() result, -ENOMEM if the headroom
 * reallocation fails, or -EINVAL if no neighbour entry could be obtained.
 * The skb is freed on both error paths.
 */
static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	u32 nexthop;
	int rc;

	switch (rt->rt_type) {
	case RTN_MULTICAST:
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
		break;
	case RTN_BROADCAST:
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
		break;
	default:
		break;
	}

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *grown;

		grown = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (!grown) {
			kfree_skb(skb);
			return -ENOMEM;
		}

		/* Keep the replacement skb charged to the same socket. */
		if (skb->sk)
			skb_set_owner_w(grown, skb->sk);

		consume_skb(skb);
		skb = grown;
	}

	rcu_read_lock_bh();
	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);

	if (IS_ERR(neigh)) {
		rcu_read_unlock_bh();

		net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
				    __func__);
		kfree_skb(skb);
		return -EINVAL;
	}

	/*
	 * Hand the packet to the neighbour layer.
	 * NOTE(review): dst_neigh_output() is expected to add the link-layer
	 * header and queue the frame for transmission -- confirm.
	 */
	rc = dst_neigh_output(dst, neigh, skb);
	rcu_read_unlock_bh();

	return rc;
}
邻居子系统本身较为庞大,将在后续文章中单独分析;另外,IP分片(ip_fragment)和GSO分段(skb_gso_segment)这两个函数也比较复杂,同样留待后续单独分析。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息