
How exactly does user space handle a dpif_upcall? (1) Building flow_miss structures for batch processing

2014-03-08 10:19
From the preceding analysis, user space now holds the netlink messages passed up from the kernel for packets that could not be matched there. The next step is to handle these upcalls in batches. The processing breaks down as follows:

This module can be divided into three stages: 1) derive the set of flow_miss structures from the dpif_upcalls and fill in their fields; 2) construct the datapath actions; 3) execute each flow_miss_op->dpif_op, communicating with the kernel.

The data structures involved in handling upcalls are:

A flow_miss batches together packets that share the same flow, which can improve performance, so this structure queues up the datapath-interface-related data. Each flow_miss corresponds to one or more packets to be sent, and possibly a flow entry to be installed in the dpif.
/* So far we only batch the operations that affect flow setup time the most.
 * It's possible to batch more than that, but the benefit might be minimal. */
struct flow_miss {
    struct hmap_node hmap_node;
    struct flow flow;                   /* The flow these packets share. */
    enum odp_key_fitness key_fitness;
    const struct nlattr *key;
    size_t key_len;
    ovs_be16 initial_tci;
    struct list packets;                /* All packets with this flow. */
    enum dpif_upcall_type upcall_type;
};
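The packets list is what the batching pays for: whatever is decided for the flow (facet, actions) is computed once and then replayed over every queued packet. A minimal sketch of how such a list is typically walked, using the same LIST_FOR_EACH macro and ofpbuf list_node member that list_push_back() relies on in the main loop below:

    struct ofpbuf *packet;
    LIST_FOR_EACH (packet, list_node, &miss->packets) {
        /* Apply the same verdict (install flow, execute actions) to each
         * packet that was batched under this flow_miss. */
    }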

The enum odp_key_fitness describes how well the flow key that came up from the kernel (a sequence of netlink attributes) matches what user space expects (how that verdict is reached is sketched after odp_flow_key_to_flow() below):
/* These values are arranged so that greater values are "more important" than
 * lesser ones.  In particular, a single flow key can fit the descriptions for
 * both ODP_FIT_TOO_LITTLE and ODP_FIT_TOO_MUCH.  Such a key is treated as
 * ODP_FIT_TOO_LITTLE. */
enum odp_key_fitness {
    ODP_FIT_PERFECT,        /* The key had exactly the fields we expect. */
    ODP_FIT_TOO_MUCH,       /* The key had fields we don't understand. */
    ODP_FIT_TOO_LITTLE,     /* The key lacked fields we expected to see. */
    ODP_FIT_ERROR,          /* The key was invalid. */
};
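The ordering noted in the comment makes two verdicts easy to combine: the larger (more important) value wins. A purely illustrative helper, not part of the OVS tree, that expresses this rule:

    static inline enum odp_key_fitness
    combine_fitness(enum odp_key_fitness a, enum odp_key_fitness b)
    {
        /* ODP_FIT_TOO_LITTLE > ODP_FIT_TOO_MUCH, so a key that is both
         * "too little" and "too much" comes out as ODP_FIT_TOO_LITTLE,
         * exactly as the comment above requires. */
        return a > b ? a : b;
    }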

struct flow_miss_op {
    struct dpif_op dpif_op;     /* The batched operation; its type selects the handler. */
    struct subfacet *subfacet;  /* Subfacet, from which the flow, rule, etc. can be reached. */
    void *garbage;              /* Pointer to pass to free(), NULL if none. */
    uint64_t stub[1024 / 8];    /* Temporary buffer. */
};
----lib/dpif.h
struct dpif_op {
    enum dpif_op_type type;
    int error;
    union {
        struct dpif_flow_put flow_put;
        struct dpif_flow_del flow_del;
        struct dpif_execute execute;
    } u;
};
/* Operation batching interface. */
enum dpif_op_type {
    DPIF_OP_FLOW_PUT = 1,
    DPIF_OP_FLOW_DEL,
    DPIF_OP_EXECUTE,
};
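To make the union above concrete, here is a minimal sketch of batching a flow installation together with a packet execution in a single dpif_operate() call. The variables key, key_len, actions, actions_len, packet, and dpif are placeholders, and the member names follow the structure definitions above; exact field layouts can differ slightly between OVS versions.

    struct dpif_op put_op, exec_op;
    struct dpif_op *ops[2];

    put_op.type = DPIF_OP_FLOW_PUT;
    put_op.u.flow_put.flags = DPIF_FP_CREATE;    /* install a new kernel flow */
    put_op.u.flow_put.key = key;                 /* netlink-encoded flow key */
    put_op.u.flow_put.key_len = key_len;
    put_op.u.flow_put.actions = actions;         /* datapath actions to install */
    put_op.u.flow_put.actions_len = actions_len;
    put_op.u.flow_put.stats = NULL;

    exec_op.type = DPIF_OP_EXECUTE;
    exec_op.u.execute.key = key;
    exec_op.u.execute.key_len = key_len;
    exec_op.u.execute.actions = actions;
    exec_op.u.execute.actions_len = actions_len;
    exec_op.u.execute.packet = packet;           /* the buffered upcall packet */

    ops[0] = &put_op;
    ops[1] = &exec_op;
    dpif_operate(dpif, ops, 2);                  /* one batched trip to the datapath */
    /* On return, each op->error has been set individually. */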

The main framework:
static void
handle_miss_upcalls(struct ofproto_dpif *ofproto, struct dpif_upcall *upcalls,
                    size_t n_upcalls)
{
    struct dpif_upcall *upcall;
    struct flow_miss *miss;
    struct flow_miss misses[FLOW_MISS_MAX_BATCH];   /* FLOW_MISS_MAX_BATCH == 50 */
    struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH * 2];
    struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH * 2];
    struct hmap todo;
    int n_misses;
    size_t n_ops;
    size_t i;

    /* Build a to-do list.  This amounts to extracting the flow from each
     * packet and collecting packets that share the same flow into a single
     * "flow_miss", so they can all be handled together.  Each miss can later
     * produce up to two dpif operations (a flow put and an execute), which is
     * why the op arrays are sized FLOW_MISS_MAX_BATCH * 2. */
    hmap_init(&todo);
    n_misses = 0;
    for (upcall = upcalls; upcall < &upcalls[n_upcalls]; upcall++) {   /* walk the upcalls */
        struct flow_miss *miss = &misses[n_misses];
        struct flow_miss *existing_miss;
        struct flow flow;
        uint32_t hash;

        /* Like odp_flow_key_to_flow(), ofproto_dpif_extract_flow_key()
         * (ofproto/ofproto-dpif.c) converts the OVS_KEY_ATTR_* attributes of
         * the given length in 'key' into a struct flow and returns an
         * ODP_FIT_* value describing how well upcall->key matched our
         * expectations.  On return, 'flow' has been filled in. */
        miss->key_fitness = ofproto_dpif_extract_flow_key(
            ofproto, upcall->key, upcall->key_len, &flow, &miss->initial_tci,
            upcall->packet);
        if (miss->key_fitness == ODP_FIT_ERROR) {
            continue;
        }

        /* Fill in miss->flow from the packet itself; this also sets some of
         * the layer pointers inside 'packet'. */
        flow_extract(upcall->packet, flow.skb_priority, flow.skb_mark,
                     &flow.tunnel, flow.in_port, &miss->flow);

        /* Add the new packet to the to-do list, keyed by a hash of
         * miss->flow. */
        hash = flow_hash(&miss->flow, 0);
        existing_miss = flow_miss_find(&todo, &miss->flow, hash);   /* avoid duplicates */
        if (!existing_miss) {
            hmap_insert(&todo, &miss->hmap_node, hash);
            miss->key = upcall->key;
            miss->key_len = upcall->key_len;
            miss->upcall_type = upcall->type;
            list_init(&miss->packets);
            n_misses++;
        } else {
            miss = existing_miss;
        }
        /* Append this upcall's packet to the flow_miss's packet list. */
        list_push_back(&miss->packets, &upcall->packet->list_node);
    }
    /* --------------------- end of stage 1 -------------------------- */

    /* Handle the packets in the to-do list: depending on whether the flow
     * already has a facet, handle_flow_miss() dispatches to
     * handle_flow_miss_without_facet() or handle_flow_miss_with_facet(). */
    n_ops = 0;
    HMAP_FOR_EACH (miss, hmap_node, &todo) {
        handle_flow_miss(ofproto, miss, flow_miss_ops, &n_ops);
    }
    assert(n_ops <= ARRAY_SIZE(flow_miss_ops));
    /* --------------------- end of stage 2 -------------------------- */

    /* Execute batch. */
    for (i = 0; i < n_ops; i++) {
        dpif_ops[i] = &flow_miss_ops[i].dpif_op;
    }
    dpif_operate(ofproto->dpif, dpif_ops, n_ops);
    /* For the Linux datapath this ends up in dpif_linux_operate(), which
     * talks to the kernel over a netlink socket. */

    /* Free memory and update facets. */
    for (i = 0; i < n_ops; i++) {
        struct flow_miss_op *op = &flow_miss_ops[i];

        switch (op->dpif_op.type) {
        case DPIF_OP_EXECUTE:
            break;

        case DPIF_OP_FLOW_PUT:
            if (!op->dpif_op.error) {
                op->subfacet->path = subfacet_want_path(op->subfacet->slow);
            }
            break;

        case DPIF_OP_FLOW_DEL:
            NOT_REACHED();
        }

        free(op->garbage);
    }
    hmap_destroy(&todo);
}

-------ofproto/ofproto-dpif.c

Like odp_flow_key_to_flow(), this function parses the netlink attributes of length key_len in 'key' into a struct flow:
static enum odp_key_fitness
ofproto_dpif_extract_flow_key(const struct ofproto_dpif *ofproto,
                              const struct nlattr *key, size_t key_len,
                              struct flow *flow, ovs_be16 *initial_tci,
                              struct ofpbuf *packet)
{
    enum odp_key_fitness fitness;

    fitness = odp_flow_key_to_flow(key, key_len, flow);
    if (fitness == ODP_FIT_ERROR) {
        return fitness;
    }
    *initial_tci = flow->vlan_tci;

    /* (ofproto/ofproto-dpif.c) At this point 'flow' represents a packet
     * received on 'ofproto'.  vsp_adjust_flow() checks whether flow->in_port
     * is a Linux VLAN device; if so, it rewrites in_port to the underlying
     * device, sets vlan_tci to the VLAN VID, and returns true.  In the common
     * case VLAN splinters are not configured, so nothing is changed and it
     * returns false. */
    if (vsp_adjust_flow(ofproto, flow)) {
        if (packet) {
            /* Make the packet resemble the flow, so that it gets sent to an
             * OpenFlow controller properly, so that it looks correct for
             * sFlow, and so that flow_extract() will get the correct vlan_tci
             * if it is called on 'packet'.
             *
             * The allocated space inside 'packet' probably also contains
             * 'key', that is, both 'packet' and 'key' are probably part of a
             * struct dpif_upcall (see the large comment on that structure
             * definition), so pushing data on 'packet' is in general not a
             * good idea since it could overwrite 'key' or free it as a side
             * effect.  However, it's OK in this special case because we know
             * that 'packet' is inside a Netlink attribute: pushing 4 bytes
             * will just overwrite the 4-byte "struct nlattr", which is fine
             * since we don't need that header anymore. */
            eth_push_vlan(packet, flow->vlan_tci);
        }

        /* Let the caller know that we can't reproduce 'key' from 'flow':
         * the flow was adjusted away from what the kernel reported, so even
         * a previously perfect key is now treated as "too much". */
        if (fitness == ODP_FIT_PERFECT) {
            fitness = ODP_FIT_TOO_MUCH;
        }
    }

    return fitness;
}

-------------lib/odp-util.c
This function parses the netlink attributes of length key_len in 'key' into a struct flow. It does not take the packet as an argument, because none of the OVS_KEY_ATTR_* attributes understood so far need packet data: the remaining attributes can always be inferred from the lower-layer protocol attributes. For example, if the protocol in OVS_KEY_ATTR_IPV4 or OVS_KEY_ATTR_IPV6 is IPPROTO_TCP, then an OVS_KEY_ATTR_TCP attribute will be present.
enum odp_key_fitness
odp_flow_key_to_flow(const struct nlattr *key, size_t key_len,
                     struct flow *flow)
{
    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
    const struct nlattr *attrs[OVS_KEY_ATTR_MAX + 1];
    uint64_t expected_attrs;
    uint64_t present_attrs;
    int out_of_range_attr;

    memset(flow, 0, sizeof *flow);

    /* Parse attributes. */
    if (!parse_flow_nlattrs(key, key_len, attrs, &present_attrs,
                            &out_of_range_attr)) {
        return ODP_FIT_ERROR;
    }
    expected_attrs = 0;

    /* Metadata. */
    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_PRIORITY)) {
        flow->skb_priority = nl_attr_get_u32(attrs[OVS_KEY_ATTR_PRIORITY]);
        expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_PRIORITY;
    }
    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_SKB_MARK)) {
        flow->skb_mark = nl_attr_get_u32(attrs[OVS_KEY_ATTR_SKB_MARK]);
        expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_SKB_MARK;
    }
    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_TUN_ID)) {
        flow->tunnel.tun_id = nl_attr_get_be64(attrs[OVS_KEY_ATTR_TUN_ID]);
        expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_TUN_ID;
    }
    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_IN_PORT)) {
        uint32_t in_port = nl_attr_get_u32(attrs[OVS_KEY_ATTR_IN_PORT]);
        if (in_port >= UINT16_MAX || in_port >= OFPP_MAX) {
            VLOG_ERR_RL(&rl, "in_port %"PRIu32" out of supported range",
                        in_port);
            return ODP_FIT_ERROR;
        }
        flow->in_port = odp_port_to_ofp_port(in_port);
        expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_IN_PORT;
    } else {
        flow->in_port = OFPP_NONE;
    }

    /* Ethernet header. */
    if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_ETHERNET)) {
        const struct ovs_key_ethernet *eth_key;

        eth_key = nl_attr_get(attrs[OVS_KEY_ATTR_ETHERNET]);
        memcpy(flow->dl_src, eth_key->eth_src, ETH_ADDR_LEN);
        memcpy(flow->dl_dst, eth_key->eth_dst, ETH_ADDR_LEN);
    }
    expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_ETHERNET;

    /* Get Ethertype or 802.1Q TPID or FLOW_DL_TYPE_NONE. */
    if (!parse_ethertype(attrs, present_attrs, &expected_attrs, flow)) {
        return ODP_FIT_ERROR;
    }

    if (flow->dl_type == htons(ETH_TYPE_VLAN)) {
        return parse_8021q_onward(attrs, present_attrs, out_of_range_attr,
                                  expected_attrs, flow, key, key_len);
    }
    return parse_l3_onward(attrs, present_attrs, out_of_range_attr,
                           expected_attrs, flow, key, key_len);
}
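This is also where the earlier question of how fitness is judged gets its answer: the function records which attributes were actually present (present_attrs) and which ones it expected given what it parsed (expected_attrs), and the parse_*_onward() helpers ultimately compare the two bitmaps. A simplified sketch of that comparison (the real check in lib/odp-util.c also logs the offending attributes):

    static enum odp_key_fitness
    fitness_from_attrs(uint64_t present_attrs, uint64_t expected_attrs,
                       int out_of_range_attr)
    {
        uint64_t missing_attrs = expected_attrs & ~present_attrs;
        uint64_t extra_attrs = present_attrs & ~expected_attrs;

        if (missing_attrs) {
            return ODP_FIT_TOO_LITTLE;  /* the key lacked fields we expected */
        } else if (extra_attrs || out_of_range_attr) {
            return ODP_FIT_TOO_MUCH;    /* the key had fields we don't understand */
        } else {
            return ODP_FIT_PERFECT;
        }
    }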

------lib/flow.h
struct flow {
    struct flow_tnl tunnel;     /* Encapsulating tunnel parameters. */
    ovs_be64 metadata;          /* OpenFlow Metadata. */
    struct in6_addr ipv6_src;   /* IPv6 source address. */
    struct in6_addr ipv6_dst;   /* IPv6 destination address. */
    struct in6_addr nd_target;  /* IPv6 neighbor discovery (ND) target. */
    uint32_t skb_priority;      /* Packet priority for QoS. */
    uint32_t regs[FLOW_N_REGS]; /* Registers. */
    ovs_be32 nw_src;            /* IPv4 source address. */
    ovs_be32 nw_dst;            /* IPv4 destination address. */
    ovs_be32 ipv6_label;        /* IPv6 flow label. */
    uint16_t in_port;           /* OpenFlow port number of input port. */
    uint32_t skb_mark;          /* Packet mark. */
    ovs_be16 vlan_tci;          /* If 802.1Q, TCI | VLAN_CFI; otherwise 0. */
    ovs_be16 dl_type;           /* Ethernet frame type. */
    ovs_be16 tp_src;            /* TCP/UDP source port. */
    ovs_be16 tp_dst;            /* TCP/UDP destination port. */
    uint8_t dl_src[6];          /* Ethernet source address. */
    uint8_t dl_dst[6];          /* Ethernet destination address. */
    uint8_t nw_proto;           /* IP protocol or low 8 bits of ARP opcode. */
    uint8_t nw_tos;             /* IP ToS (including DSCP and ECN). */
    uint8_t arp_sha[6];         /* ARP/ND source hardware address. */
    uint8_t arp_tha[6];         /* ARP/ND target hardware address. */
    uint8_t nw_ttl;             /* IP TTL/Hop Limit. */
    uint8_t nw_frag;            /* FLOW_FRAG_* flags. */
    uint8_t zeros[2];           /* Must be zero. */
};
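The trailing zeros field hints at how struct flow is used: the structure is laid out so that it can be hashed and compared as a flat block of memory, which is what flow_hash() relies on when handle_miss_upcalls() builds its to-do hmap above. A sketch of the idea, assuming flow_hash() in lib/flow.h is essentially hash_words() (lib/hash.h) over the raw struct:

    static inline uint32_t
    flow_hash_sketch(const struct flow *flow, uint32_t basis)
    {
        /* Any padding must be zeroed (hence "zeros[2] must be zero"),
         * otherwise two equal flows could hash to different values. */
        return hash_words((const uint32_t *) flow,
                          sizeof *flow / sizeof(uint32_t), basis);
    }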

----------lib/flow.c
flow_extract() (lib/flow.c) fills in the fields of 'flow' from the packet data plus the caller-supplied 'skb_priority', 'tnl', and 'ofp_in_port' (important). As it descends through the headers it also makes the layer pointers inside 'packet' valid: l2 points at the Ethernet header, l3 at the payload of the Ethernet frame (after the VLAN header if there is one), l4 at the IPv4 payload, and l7 at the payload after the TCP, UDP, or ICMP header. As for the question of how the flow obtained from the netlink attributes relates to the flow parsed from upcall->packet: in handle_miss_upcalls() the key-derived flow only supplies the metadata (skb_priority, skb_mark, tunnel, in_port), while the rest of miss->flow is re-extracted here from the packet bytes themselves.

void
flow_extract(struct ofpbuf *packet, uint32_t skb_priority, uint32_t skb_mark,
             const struct flow_tnl *tnl, uint16_t ofp_in_port,
             struct flow *flow)
{
    struct ofpbuf b = *packet;
    struct eth_header *eth;

    COVERAGE_INC(flow_extract);

    memset(flow, 0, sizeof *flow);

    if (tnl) {
        assert(tnl != &flow->tunnel);
        flow->tunnel = *tnl;
    }
    flow->in_port = ofp_in_port;
    flow->skb_priority = skb_priority;
    flow->skb_mark = skb_mark;

    packet->l2 = b.data;
    packet->l3 = NULL;
    packet->l4 = NULL;
    packet->l7 = NULL;

    if (b.size < sizeof *eth) {
        return;
    }

    /* Link layer. */
    eth = b.data;
    memcpy(flow->dl_src, eth->eth_src, ETH_ADDR_LEN);
    memcpy(flow->dl_dst, eth->eth_dst, ETH_ADDR_LEN);

    /* dl_type, vlan_tci. */
    ofpbuf_pull(&b, ETH_ADDR_LEN * 2);
    if (eth->eth_type == htons(ETH_TYPE_VLAN)) {
        parse_vlan(&b, flow);
    }
    flow->dl_type = parse_ethertype(&b);

    /* Network layer. */
    packet->l3 = b.data;
    if (flow->dl_type == htons(ETH_TYPE_IP)) {
        const struct ip_header *nh = pull_ip(&b);
        if (nh) {
            packet->l4 = b.data;

            flow->nw_src = get_unaligned_be32(&nh->ip_src);
            flow->nw_dst = get_unaligned_be32(&nh->ip_dst);
            flow->nw_proto = nh->ip_proto;
            flow->nw_tos = nh->ip_tos;

            if (IP_IS_FRAGMENT(nh->ip_frag_off)) {
                flow->nw_frag = FLOW_NW_FRAG_ANY;
                if (nh->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
                    flow->nw_frag |= FLOW_NW_FRAG_LATER;
                }
            }
            flow->nw_ttl = nh->ip_ttl;

            if (!(nh->ip_frag_off & htons(IP_FRAG_OFF_MASK))) {
                if (flow->nw_proto == IPPROTO_TCP) {
                    parse_tcp(packet, &b, flow);
                } else if (flow->nw_proto == IPPROTO_UDP) {
                    parse_udp(packet, &b, flow);
                } else if (flow->nw_proto == IPPROTO_ICMP) {
                    const struct icmp_header *icmp = pull_icmp(&b);
                    if (icmp) {
                        flow->tp_src = htons(icmp->icmp_type);
                        flow->tp_dst = htons(icmp->icmp_code);
                        packet->l7 = b.data;
                    }
                }
            }
        }
    } else if (flow->dl_type == htons(ETH_TYPE_IPV6)) {
        if (parse_ipv6(&b, flow)) {
            return;
        }

        packet->l4 = b.data;
        if (flow->nw_proto == IPPROTO_TCP) {
            parse_tcp(packet, &b, flow);
        } else if (flow->nw_proto == IPPROTO_UDP) {
            parse_udp(packet, &b, flow);
        } else if (flow->nw_proto == IPPROTO_ICMPV6) {
            if (parse_icmpv6(&b, flow)) {
                packet->l7 = b.data;
            }
        }
    } else if (flow->dl_type == htons(ETH_TYPE_ARP) ||
               flow->dl_type == htons(ETH_TYPE_RARP)) {
        const struct arp_eth_header *arp = pull_arp(&b);
        if (arp && arp->ar_hrd == htons(1)
            && arp->ar_pro == htons(ETH_TYPE_IP)
            && arp->ar_hln == ETH_ADDR_LEN
            && arp->ar_pln == 4) {
            /* We only match on the lower 8 bits of the opcode. */
            if (ntohs(arp->ar_op) <= 0xff) {
                flow->nw_proto = ntohs(arp->ar_op);
            }

            flow->nw_src = arp->ar_spa;
            flow->nw_dst = arp->ar_tpa;
            memcpy(flow->arp_sha, arp->ar_sha, ETH_ADDR_LEN);
            memcpy(flow->arp_tha, arp->ar_tha, ETH_ADDR_LEN);
        }
    }
}
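A short usage sketch of the layer pointers flow_extract() sets up; 'packet' and 'ofp_in_port' are placeholder locals here. Once the call returns, packet->l4 points at the transport header for IP traffic, so later stages can reach it without re-parsing:

    struct flow flow;

    /* No tunnel metadata in this example; priority and mark left at 0. */
    flow_extract(packet, 0, 0, NULL, ofp_in_port, &flow);

    if (packet->l4 && flow.nw_proto == IPPROTO_TCP) {
        const struct tcp_header *tcp = packet->l4;   /* l4 = IPv4/IPv6 payload */
        /* flow.tp_src and flow.tp_dst already hold the TCP ports in network
         * byte order; packet->l7 points past the TCP header. */
        (void) tcp;
    }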