内核态Datapath
datapath模块是ovs的核心,运行在内核态,ovs-vswitchd 运行在用户态,两者通过netlink 通信。
netlink 是一种灵活和强大的进程间通信机制(socket),可以沟通用户态和内核态。根据不同socket注册五种类型的family,包括datapath、vport、flow 、packet、meter。
datapath初始化
// 1. dp_init() calls dp_register_genl() to register the five genl family
//    types (datapath, vport, flow, packet, meter) and their operations.
static int __init dp_init(void)
{
int err;
BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof_field(struct sk_buff, cb));
pr_info("Open vSwitch switching datapath %s\n", VERSION);
ovs_nsh_init();
err = action_fifos_init();
if (err)
goto error;
err = ovs_internal_dev_rtnl_link_register();
if (err)
goto error_action_fifos_exit;
err = ovs_flow_init();
if (err)
goto error_unreg_rtnl_link;
err = ovs_vport_init();
if (err)
goto error_flow_exit;
err = register_pernet_device(&ovs_net_ops);
if (err)
goto error_vport_exit;
err = compat_init();
if (err)
goto error_netns_exit;
err = register_netdevice_notifier(&ovs_dp_device_notifier);
if (err)
goto error_compat_exit;
err = ovs_netdev_init();
if (err)
goto error_unreg_notifier;
// Register the datapath genl families (last initialization step).
err = dp_register_genl();
if (err < 0)
goto error_unreg_netdev;
return 0;
// Unwind labels: tear everything down in reverse order of initialization.
error_unreg_netdev:
ovs_netdev_exit();
error_unreg_notifier:
unregister_netdevice_notifier(&ovs_dp_device_notifier);
error_compat_exit:
compat_exit();
error_netns_exit:
unregister_pernet_device(&ovs_net_ops);
error_vport_exit:
ovs_vport_exit();
error_flow_exit:
ovs_flow_exit();
error_unreg_rtnl_link:
ovs_internal_dev_rtnl_link_unregister();
error_action_fifos_exit:
action_fifos_exit();
error:
ovs_nsh_cleanup();
return err;
}
// 2. Register every genl family listed in dp_genl_families with the kernel.
static int __init dp_register_genl(void)
{
int err;
int i;
for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
// Register one family; on failure, the i already-registered
// families are unregistered below.
err = genl_register_family(dp_genl_families[i]);
if (err)
goto error;
}
return 0;
error:
dp_unregister_genl(i);
return err;
}
// 3. The family types: datapath, vport, flow, packet, meter
//    (plus ct_limit when conntrack connection counting is enabled).
static struct genl_family *dp_genl_families[] = {
&dp_datapath_genl_family,
&dp_vport_genl_family,
&dp_flow_genl_family,
&dp_packet_genl_family,
&dp_meter_genl_family,
#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
&dp_ct_limit_genl_family,
#endif
};
创建网桥br-int: ovs-vsctl add-br br-int
// 1. A bridge (datapath) belongs to dp_datapath_genl_family.
static struct genl_family dp_datapath_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_DATAPATH_FAMILY,
.version = OVS_DATAPATH_VERSION,
.maxattr = OVS_DP_ATTR_MAX,
#ifndef HAVE_GENL_OPS_POLICY
.policy = datapath_policy,
#endif
.netnsok = true,
.parallel_ops = true,
.ops = dp_datapath_genl_ops,
.n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
.mcgrps = &ovs_dp_datapath_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
};
// 2. Datapath operations: new, del, get, set.
static const struct genl_ops dp_datapath_genl_ops[] = {
{ .cmd = OVS_DP_CMD_NEW,
#ifdef HAVE_GENL_VALIDATE_FLAGS
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
.policy = datapath_policy,
#endif
.doit = ovs_dp_cmd_new
},
{ .cmd = OVS_DP_CMD_DEL,
#ifdef HAVE_GENL_VALIDATE_FLAGS
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
.policy = datapath_policy,
#endif
.doit = ovs_dp_cmd_del
},
{ .cmd = OVS_DP_CMD_GET,
#ifdef HAVE_GENL_VALIDATE_FLAGS
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
.flags = 0, /* OK for unprivileged users. */
#ifdef HAVE_GENL_OPS_POLICY
.policy = datapath_policy,
#endif
.doit = ovs_dp_cmd_get,
.dumpit = ovs_dp_cmd_dump
},
{ .cmd = OVS_DP_CMD_SET,
#ifdef HAVE_GENL_VALIDATE_FLAGS
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
.policy = datapath_policy,
#endif
.doit = ovs_dp_cmd_set,
},
};
// 3. Creating a bridge also creates an internal port with the same name.
static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
......
/* Set up our datapath device. */
parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
parms.type = OVS_VPORT_TYPE_INTERNAL;
parms.options = NULL;
parms.dp = dp;
parms.port_no = OVSP_LOCAL;
parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID];
vport = new_vport(&parms);
......
}
// 4. Allocate the vport and hash it into the datapath's port table.
static struct vport *new_vport(const struct vport_parms *parms)
{
struct vport *vport;
vport = ovs_vport_add(parms);
if (!IS_ERR(vport)) {
struct datapath *dp = parms->dp;
struct hlist_head *head = vport_hash_bucket(dp, vport->port_no);
hlist_add_head_rcu(&vport->dp_hash_node, head);
}
return vport;
}
// 5. Determine the type of port to create: look up the matching vport_ops.
struct vport *ovs_vport_add(const struct vport_parms *parms)
{
struct vport_ops *ops;
struct vport *vport;
ops = ovs_vport_lookup(parms);
......
}
// Ops for OVS_VPORT_TYPE_INTERNAL ports (the bridge's own device).
static struct vport_ops ovs_internal_vport_ops = {
.type = OVS_VPORT_TYPE_INTERNAL,
.create = internal_dev_create,
.destroy = internal_dev_destroy,
.send = internal_dev_recv,
};
// 6. netdev represents the underlying device implementation per platform.
//    (Excerpt: the error labels and final return are elided.)
static struct vport *internal_dev_create(const struct vport_parms *parms)
{
struct vport *vport;
struct internal_dev *internal_dev;
int err;
vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms);
if (IS_ERR(vport)) {
err = PTR_ERR(vport);
goto error;
}
// Allocate the backing net_device.
vport->dev = alloc_netdev(sizeof(struct internal_dev),
parms->name, NET_NAME_USER, do_setup);
rtnl_lock();
// Register the vport<->netdev mapping with the kernel, which manages it.
err = register_netdevice(vport->dev);
if (err)
goto error_unlock;
rtnl_unlock();
}
在br-int网桥上创建port:ovs-vsctl add-port xxx
// 1. Port operations enter through the vport family.
struct genl_family dp_vport_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_VPORT_FAMILY,
.version = OVS_VPORT_VERSION,
.maxattr = OVS_VPORT_ATTR_MAX,
#ifndef HAVE_GENL_OPS_POLICY
.policy = vport_policy,
#endif
.netnsok = true,
.parallel_ops = true,
.ops = dp_vport_genl_ops,
.n_ops = ARRAY_SIZE(dp_vport_genl_ops),
.mcgrps = &ovs_dp_vport_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
};
// 2. Vport family operations: new, del, get, set.
static const struct genl_ops dp_vport_genl_ops[] = {
{ .cmd = OVS_VPORT_CMD_NEW,
#ifdef HAVE_GENL_VALIDATE_FLAGS
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
.policy = vport_policy,
#endif
.doit = ovs_vport_cmd_new
},
{ .cmd = OVS_VPORT_CMD_DEL,
#ifdef HAVE_GENL_VALIDATE_FLAGS
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
.policy = vport_policy,
#endif
.doit = ovs_vport_cmd_del
},
{ .cmd = OVS_VPORT_CMD_GET,
#ifdef HAVE_GENL_VALIDATE_FLAGS
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
.flags = 0, /* OK for unprivileged users. */
#ifdef HAVE_GENL_OPS_POLICY
.policy = vport_policy,
#endif
.doit = ovs_vport_cmd_get,
.dumpit = ovs_vport_cmd_dump
},
{ .cmd = OVS_VPORT_CMD_SET,
#ifdef HAVE_GENL_VALIDATE_FLAGS
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
.policy = vport_policy,
#endif
.doit = ovs_vport_cmd_set,
},
};
// 3. Parse the netlink attributes and create the vport.
static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
......
parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
parms.options = a[OVS_VPORT_ATTR_OPTIONS];
parms.dp = dp;
parms.port_no = port_no;
parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID];
vport = new_vport(&parms);
err = PTR_ERR(vport);
if (IS_ERR(vport)) {
if (err == -EAGAIN)
goto restart;
goto exit_unlock_free;
}
......
}
// 4. Create the port: allocate the vport and hash it into the datapath's
//    port table (same helper as in the bridge-creation path above).
static struct vport *new_vport(const struct vport_parms *parms)
{
struct vport *vport;
vport = ovs_vport_add(parms);
if (!IS_ERR(vport)) {
struct datapath *dp = parms->dp;
struct hlist_head *head = vport_hash_bucket(dp, vport->port_no);
hlist_add_head_rcu(&vport->dp_hash_node, head);
}
return vport;
}
// 5. Determine the type of port to create: look up the matching vport_ops.
struct vport *ovs_vport_add(const struct vport_parms *parms)
{
struct vport_ops *ops;
struct vport *vport;
ops = ovs_vport_lookup(parms);
......
}
// Ops for OVS_VPORT_TYPE_NETDEV ports; transmit goes via dev_queue_xmit.
static struct vport_ops ovs_netdev_vport_ops = {
.type = OVS_VPORT_TYPE_NETDEV,
.create = netdev_create,
.destroy = netdev_destroy,
.send = dev_queue_xmit,
};
// 6. Create a netdev-backed vport.
static struct vport *netdev_create(const struct vport_parms *parms)
{
struct vport *vport;
// Allocate the vport.
vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms);
if (IS_ERR(vport))
return vport;
// Link it to the existing net_device and register with the kernel.
return ovs_netdev_link(vport, parms->name);
}
// 7. netdev_rx_handler_register() installs a hook in the kernel: packets
//    received on this vport are handed to netdev_frame_hook().
struct vport *ovs_netdev_link(struct vport *vport, const char *name)
{
......
err = netdev_rx_handler_register(vport->dev, netdev_frame_hook,
vport);
if (err)
goto error_master_upper_dev_unlink;
......
}
在br-int网桥内核添加flow:ovs-dpctl add-flow xxx
// 1. Flow installation goes through dp_flow_genl_family.
static struct genl_family dp_flow_genl_family __ro_after_init = {
.hdrsize = sizeof(struct ovs_header),
.name = OVS_FLOW_FAMILY,
.version = OVS_FLOW_VERSION,
.maxattr = OVS_FLOW_ATTR_MAX,
#ifndef HAVE_GENL_OPS_POLICY
.policy = flow_policy,
#endif
.netnsok = true,
.parallel_ops = true,
.ops = dp_flow_genl_ops,
.n_ops = ARRAY_SIZE(dp_flow_genl_ops),
.mcgrps = &ovs_dp_flow_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,
};
// 2. Flow family operations: new, del, get, set.
static const struct genl_ops dp_flow_genl_ops[] = {
{ .cmd = OVS_FLOW_CMD_NEW,
#ifdef HAVE_GENL_VALIDATE_FLAGS
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
.policy = flow_policy,
#endif
.doit = ovs_flow_cmd_new
},
{ .cmd = OVS_FLOW_CMD_DEL,
#ifdef HAVE_GENL_VALIDATE_FLAGS
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
.policy = flow_policy,
#endif
.doit = ovs_flow_cmd_del
},
{ .cmd = OVS_FLOW_CMD_GET,
#ifdef HAVE_GENL_VALIDATE_FLAGS
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
.flags = 0, /* OK for unprivileged users. */
#ifdef HAVE_GENL_OPS_POLICY
.policy = flow_policy,
#endif
.doit = ovs_flow_cmd_get,
.dumpit = ovs_flow_cmd_dump
},
{ .cmd = OVS_FLOW_CMD_SET,
#ifdef HAVE_GENL_VALIDATE_FLAGS
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
.flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
.policy = flow_policy,
#endif
.doit = ovs_flow_cmd_set,
},
};
// 3. add-flow: AND the match key with the mask, then (1) insert the new
//    flow keyed by the masked key, (2) hash the masked key to pick the
//    bucket it goes into.
static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
......
/* Extract key: build the match, tying the key and mask together. */
ovs_match_init(&match, &new_flow->key, false, &mask);
error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
a[OVS_FLOW_ATTR_MASK], log);
if (error)
goto err_kfree_flow;
// Store key AND mask into new_flow->key (the masked key).
ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask);
/* Check if this is a duplicate flow */
if (ovs_identifier_is_ufid(&new_flow->id))
flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
if (!flow)
flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key);
if (likely(!flow)) {
rcu_assign_pointer(new_flow->sf_acts, acts);
/* Put flow in bucket (insert into the flow table). */
error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
if (unlikely(error)) {
acts = NULL;
goto err_unlock_ovs;
}
......
}
// 4. Insert the mask, the masked flow key, and (if present) the UFID.
int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
const struct sw_flow_mask *mask)
{
int err;
err = flow_mask_insert(table, flow, mask);
if (err)
return err;
flow_key_insert(table, flow);
if (ovs_identifier_is_ufid(&flow->id))
flow_ufid_insert(table, flow);
return 0;
}
在br-int网桥添加flow:ovs-dpctl add-flow xxx
// 1. ovs-ofctl command table: "add-flow" dispatches to ofctl_add_flow().
//    (Fix: the original excerpt was missing the terminating ';' after the
//    array initializer, which is a syntax error in C.)
static const struct ovs_cmdl_command all_commands[] = {
{ "show", "switch",
1, 1, ofctl_show, OVS_RO },
{ "dump-flows", "switch",
1, 2, ofctl_dump_flows, OVS_RO },
{ "add-flow", "switch flow",
2, 2, ofctl_add_flow, OVS_RW },
{ "add-flows", "switch file",
2, 2, ofctl_add_flows, OVS_RW },
{ "mod-flows", "switch flow",
2, 2, ofctl_mod_flows, OVS_RW },
{ "del-flows", "switch [flow]",
1, 2, ofctl_del_flows, OVS_RW },
{ "replace-flows", "switch file",
2, 2, ofctl_replace_flows, OVS_RW },
{ "diff-flows", "source1 source2",
2, 2, ofctl_diff_flows, OVS_RW },
};
// 2. add-flow handler: delegates to the generic flow_mod path with OFPFC_ADD.
static void
ofctl_add_flow(struct ovs_cmdl_context *ctx)
{
ofctl_flow_mod(ctx->argc, ctx->argv, OFPFC_ADD);
}
// 3. Parse the flow spec (match + actions), then send the flow_mod.
static void
ofctl_flow_mod(int argc, char *argv[], uint16_t command)
{
if (argc > 2 && !strcmp(argv[2], "-")) {
ofctl_flow_mod_file(argc, argv, command);
} else {
struct ofputil_flow_mod fm;
char *error;
enum ofputil_protocol usable_protocols;
// 3.1 Parse the textual flow description into fm.
error = parse_ofp_flow_mod_str(&fm, argc > 2 ? argv[2] : "",
ports_to_accept(argv[1]),
tables_to_accept(argv[1]), command,
&usable_protocols);
if (error) {
ovs_fatal(0, "%s", error);
}
// 3.2 Encode the flow and send it to the switch.
ofctl_flow_mod__(argv[1], &fm, 1, usable_protocols);
}
}
// 3.1 Parse the match portion of a flow string.
char * OVS_WARN_UNUSED_RESULT
parse_ofp_flow_mod_str(struct ofputil_flow_mod *fm, const char *string,
const struct ofputil_port_map *port_map,
const struct ofputil_table_map *table_map,
int command,
enum ofputil_protocol *usable_protocols)
{
char *error = parse_ofp_str(fm, command, string, port_map, table_map,
usable_protocols);
......
}
// NOTE(review): the return type (char * OVS_WARN_UNUSED_RESULT) appears to
// have been dropped when this excerpt was copied — confirm against upstream.
parse_ofp_str(struct ofputil_flow_mod *fm, int command, const char *str_,
const struct ofputil_port_map *port_map,
const struct ofputil_table_map *table_map,
enum ofputil_protocol *usable_protocols)
{
// Work on a private copy because the parser mutates the string.
char *string = xstrdup(str_);
char *error;
error = parse_ofp_str__(fm, command, string, port_map, table_map,
usable_protocols);
......
}
static char * OVS_WARN_UNUSED_RESULT
parse_ofp_str__(struct ofputil_flow_mod *fm, int command, char *string,
const struct ofputil_port_map *port_map,
const struct ofputil_table_map *table_map,
enum ofputil_protocol *usable_protocols)
{
......
// Extract the action list into act_str.
if (fields & F_ACTIONS) {
act_str = ofp_extract_actions(string);
if (!act_str) {
return xstrdup("must specify an action");
}
}
// Parse the match key/value pairs.
struct match match = MATCH_CATCHALL_INITIALIZER;
while (ofputil_parse_key_value(&string, &name, &value)) {
const struct ofp_protocol *p;
const struct mf_field *mf;
char *error = NULL;
if (ofp_parse_protocol(name, &p)) {
match_set_dl_type(&match, htons(p->dl_type));
if (p->nw_proto) {
match_set_nw_proto(&match, p->nw_proto);
}
match_set_default_packet_type(&match);
} else if (!strcmp(name, "eth")) {
match_set_packet_type(&match, htonl(PT_ETH));
......
// Parse the actions.
ofpbuf_init(&ofpacts, 32);
struct ofpact_parse_params pp = {
.port_map = port_map,
.table_map = table_map,
.ofpacts = &ofpacts,
.usable_protocols = &action_usable_protocols
};
error = ofpacts_parse_instructions(act_str, &pp);
......
}
// Entry point for parsing an instruction/action list.
char * OVS_WARN_UNUSED_RESULT
ofpacts_parse_instructions(const char *s, const struct ofpact_parse_params *pp)
{
return ofpacts_parse_copy(s, pp, true, 0);
}
// Parse from a private copy so the caller's string is left untouched.
static char * OVS_WARN_UNUSED_RESULT
ofpacts_parse_copy(const char *s_, const struct ofpact_parse_params *pp,
bool allow_instructions, enum ofpact_type outer_action)
{
char *error, *s;
*pp->usable_protocols = OFPUTIL_P_ANY;
s = xstrdup(s_);
error = ofpacts_parse(s, pp, allow_instructions, outer_action);
free(s);
return error;
}
// Depth-limited wrapper: guards against deeply nested actions and rolls
// back any partially-emitted ofpacts on error.
static char * OVS_WARN_UNUSED_RESULT
ofpacts_parse(char *str, const struct ofpact_parse_params *pp,
bool allow_instructions, enum ofpact_type outer_action)
{
if (pp->depth >= MAX_OFPACT_PARSE_DEPTH) {
return xstrdup("Action nested too deeply");
}
CONST_CAST(struct ofpact_parse_params *, pp)->depth++;
uint32_t orig_size = pp->ofpacts->size;
char *error = ofpacts_parse__(str, pp, allow_instructions, outer_action);
if (error) {
pp->ofpacts->size = orig_size;
}
CONST_CAST(struct ofpact_parse_params *, pp)->depth--;
return error;
}
static char * OVS_WARN_UNUSED_RESULT
ofpacts_parse__(char *str, const struct ofpact_parse_params *pp,
bool allow_instructions, enum ofpact_type outer_action)
{
uint32_t orig_size = pp->ofpacts->size;
char *key, *value;
bool drop = false;
char *pos;
pos = str;
while (ofputil_parse_key_value(&pos, &key, &value)) {
enum ofpact_type type;
char *error = NULL;
ofp_port_t port;
if (ofpact_type_from_name(key, &type)) {
// Parse the arguments for this action type.
error = ofpact_parse(type, value, pp);
}
......
}
// The action types that ofpact_parse() can match, each with its struct,
// variable-length member, and textual name.
#define OFPACTS \
/* Output. */ \
OFPACT(OUTPUT, ofpact_output, ofpact, "output") \
OFPACT(GROUP, ofpact_group, ofpact, "group") \
OFPACT(CONTROLLER, ofpact_controller, userdata, "controller") \
OFPACT(ENQUEUE, ofpact_enqueue, ofpact, "enqueue") \
OFPACT(OUTPUT_REG, ofpact_output_reg, ofpact, "output_reg") \
OFPACT(BUNDLE, ofpact_bundle, members, "bundle") \
\
/* Header changes. */ \
OFPACT(SET_FIELD, ofpact_set_field, ofpact, "set_field") \
OFPACT(SET_VLAN_VID, ofpact_vlan_vid, ofpact, "set_vlan_vid") \
OFPACT(SET_VLAN_PCP, ofpact_vlan_pcp, ofpact, "set_vlan_pcp") \
OFPACT(STRIP_VLAN, ofpact_null, ofpact, "strip_vlan") \
OFPACT(PUSH_VLAN, ofpact_push_vlan, ofpact, "push_vlan") \
OFPACT(SET_ETH_SRC, ofpact_mac, ofpact, "mod_dl_src") \
OFPACT(SET_ETH_DST, ofpact_mac, ofpact, "mod_dl_dst") \
OFPACT(SET_IPV4_SRC, ofpact_ipv4, ofpact, "mod_nw_src") \
OFPACT(SET_IPV4_DST, ofpact_ipv4, ofpact, "mod_nw_dst") \
OFPACT(SET_IP_DSCP, ofpact_dscp, ofpact, "mod_nw_tos") \
OFPACT(SET_IP_ECN, ofpact_ecn, ofpact, "mod_nw_ecn") \
OFPACT(SET_IP_TTL, ofpact_ip_ttl, ofpact, "mod_nw_ttl") \
OFPACT(SET_L4_SRC_PORT, ofpact_l4_port, ofpact, "mod_tp_src") \
OFPACT(SET_L4_DST_PORT, ofpact_l4_port, ofpact, "mod_tp_dst") \
OFPACT(REG_MOVE, ofpact_reg_move, ofpact, "move") \
OFPACT(STACK_PUSH, ofpact_stack, ofpact, "push") \
OFPACT(STACK_POP, ofpact_stack, ofpact, "pop") \
OFPACT(DEC_TTL, ofpact_cnt_ids, cnt_ids, "dec_ttl") \
OFPACT(SET_MPLS_LABEL, ofpact_mpls_label, ofpact, "set_mpls_label") \
OFPACT(SET_MPLS_TC, ofpact_mpls_tc, ofpact, "set_mpls_tc") \
OFPACT(SET_MPLS_TTL, ofpact_mpls_ttl, ofpact, "set_mpls_ttl") \
OFPACT(DEC_MPLS_TTL, ofpact_null, ofpact, "dec_mpls_ttl") \
OFPACT(PUSH_MPLS, ofpact_push_mpls, ofpact, "push_mpls") \
OFPACT(POP_MPLS, ofpact_pop_mpls, ofpact, "pop_mpls") \
OFPACT(DEC_NSH_TTL, ofpact_null, ofpact, "dec_nsh_ttl") \
OFPACT(DELETE_FIELD, ofpact_delete_field, ofpact, "delete_field") \
\
/* Generic encap & decap */ \
OFPACT(ENCAP, ofpact_encap, props, "encap") \
OFPACT(DECAP, ofpact_decap, ofpact, "decap") \
\
/* Metadata. */ \
OFPACT(SET_TUNNEL, ofpact_tunnel, ofpact, "set_tunnel") \
OFPACT(SET_QUEUE, ofpact_queue, ofpact, "set_queue") \
OFPACT(POP_QUEUE, ofpact_null, ofpact, "pop_queue") \
OFPACT(FIN_TIMEOUT, ofpact_fin_timeout, ofpact, "fin_timeout") \
\
/* Flow table interaction. */ \
OFPACT(RESUBMIT, ofpact_resubmit, ofpact, "resubmit") \
OFPACT(LEARN, ofpact_learn, specs, "learn") \
OFPACT(CONJUNCTION, ofpact_conjunction, ofpact, "conjunction") \
\
/* Arithmetic. */ \
OFPACT(MULTIPATH, ofpact_multipath, ofpact, "multipath") \
\
/* Other. */ \
OFPACT(NOTE, ofpact_note, data, "note") \
OFPACT(EXIT, ofpact_null, ofpact, "exit") \
OFPACT(SAMPLE, ofpact_sample, ofpact, "sample") \
OFPACT(UNROLL_XLATE, ofpact_unroll_xlate, ofpact, "unroll_xlate") \
OFPACT(CT, ofpact_conntrack, ofpact, "ct") \
OFPACT(CT_CLEAR, ofpact_null, ofpact, "ct_clear") \
OFPACT(NAT, ofpact_nat, ofpact, "nat") \
OFPACT(OUTPUT_TRUNC, ofpact_output_trunc,ofpact, "output_trunc") \
OFPACT(CLONE, ofpact_nest, actions, "clone") \
OFPACT(CHECK_PKT_LARGER, ofpact_check_pkt_larger, ofpact, \
"check_pkt_larger") \
\
/* Debugging actions. \
* \
* These are intentionally undocumented, subject to change, and \
* only accepted if ovs-vswitchd is started with --enable-dummy. */ \
OFPACT(DEBUG_RECIRC, ofpact_null, ofpact, "debug_recirc") \
OFPACT(DEBUG_SLOW, ofpact_null, ofpact, "debug_slow") \
\
/* Instructions ("meter" is an action in OF1.5+). */ \
OFPACT(METER, ofpact_meter, ofpact, "meter") \
OFPACT(CLEAR_ACTIONS, ofpact_null, ofpact, "clear_actions") \
OFPACT(WRITE_ACTIONS, ofpact_nest, actions, "write_actions") \
OFPACT(WRITE_METADATA, ofpact_metadata, ofpact, "write_metadata") \
OFPACT(GOTO_TABLE, ofpact_goto_table, ofpact, "goto_table")
/* enum ofpact_type, with a member OFPACT_<ENUM> for each action. */
enum OVS_PACKED_ENUM ofpact_type {
#define OFPACT(ENUM, STRUCT, MEMBER, NAME) OFPACT_##ENUM,
OFPACTS
#undef OFPACT
};
从vport收到packet
// 1. rx handler invoked by the kernel for every packet received on a vport.
static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
{
struct sk_buff *skb = *pskb;
if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
return RX_HANDLER_PASS;
#ifndef USE_UPSTREAM_TUNNEL
netdev_port_receive(skb, NULL);
#else
netdev_port_receive(skb, skb_tunnel_info(skb));
#endif
return RX_HANDLER_CONSUMED;
}
// 2. Hand the received packet to the datapath; frees the skb on error.
void netdev_port_receive(struct sk_buff *skb, struct ip_tunnel_info *tun_info)
{
......
ovs_vport_receive(vport, skb, tun_info);
return;
error:
kfree_skb(skb);
}
// 3. ovs_vport_receive() is the packet entry point into the datapath
//    (ovs_vport_send() is the corresponding exit point).
int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
const struct ip_tunnel_info *tun_info)
{
struct sw_flow_key key;
int error;
OVS_CB(skb)->input_vport = vport;
OVS_CB(skb)->mru = 0;
OVS_CB(skb)->cutlen = 0;
......
// Extract the flow key from the packet.
error = ovs_flow_key_extract(tun_info, skb, &key);
if (unlikely(error)) {
kfree_skb(skb);
return error;
}
ovs_dp_process_packet(skb, &key);
return 0;
}
// 4. Match the key extracted from the packet against the kernel flow table.
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
/* Look up flow: compare the packet's key against installed flows. */
flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb),
&n_mask_hit);
if (unlikely(!flow)) {
struct dp_upcall_info upcall;
memset(&upcall, 0, sizeof(upcall));
upcall.cmd = OVS_PACKET_CMD_MISS;
upcall.portid = ovs_vport_find_upcall_portid(p, skb);
upcall.mru = OVS_CB(skb)->mru;
// Flow miss in the kernel: upcall to userspace (slowpath).
error = ovs_dp_upcall(dp, skb, key, &upcall, 0);
if (unlikely(error))
kfree_skb(skb);
else
consume_skb(skb);
stats_counter = &stats->n_missed;
goto out;
}
// Flow hit: execute the flow's actions (fastpath).
sf_acts = rcu_dereference(flow->sf_acts);
error = ovs_execute_actions(dp, skb, sf_acts, key);
......
}
// 5. Flow lookup, with a cache in front of the full table walk.
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
const struct sw_flow_key *key,
u32 skb_hash,
u32 *n_mask_hit)
{
......
/* Cache miss, do full lookup. */
flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index);
if (flow)
ce->skb_hash = skb_hash;
return flow;
}
// 6. Try each mask in the mask array until a flow matches.
static struct sw_flow *flow_lookup(struct flow_table *tbl,
struct table_instance *ti,
const struct mask_array *ma,
const struct sw_flow_key *key,
u32 *n_mask_hit,
u32 *index)
{
......
flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
if (flow) { /* Found */
*index = i;
return flow;
}
}
return NULL;
}
// 7. Core lookup. The flow table is split into N buckets (each one memory
//    page); keys are hashed into buckets, and each sw_flow carries a key
//    and its actions. For each mask: AND the packet key with the mask,
//    hash the masked key to locate the bucket, then walk the bucket
//    comparing mask pointer, hash, and masked key; a full match yields
//    the flow whose actions will be executed.
static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
const struct sw_flow_key *unmasked,
const struct sw_flow_mask *mask,
u32 *n_mask_hit)
{
struct sw_flow *flow;
struct hlist_head *head;
u32 hash;
struct sw_flow_key masked_key;
ovs_flow_mask_key(&masked_key, unmasked, false, mask);
hash = flow_hash(&masked_key, &mask->range);
head = find_bucket(ti, hash);
(*n_mask_hit)++;
hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {
if (flow->mask == mask && flow->flow_table.hash == hash &&
flow_cmp_masked_key(flow, &masked_key, &mask->range))
return flow;
}
return NULL;
}
ovs分片和重组
分片:在dp收到packet后提取3-4层头信息时进行分片,分片后再执行后续的upcall或者ct
// Extract the flow key (metadata plus L2/L3/L4 headers) from a packet.
int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb, struct sw_flow_key *key)
{
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
struct tc_skb_ext *tc_ext;
#endif
int res, err;
/* Extract metadata from packet. */
...
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
if (static_branch_unlikely(&tc_recirc_sharing_support)) {
tc_ext = skb_ext_find(skb, TC_SKB_EXT);
key->recirc_id = tc_ext ? tc_ext->chain : 0;
} else {
key->recirc_id = 0;
}
#else
key->recirc_id = 0;
#endif
// Extract the packet's L2 info (L3/L4 follow inside key_extract()).
err = key_extract(skb, key);
if (!err)
ovs_ct_fill_key(skb, key); /* Must be after key_extract(). */
return err;
}
static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
{
struct ethhdr *eth;
/* Flags are always used as part of stats */
key->tp.flags = 0;
skb_reset_mac_header(skb);
/* Link layer. */
clear_vlan(key);
...
skb_reset_mac_len(skb);
/* Fill out L3/L4 key info, if any */
// Delegate extraction of the L3 and L4 fields.
return key_extract_l3l4(skb, key);
}
// Extract L3/L4 fields into the flow key. (Fix: the original excerpt was
// missing the '}' closing the if (unlikely(error)) block after
// "return error;", which made the IP-header parsing below read as
// unreachable dead code; upstream has the brace.)
static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key)
{
int error;
/* Network layer. */
if (key->eth.type == htons(ETH_P_IP)) {
struct iphdr *nh;
__be16 offset;
error = check_iphdr(skb);
if (unlikely(error)) {
memset(&key->ip, 0, sizeof(key->ip));
memset(&key->ipv4, 0, sizeof(key->ipv4));
if (error == -EINVAL) {
skb->transport_header = skb->network_header;
error = 0;
}
return error;
}
nh = ip_hdr(skb);
key->ipv4.addr.src = nh->saddr;
key->ipv4.addr.dst = nh->daddr;
key->ip.proto = nh->protocol;
key->ip.tos = nh->tos;
key->ip.ttl = nh->ttl;
// Fragment handling: a non-first fragment carries no L4 header, so
// zero the transport part of the key and mark OVS_FRAG_TYPE_LATER.
offset = nh->frag_off & htons(IP_OFFSET);
if (offset) {
key->ip.frag = OVS_FRAG_TYPE_LATER;
memset(&key->tp, 0, sizeof(key->tp));
return 0;
}
...
}
ovs-vswitchd重组:对分片包的4层端口设置为0,所以如果涉及4层协议无法match重组
// 1. Receive upcall messages from the kernel.
static size_t
recv_upcalls(struct handler *handler)
{
...
upcall->key = dupcall->key;
upcall->key_len = dupcall->key_len;
upcall->ufid = &dupcall->ufid;
upcall->hash = hash;
upcall->out_tun_key = dupcall->out_tun_key;
upcall->actions = dupcall->actions;
pkt_metadata_from_flow(&dupcall->packet.md, flow);
flow_extract(&dupcall->packet, flow);
// Process the upcall message.
error = process_upcall(udpif, upcall,
&upcall->odp_actions, &upcall->wc);
if (error) {
goto cleanup;
}
n_upcalls++;
continue;
return n_upcalls;
}
// 2. Process one upcall message, dispatching on its type.
static int
process_upcall(struct udpif *udpif, struct upcall *upcall,
struct ofpbuf *odp_actions, struct flow_wildcards *wc)
{
const struct dp_packet *packet = upcall->packet;
const struct flow *flow = upcall->flow;
size_t actions_len = 0;
switch (upcall->type) {
case MISS_UPCALL:
case SLOW_PATH_UPCALL:
// Translate the upcall through the OpenFlow pipeline.
upcall_xlate(udpif, upcall, odp_actions, wc);
return 0;
}
...
}
// 3. Perform the upcall translation (flow -> datapath actions).
static void
upcall_xlate(struct udpif *udpif, struct upcall *upcall,
struct ofpbuf *odp_actions, struct flow_wildcards *wc)
{
struct dpif_flow_stats stats;
enum xlate_error xerr;
struct xlate_in xin;
struct ds output;
stats.n_packets = 1;
stats.n_bytes = dp_packet_size(upcall->packet);
stats.used = time_msec();
stats.tcp_flags = ntohs(upcall->flow->tcp_flags);
...
upcall->reval_seq = seq_read(udpif->reval_seq);
xerr = xlate_actions(&xin, &upcall->xout);
}
// 4. Continue the upcall: look up a rule in the OpenFlow pipeline and
//    translate it into datapath actions.
enum xlate_error
xlate_actions(struct xlate_in *xin, struct xlate_out *xout)
{
*xout = (struct xlate_out) {
.slow = 0,
.recircs = RECIRC_REFS_EMPTY_INITIALIZER,
};
if (!xin->ofpacts && !ctx.rule) {
// Match a rule from the OpenFlow pipeline.
ctx.rule = rule_dpif_lookup_from_table(
ctx.xbridge->ofproto, ctx.xin->tables_version, flow, ctx.wc,
ctx.xin->resubmit_stats, &ctx.table_id,
flow->in_port.ofp_port, true, true, ctx.xin->xcache);
if (ctx.xin->resubmit_stats) {
rule_dpif_credit_stats(ctx.rule, ctx.xin->resubmit_stats, false);
}
if (ctx.xin->xcache) {
struct xc_entry *entry;
entry = xlate_cache_add_entry(ctx.xin->xcache, XC_RULE);
entry->rule = ctx.rule;
ofproto_rule_ref(&ctx.rule->up);
}
xlate_report_table(&ctx, ctx.rule, ctx.table_id);
}
}
// 5. Look up a rule in the OpenFlow pipeline tables.
struct rule_dpif *
rule_dpif_lookup_from_table(struct ofproto_dpif *ofproto,
ovs_version_t version, struct flow *flow,
struct flow_wildcards *wc,
const struct dpif_flow_stats *stats,
uint8_t *table_id, ofp_port_t in_port,
bool may_packet_in, bool honor_table_miss,
struct xlate_cache *xcache)
{
ovs_be16 old_tp_src = flow->tp_src, old_tp_dst = flow->tp_dst;
ofp_port_t old_in_port = flow->in_port.ofp_port;
enum ofputil_table_miss miss_config;
struct rule_dpif *rule;
uint8_t next_id;
/* We always unwildcard nw_frag (for IP), so they
* need not be unwildcarded here. */
if (flow->nw_frag & FLOW_NW_FRAG_ANY
&& ofproto->up.frag_handling != OFPUTIL_FRAG_NX_MATCH) {
if (ofproto->up.frag_handling == OFPUTIL_FRAG_NORMAL) {
// Fragments match with src/dst L4 ports forced to 0.
/* We must pretend that transport ports are unavailable. */
flow->tp_src = htons(0);
flow->tp_dst = htons(0);
} else {
/* Must be OFPUTIL_FRAG_DROP (we don't have OFPUTIL_FRAG_REASM). */
}
}
}
ct重组:对所有进入ct的包重组
// 1. Fastpath: execute a flow's actions, with a per-CPU recursion limit to
//    catch configuration loops.
int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_actions *acts,
struct sw_flow_key *key)
{
int err, level;
level = __this_cpu_inc_return(exec_actions_level);
if (unlikely(level > OVS_RECURSION_LIMIT)) {
net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
ovs_dp_name(dp));
kfree_skb(skb);
err = -ENETDOWN;
goto out;
}
OVS_CB(skb)->acts_origlen = acts->orig_len;
// Execute the action list.
err = do_execute_actions(dp, skb, key,
acts->actions, acts->actions_len);
if (level == 1)
process_deferred_actions(dp);
out:
__this_cpu_dec(exec_actions_level);
return err;
}
// 2. Walk the netlink-encoded action list and execute each action.
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key,
const struct nlattr *attr, int len)
{
const struct nlattr *a;
int rem;
for (a = attr, rem = len; rem > 0;
a = nla_next(a, &rem)) {
int err = 0;
switch (nla_type(a)) {
case OVS_ACTION_ATTR_CT:
if (!is_flow_key_valid(key)) {
err = ovs_flow_key_update(skb, key);
if (err)
return err;
}
// Dispatch to the conntrack module.
err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
nla_data(a));
/* Hide stolen IP fragments from user space. */
if (err)
return err == -EINPROGRESS ? 0 : err;
break;
}
}
// 3. Conntrack action: reassemble fragments first, then commit or look up
//    the connection.
int ovs_ct_execute(struct net *net, struct sk_buff *skb,
struct sw_flow_key *key,
const struct ovs_conntrack_info *info)
{
int nh_ofs;
int err;
/* The conntrack module expects to be working at L3. */
nh_ofs = skb_network_offset(skb);
skb_pull_rcsum(skb, nh_ofs);
err = ovs_skb_network_trim(skb);
if (err)
return err;
if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
// Fragmented packet: reassemble before conntrack sees it.
err = handle_fragments(net, key, info->zone.id, skb);
if (err)
return err;
}
if (info->commit)
err = ovs_ct_commit(net, key, info, skb);
else
err = ovs_ct_lookup(net, key, info, skb);
skb_push(skb, nh_ofs);
skb_postpush_rcsum(skb, skb->data, nh_ofs);
if (err)
kfree_skb(skb);
return err;
}
总结
- fastpath:仅查找内核中flow table的流表.
- slowpath:在内核无法查找到流表项的时候,才会到用户态查找用户态的流表。
协议 | 规则 |
---|---|
dpctl | 流表直接下到内核,可直接走fastpath |
ofctl | 流表下到用户态,会在内核态缓存,先走fastpath,查询未果后走slowpath |
ct | 流表下发到用户态和内核态,先走fastpath,查询未果后走slowpath |