Bootstrap

OVS底层实现原理

内核态Datapath

datapath模块是ovs的核心,运行在内核态,ovs-vswitchd 运行在用户态,两者通过netlink 通信。
netlink 是一种灵活和强大的进程间通信机制(socket),可以沟通用户态和内核态。根据不同socket注册五种类型的family,包括datapath、vport、flow 、packet、meter。

datapath初始化
/* Step 1: module entry point. dp_init() initializes each datapath subsystem in
 * turn and, as its last step, calls dp_register_genl() to register the generic
 * netlink families together with their operations.
 *
 * Returns 0 on success, or the error code of the first step that failed after
 * undoing the steps that had already succeeded. */
static int __init dp_init(void)
{
    int err;
    /* Build-time check that struct ovs_skb_cb fits in sk_buff's cb field. */
    BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof_field(struct sk_buff, cb));
    pr_info("Open vSwitch switching datapath %s\n", VERSION);
    ovs_nsh_init();
    err = action_fifos_init();
    if (err)
        goto error;
    err = ovs_internal_dev_rtnl_link_register();
    if (err)
        goto error_action_fifos_exit;
    err = ovs_flow_init();
    if (err)
        goto error_unreg_rtnl_link;
    err = ovs_vport_init();
    if (err)
        goto error_flow_exit;
    err = register_pernet_device(&ovs_net_ops);
    if (err)
        goto error_vport_exit;
    err = compat_init();
    if (err)
        goto error_netns_exit;
    err = register_netdevice_notifier(&ovs_dp_device_notifier);
    if (err)
        goto error_compat_exit;
    err = ovs_netdev_init();
    if (err)
        goto error_unreg_notifier;
    /* Register the datapath generic netlink families (last init step). */
    err = dp_register_genl();
    if (err < 0)
        goto error_unreg_netdev;
    return 0;
/* Error unwinding: each label undoes one earlier step and falls through to the
 * labels below it, so the steps are torn down in reverse order. */
error_unreg_netdev:
    ovs_netdev_exit();
error_unreg_notifier:
    unregister_netdevice_notifier(&ovs_dp_device_notifier);
error_compat_exit:
    compat_exit();
error_netns_exit:
    unregister_pernet_device(&ovs_net_ops);
error_vport_exit:
    ovs_vport_exit();
error_flow_exit:
    ovs_flow_exit();
error_unreg_rtnl_link:
    ovs_internal_dev_rtnl_link_unregister();
error_action_fifos_exit:
    action_fifos_exit();
error:
    ovs_nsh_cleanup();
    return err;
}
/* Step 2: register every generic netlink family listed in dp_genl_families.
 *
 * Returns 0 when all families were registered. On the first failure the index
 * of the failing entry is handed to dp_unregister_genl() (presumably to
 * unregister the entries registered so far -- confirm against its definition)
 * and the error from genl_register_family() is returned. */
static int __init dp_register_genl(void)
{
    int idx = 0;

    while (idx < ARRAY_SIZE(dp_genl_families)) {
        /* Register one family. */
        int rc = genl_register_family(dp_genl_families[idx]);

        if (rc) {
            dp_unregister_genl(idx);
            return rc;
        }
        idx++;
    }
    return 0;
}
/* Step 3: the families registered by dp_register_genl(): datapath, vport,
 * flow, packet and meter, plus the ct_limit family when
 * CONFIG_NETFILTER_CONNCOUNT is enabled. */
static struct genl_family *dp_genl_families[] = {
    &dp_datapath_genl_family,
    &dp_vport_genl_family,
    &dp_flow_genl_family,
    &dp_packet_genl_family,
    &dp_meter_genl_family,
#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
    &dp_ct_limit_genl_family,
#endif
};
创建网桥br-int: ovs-vsctl add-br br-int
/* Step 1: a bridge is managed through the datapath family. This descriptor
 * ties the family name/version to its operation table (dp_datapath_genl_ops)
 * and to a single multicast group. The family-level policy is only set when
 * HAVE_GENL_OPS_POLICY is not defined; otherwise each operation carries its
 * own policy. */
static struct genl_family dp_datapath_genl_family __ro_after_init = {
    .hdrsize = sizeof(struct ovs_header),
    .name = OVS_DATAPATH_FAMILY,
    .version = OVS_DATAPATH_VERSION,
    .maxattr = OVS_DP_ATTR_MAX,
#ifndef HAVE_GENL_OPS_POLICY
    .policy = datapath_policy,
#endif
    .netnsok = true,
    .parallel_ops = true,
    .ops = dp_datapath_genl_ops,
    .n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
    .mcgrps = &ovs_dp_datapath_multicast_group,
    .n_mcgrps = 1,
    .module = THIS_MODULE,
};

/* Step 2: operations available on a datapath: NEW, DEL, GET and SET.
 * NEW/DEL/SET are flagged GENL_UNS_ADMIN_PERM; GET carries no flags and is the
 * only command that also provides a dump handler. */
static const struct genl_ops dp_datapath_genl_ops[] = {
    { .cmd = OVS_DP_CMD_NEW,
#ifdef HAVE_GENL_VALIDATE_FLAGS
      .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
      .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
      .policy = datapath_policy,
#endif
      .doit = ovs_dp_cmd_new
    },
    { .cmd = OVS_DP_CMD_DEL,
#ifdef HAVE_GENL_VALIDATE_FLAGS
      .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
      .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
      .policy = datapath_policy,
#endif
      .doit = ovs_dp_cmd_del
    },
    { .cmd = OVS_DP_CMD_GET,
#ifdef HAVE_GENL_VALIDATE_FLAGS
      .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
      .flags = 0,           /* OK for unprivileged users. */
#ifdef HAVE_GENL_OPS_POLICY
      .policy = datapath_policy,
#endif
      .doit = ovs_dp_cmd_get,
      .dumpit = ovs_dp_cmd_dump
    },
    { .cmd = OVS_DP_CMD_SET,
#ifdef HAVE_GENL_VALIDATE_FLAGS
      .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
      .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
      .policy = datapath_policy,
#endif
      .doit = ovs_dp_cmd_set,
    },
};

/* Step 3: handler for OVS_DP_CMD_NEW. Creating a datapath (bridge) also
 * creates a port with the same name: the vport parameters below reuse the
 * datapath name, use the internal vport type and the OVSP_LOCAL port number.
 * (Abridged excerpt: "......" marks code left out of these notes.) */
static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
   ......
    /* Set up our datapath device. */
    parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
    parms.type = OVS_VPORT_TYPE_INTERNAL;
    parms.options = NULL;
    parms.dp = dp;
    parms.port_no = OVSP_LOCAL;
    parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID];
    vport = new_vport(&parms);
    ......
}

/* Step 4: create the port that accompanies a new bridge.
 *
 * Calls ovs_vport_add() and, when that succeeds, links the new vport into the
 * datapath's hash bucket selected by its port number. The value returned by
 * ovs_vport_add() (a vport or an error pointer) is passed back unchanged. */
static struct vport *new_vport(const struct vport_parms *parms)
{
    struct vport *port = ovs_vport_add(parms);
    struct hlist_head *bucket;

    if (IS_ERR(port))
        return port;

    bucket = vport_hash_bucket(parms->dp, port->port_no);
    hlist_add_head_rcu(&port->dp_hash_node, bucket);
    return port;
}

/* Step 5: determine which kind of port to create by looking up the vport_ops
 * that match 'parms'.
 * (Abridged excerpt: the code after the lookup is left out of these notes.) */
struct vport *ovs_vport_add(const struct vport_parms *parms)
{
    struct vport_ops *ops;
    struct vport *vport;
    ops = ovs_vport_lookup(parms);
    ......
}
/* Operations for OVS_VPORT_TYPE_INTERNAL ports: creation goes through
 * internal_dev_create(), and the send hook is internal_dev_recv(). */
static struct vport_ops ovs_internal_vport_ops = {
    .type       = OVS_VPORT_TYPE_INTERNAL,
    .create     = internal_dev_create,
    .destroy    = internal_dev_destroy,
    .send       = internal_dev_recv,
};

/* Step 6: create an internal vport and its backing net_device (per the
 * original note, the netdev represents the platform-specific device
 * implementation underneath the vport).
 *
 * NOTE(review): this is an abridged excerpt. As shown it does not check the
 * result of alloc_netdev(), never uses 'internal_dev', does not define the
 * 'error' / 'error_unlock' labels it jumps to, and has no return statement --
 * consult the full source before reusing it. */
static struct vport *internal_dev_create(const struct vport_parms *parms)
{
    struct vport *vport;
    struct internal_dev *internal_dev;
    int err;

    vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms);
    if (IS_ERR(vport)) {
        err = PTR_ERR(vport);
        goto error;
    }
    /* Allocate the net_device, named after the port. */
    vport->dev = alloc_netdev(sizeof(struct internal_dev),
                  parms->name, NET_NAME_USER, do_setup);
 
    rtnl_lock();
    /* Register the net_device attached to this vport with the kernel, which
     * manages it from here on. */
    err = register_netdevice(vport->dev);
    if (err)
        goto error_unlock;
    rtnl_unlock();
}
在br-int网桥上创建port:ovs-vsctl add-port xxx
/* Step 1: adding a port enters through the vport family. This descriptor ties
 * the family name/version to its operation table (dp_vport_genl_ops) and to a
 * single multicast group. The family-level policy is only set when
 * HAVE_GENL_OPS_POLICY is not defined. */
struct genl_family dp_vport_genl_family __ro_after_init = {
    .hdrsize = sizeof(struct ovs_header),
    .name = OVS_VPORT_FAMILY,
    .version = OVS_VPORT_VERSION,
    .maxattr = OVS_VPORT_ATTR_MAX,
#ifndef HAVE_GENL_OPS_POLICY
    .policy = vport_policy,
#endif
    .netnsok = true,
    .parallel_ops = true,
    .ops = dp_vport_genl_ops,
    .n_ops = ARRAY_SIZE(dp_vport_genl_ops),
    .mcgrps = &ovs_dp_vport_multicast_group,
    .n_mcgrps = 1,
    .module = THIS_MODULE,
};

/* Step 2: operations of the vport family: NEW, DEL, GET and SET.
 * NEW/DEL/SET are flagged GENL_UNS_ADMIN_PERM; GET carries no flags and is the
 * only command that also provides a dump handler. */
static const struct genl_ops dp_vport_genl_ops[] = {
    { .cmd = OVS_VPORT_CMD_NEW,
#ifdef HAVE_GENL_VALIDATE_FLAGS
      .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
      .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
      .policy = vport_policy,
#endif
      .doit = ovs_vport_cmd_new
    },
    { .cmd = OVS_VPORT_CMD_DEL,
#ifdef HAVE_GENL_VALIDATE_FLAGS
      .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
      .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
      .policy = vport_policy,
#endif
      .doit = ovs_vport_cmd_del
    },
    { .cmd = OVS_VPORT_CMD_GET,
#ifdef HAVE_GENL_VALIDATE_FLAGS
      .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
      .flags = 0,           /* OK for unprivileged users. */
#ifdef HAVE_GENL_OPS_POLICY
      .policy = vport_policy,
#endif
      .doit = ovs_vport_cmd_get,
      .dumpit = ovs_vport_cmd_dump
    },
    { .cmd = OVS_VPORT_CMD_SET,
#ifdef HAVE_GENL_VALIDATE_FLAGS
      .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
      .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
      .policy = vport_policy,
#endif
      .doit = ovs_vport_cmd_set,
    },
};

/* Step 3: handler for OVS_VPORT_CMD_NEW. Fills the vport parameters from the
 * netlink attributes (name, type, options, upcall port ids) and creates the
 * port via new_vport(). When new_vport() fails with -EAGAIN the handler jumps
 * back to 'restart'; any other error leaves through 'exit_unlock_free'.
 * (Abridged excerpt: "......" marks code left out of these notes.) */
static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
   ......
    parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
    parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
    parms.options = a[OVS_VPORT_ATTR_OPTIONS];
    parms.dp = dp;
    parms.port_no = port_no;
    parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID];

    vport = new_vport(&parms);
    err = PTR_ERR(vport);
    if (IS_ERR(vport)) {
        if (err == -EAGAIN)
            goto restart;
        goto exit_unlock_free;
    }
    ......
}
/* Step 4: create the port.
 *
 * Delegates creation to ovs_vport_add(); on success the vport is added to the
 * head of the datapath hash bucket chosen by its port number. Whatever
 * ovs_vport_add() returned (vport or error pointer) is returned as is. */
static struct vport *new_vport(const struct vport_parms *parms)
{
    struct vport *created = ovs_vport_add(parms);

    if (IS_ERR(created))
        return created;

    hlist_add_head_rcu(&created->dp_hash_node,
                       vport_hash_bucket(parms->dp, created->port_no));
    return created;
}

/* Step 5: determine which kind of port to create by looking up the vport_ops
 * that match 'parms'.
 * (Abridged excerpt: the code after the lookup is left out of these notes.) */
struct vport *ovs_vport_add(const struct vport_parms *parms)
{
    struct vport_ops *ops;
    struct vport *vport;
    ops = ovs_vport_lookup(parms);
    ......
}
/* Operations for OVS_VPORT_TYPE_NETDEV ports: creation goes through
 * netdev_create(), and packets are sent with dev_queue_xmit(). */
static struct vport_ops ovs_netdev_vport_ops = {
    .type       = OVS_VPORT_TYPE_NETDEV,
    .create     = netdev_create,
    .destroy    = netdev_destroy,
    .send       = dev_queue_xmit,
};

/* Step 6: create a netdev-backed vport.
 *
 * Allocates the vport with the netdev vport operations and, if that worked,
 * hands it to ovs_netdev_link() together with the device name. An error
 * pointer from the allocation is returned unchanged. */
static struct vport *netdev_create(const struct vport_parms *parms)
{
    /* Allocate the vport itself. */
    struct vport *port = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms);

    /* On success, attach it to the named kernel device. */
    return IS_ERR(port) ? port : ovs_netdev_link(port, parms->name);
}

/* Step 7: netdev_rx_handler_register() installs netdev_frame_hook as the
 * receive handler of the vport's device (with the vport as handler data), so
 * packets received on this port are handed to netdev_frame_hook().
 * (Abridged excerpt: "......" marks code left out of these notes.) */
struct vport *ovs_netdev_link(struct vport *vport, const char *name)
{
   ......
    err = netdev_rx_handler_register(vport->dev, netdev_frame_hook,
                     vport);
    if (err)
        goto error_master_upper_dev_unlink;
    ......
}
在br-int网桥内核添加flow:ovs-dpctl add-flow xxx
/* Step 1: adding a flow goes through the flow family. This descriptor ties the
 * family name/version to its operation table (dp_flow_genl_ops) and to a
 * single multicast group. The family-level policy is only set when
 * HAVE_GENL_OPS_POLICY is not defined. */
static struct genl_family dp_flow_genl_family __ro_after_init = {
    .hdrsize = sizeof(struct ovs_header),
    .name = OVS_FLOW_FAMILY,
    .version = OVS_FLOW_VERSION,
    .maxattr = OVS_FLOW_ATTR_MAX,
#ifndef HAVE_GENL_OPS_POLICY
    .policy = flow_policy,
#endif
    .netnsok = true,
    .parallel_ops = true,
    .ops = dp_flow_genl_ops,
    .n_ops = ARRAY_SIZE(dp_flow_genl_ops),
    .mcgrps = &ovs_dp_flow_multicast_group,
    .n_mcgrps = 1,
    .module = THIS_MODULE,
};

/* Step 2: operations of the flow family: NEW, DEL, GET and SET.
 * NEW/DEL/SET are flagged GENL_UNS_ADMIN_PERM; GET carries no flags and is the
 * only command that also provides a dump handler. */
static const struct genl_ops dp_flow_genl_ops[] = {
    { .cmd = OVS_FLOW_CMD_NEW,
#ifdef HAVE_GENL_VALIDATE_FLAGS
      .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
      .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
      .policy = flow_policy,
#endif
      .doit = ovs_flow_cmd_new
    },
    { .cmd = OVS_FLOW_CMD_DEL,
#ifdef HAVE_GENL_VALIDATE_FLAGS
      .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
      .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
      .policy = flow_policy,
#endif
      .doit = ovs_flow_cmd_del
    },
    { .cmd = OVS_FLOW_CMD_GET,
#ifdef HAVE_GENL_VALIDATE_FLAGS
      .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
      .flags = 0,           /* OK for unprivileged users. */
#ifdef HAVE_GENL_OPS_POLICY
      .policy = flow_policy,
#endif
      .doit = ovs_flow_cmd_get,
      .dumpit = ovs_flow_cmd_dump
    },
    { .cmd = OVS_FLOW_CMD_SET,
#ifdef HAVE_GENL_VALIDATE_FLAGS
      .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
#endif
      .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
#ifdef HAVE_GENL_OPS_POLICY
      .policy = flow_policy,
#endif
      .doit = ovs_flow_cmd_set,
    },
};

/* Step 3: handler for OVS_FLOW_CMD_NEW. The match key and mask are parsed from
 * the netlink attributes, the key is ANDed with the mask and stored as the new
 * flow's key, and -- if no existing flow is found -- the flow is inserted into
 * the flow table (per the original note, the hash of the masked key selects
 * the bucket).
 * (Abridged excerpt: "......" marks code left out of these notes, including
 * the closing of the final if-block.) */
static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
  ......
    /* Extract key. Initialize the match structure, tie it to the flow key and
     * the mask, then fill it from the netlink attributes. */
    ovs_match_init(&match, &new_flow->key, false, &mask);
    error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
                  a[OVS_FLOW_ATTR_MASK], log);
    if (error)
        goto err_kfree_flow;

   
    /* Apply the mask to the extracted key, storing the result back into
     * new_flow->key. */
    ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask);

    /* Check if this is a duplicate flow */
    if (ovs_identifier_is_ufid(&new_flow->id))
        flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
    if (!flow)
        flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key);
    if (likely(!flow)) {
        rcu_assign_pointer(new_flow->sf_acts, acts);

        /* Put flow in bucket (insert into the flow table). */
        error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
        if (unlikely(error)) {
            acts = NULL;
            goto err_unlock_ovs;
        }
       ......
}
/* Step 4: insert a flow and its mask into the flow table.
 *
 * The mask is inserted first; if that fails its error code is returned and
 * nothing else is done. Otherwise the flow is inserted by key and, when its
 * identifier is a UFID, by UFID as well. Returns 0 on success. */
int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
            const struct sw_flow_mask *mask)
{
    int rc = flow_mask_insert(table, flow, mask);

    if (!rc) {
        flow_key_insert(table, flow);
        if (ovs_identifier_is_ufid(&flow->id))
            flow_ufid_insert(table, flow);
    }

    return rc;
}
在br-int网桥添加flow(用户态OpenFlow流表):ovs-ofctl add-flow xxx
/* Step 1: command table used to match the requested sub-command; "add-flow"
 * is dispatched to ofctl_add_flow().
 *
 * Each entry appears to hold the command name, its usage string, the minimum
 * and maximum argument counts, the handler and an access mode (OVS_RO/OVS_RW)
 * -- confirm against the definition of struct ovs_cmdl_command.
 *
 * NOTE(review): only a subset of the commands is listed in this excerpt;
 * check whether the consumer of this table expects a terminating sentinel
 * entry before reusing it. */
static const struct ovs_cmdl_command all_commands[] = {
    { "show", "switch",
      1, 1, ofctl_show, OVS_RO },
    { "dump-flows", "switch",
      1, 2, ofctl_dump_flows, OVS_RO },
    { "add-flow", "switch flow",
      2, 2, ofctl_add_flow, OVS_RW },
    { "add-flows", "switch file",
      2, 2, ofctl_add_flows, OVS_RW },
    { "mod-flows", "switch flow",
      2, 2, ofctl_mod_flows, OVS_RW },
    { "del-flows", "switch [flow]",
      1, 2, ofctl_del_flows, OVS_RW },
    { "replace-flows", "switch file",
      2, 2, ofctl_replace_flows, OVS_RW },
    { "diff-flows", "source1 source2",
      2, 2, ofctl_diff_flows, OVS_RW },
};
 
 /* Step 2: "add-flow" handler. Forwards the command-line arguments to
  * ofctl_flow_mod() with the OFPFC_ADD command. */
static void
ofctl_add_flow(struct ovs_cmdl_context *ctx)
{
    ofctl_flow_mod(ctx->argc, ctx->argv, OFPFC_ADD);
}  

// 3、分别执行match和action
static void
ofctl_flow_mod(int argc, char *argv[], uint16_t command)
{
   if (argc > 2 && !strcmp(argv[2], "-")) {
        ofctl_flow_mod_file(argc, argv, command);
    } else {
        struct ofputil_flow_mod fm;
        char *error;
        enum ofputil_protocol usable_protocols;
        // 3.1parse参数
        error = parse_ofp_flow_mod_str(&fm, argc > 2 ? argv[2] : "",
                                       ports_to_accept(argv[1]),
                                       tables_to_accept(argv[1]), command,
                                       &usable_protocols);
        if (error) {
            ovs_fatal(0, "%s", error);
        }
        // 3.2生成flow
        ofctl_flow_mod__(argv[1], &fm, 1, usable_protocols);
    }
}

/* Step 3.1: parse the textual flow description 'string' into 'fm' by calling
 * parse_ofp_str(). The result of that call is an error string (NULL meaning no
 * error, judging by how the caller tests it -- confirm).
 * (Abridged excerpt: "......" marks code left out of these notes.) */
char * OVS_WARN_UNUSED_RESULT
parse_ofp_flow_mod_str(struct ofputil_flow_mod *fm, const char *string,
                       const struct ofputil_port_map *port_map,
                       const struct ofputil_table_map *table_map,
                       int command,
                       enum ofputil_protocol *usable_protocols)
{
    char *error = parse_ofp_str(fm, command, string, port_map, table_map,
                                usable_protocols);
  ......
}

parse_ofp_str(struct ofputil_flow_mod *fm, int command, const char *str_,
              const struct ofputil_port_map *port_map,
              const struct ofputil_table_map *table_map,
              enum ofputil_protocol *usable_protocols)
{
    char *string = xstrdup(str_);
    char *error;

    error = parse_ofp_str__(fm, command, string, port_map, table_map,
                            usable_protocols);
    ......
}

/* Worker that parses a flow description: it first splits off the action
 * string, then parses the match key/value pairs, and finally parses the
 * actions into an ofpbuf.
 * (Abridged excerpt: "......" marks code left out of these notes, so braces
 * are not balanced as shown.) */
static char * OVS_WARN_UNUSED_RESULT
parse_ofp_str__(struct ofputil_flow_mod *fm, int command, char *string,
                const struct ofputil_port_map *port_map,
                const struct ofputil_table_map *table_map,
                enum ofputil_protocol *usable_protocols)
{
    ......
    /* Extract the actions part of the string into act_str. */
    if (fields & F_ACTIONS) {
        act_str = ofp_extract_actions(string);
        if (!act_str) {
            return xstrdup("must specify an action");
        }
    }
/* Parse the match parameters. */
    struct match match = MATCH_CATCHALL_INITIALIZER;
    while (ofputil_parse_key_value(&string, &name, &value)) {
        const struct ofp_protocol *p;
        const struct mf_field *mf;
        char *error = NULL;

        if (ofp_parse_protocol(name, &p)) {
            match_set_dl_type(&match, htons(p->dl_type));
            if (p->nw_proto) {
                match_set_nw_proto(&match, p->nw_proto);
            }
            match_set_default_packet_type(&match);
        } else if (!strcmp(name, "eth")) {
            match_set_packet_type(&match, htonl(PT_ETH));
       ......

/* Parse the actions. */
        ofpbuf_init(&ofpacts, 32);
        struct ofpact_parse_params pp = {
            .port_map = port_map,
            .table_map = table_map,
            .ofpacts = &ofpacts,
            .usable_protocols = &action_usable_protocols
        };
        error = ofpacts_parse_instructions(act_str, &pp);
       ......
}

/* Parses the action string 's' with instructions allowed: forwards to
 * ofpacts_parse_copy() with allow_instructions = true and outer_action = 0,
 * returning its result. */
char * OVS_WARN_UNUSED_RESULT
ofpacts_parse_instructions(const char *s, const struct ofpact_parse_params *pp)
{
    return ofpacts_parse_copy(s, pp, true, 0);
}

/* Parses a private copy of the action string 's_'.
 *
 * Resets *pp->usable_protocols to OFPUTIL_P_ANY, duplicates the input so the
 * parser can work on a writable buffer, runs ofpacts_parse() on the copy and
 * frees the copy again. Returns whatever ofpacts_parse() returned. */
static char * OVS_WARN_UNUSED_RESULT
ofpacts_parse_copy(const char *s_, const struct ofpact_parse_params *pp,
                   bool allow_instructions, enum ofpact_type outer_action)
{
    char *copy;
    char *err_msg;

    *pp->usable_protocols = OFPUTIL_P_ANY;

    copy = xstrdup(s_);
    err_msg = ofpacts_parse(copy, pp, allow_instructions, outer_action);
    free(copy);

    return err_msg;
}

/* Depth-limited wrapper around ofpacts_parse__().
 *
 * Refuses to parse when pp->depth has reached MAX_OFPACT_PARSE_DEPTH.
 * Otherwise the depth counter is incremented for the duration of the call
 * and, if parsing fails, the ofpacts buffer size is restored to the value it
 * had on entry. Returns the error string from the parser, or the
 * nesting-depth error. */
static char * OVS_WARN_UNUSED_RESULT
ofpacts_parse(char *str, const struct ofpact_parse_params *pp,
              bool allow_instructions, enum ofpact_type outer_action)
{
    struct ofpact_parse_params *mutable_pp;
    uint32_t saved_size;
    char *err_msg;

    if (pp->depth >= MAX_OFPACT_PARSE_DEPTH) {
        return xstrdup("Action nested too deeply");
    }

    /* 'pp' is const for callers, but the depth counter must be updated. */
    mutable_pp = CONST_CAST(struct ofpact_parse_params *, pp);
    mutable_pp->depth++;

    saved_size = pp->ofpacts->size;
    err_msg = ofpacts_parse__(str, pp, allow_instructions, outer_action);
    if (err_msg) {
        /* Restore the buffer size recorded before parsing. */
        pp->ofpacts->size = saved_size;
    }

    mutable_pp->depth--;
    return err_msg;
}

/* Walks the key/value pairs of the action string and, for every key that
 * names a known action type, parses its argument with ofpact_parse().
 * (Abridged excerpt: "......" marks code left out of these notes, so braces
 * are not balanced as shown.) */
static char * OVS_WARN_UNUSED_RESULT
ofpacts_parse__(char *str, const struct ofpact_parse_params *pp,
                bool allow_instructions, enum ofpact_type outer_action)
{
    uint32_t orig_size = pp->ofpacts->size;
    char *key, *value;
    bool drop = false;
    char *pos;

    pos = str;
    while (ofputil_parse_key_value(&pos, &key, &value)) {
        enum ofpact_type type;
        char *error = NULL;
        ofp_port_t port;
        if (ofpact_type_from_name(key, &type)) {
            /* Parse the arguments of this action. */
            error = ofpact_parse(type, value, pp);
            }
       ......
}

/* The action types that ofpact_parse() dispatches on, written as an X-macro:
 * every OFPACT(ENUM, STRUCT, MEMBER, NAME) entry describes one action. Users
 * define OFPACT() to pick the columns they need and then expand OFPACTS.
 * NAME looks like the textual action name used in flow strings (e.g.
 * "output", "resubmit") -- confirm against ofpact_type_from_name(). */
#define OFPACTS                                                         \
    /* Output. */                                                       \
    OFPACT(OUTPUT,          ofpact_output,      ofpact, "output")       \
    OFPACT(GROUP,           ofpact_group,       ofpact, "group")        \
    OFPACT(CONTROLLER,      ofpact_controller,  userdata, "controller") \
    OFPACT(ENQUEUE,         ofpact_enqueue,     ofpact, "enqueue")      \
    OFPACT(OUTPUT_REG,      ofpact_output_reg,  ofpact, "output_reg")   \
    OFPACT(BUNDLE,          ofpact_bundle,      members, "bundle")      \
                                                                        \
    /* Header changes. */                                               \
    OFPACT(SET_FIELD,       ofpact_set_field,   ofpact, "set_field")    \
    OFPACT(SET_VLAN_VID,    ofpact_vlan_vid,    ofpact, "set_vlan_vid") \
    OFPACT(SET_VLAN_PCP,    ofpact_vlan_pcp,    ofpact, "set_vlan_pcp") \
    OFPACT(STRIP_VLAN,      ofpact_null,        ofpact, "strip_vlan")   \
    OFPACT(PUSH_VLAN,       ofpact_push_vlan,   ofpact, "push_vlan")    \
    OFPACT(SET_ETH_SRC,     ofpact_mac,         ofpact, "mod_dl_src")   \
    OFPACT(SET_ETH_DST,     ofpact_mac,         ofpact, "mod_dl_dst")   \
    OFPACT(SET_IPV4_SRC,    ofpact_ipv4,        ofpact, "mod_nw_src")   \
    OFPACT(SET_IPV4_DST,    ofpact_ipv4,        ofpact, "mod_nw_dst")   \
    OFPACT(SET_IP_DSCP,     ofpact_dscp,        ofpact, "mod_nw_tos")   \
    OFPACT(SET_IP_ECN,      ofpact_ecn,         ofpact, "mod_nw_ecn")   \
    OFPACT(SET_IP_TTL,      ofpact_ip_ttl,      ofpact, "mod_nw_ttl")   \
    OFPACT(SET_L4_SRC_PORT, ofpact_l4_port,     ofpact, "mod_tp_src")   \
    OFPACT(SET_L4_DST_PORT, ofpact_l4_port,     ofpact, "mod_tp_dst")   \
    OFPACT(REG_MOVE,        ofpact_reg_move,    ofpact, "move")         \
    OFPACT(STACK_PUSH,      ofpact_stack,       ofpact, "push")         \
    OFPACT(STACK_POP,       ofpact_stack,       ofpact, "pop")          \
    OFPACT(DEC_TTL,         ofpact_cnt_ids,     cnt_ids, "dec_ttl")     \
    OFPACT(SET_MPLS_LABEL,  ofpact_mpls_label,  ofpact, "set_mpls_label") \
    OFPACT(SET_MPLS_TC,     ofpact_mpls_tc,     ofpact, "set_mpls_tc")  \
    OFPACT(SET_MPLS_TTL,    ofpact_mpls_ttl,    ofpact, "set_mpls_ttl") \
    OFPACT(DEC_MPLS_TTL,    ofpact_null,        ofpact, "dec_mpls_ttl") \
    OFPACT(PUSH_MPLS,       ofpact_push_mpls,   ofpact, "push_mpls")    \
    OFPACT(POP_MPLS,        ofpact_pop_mpls,    ofpact, "pop_mpls")     \
    OFPACT(DEC_NSH_TTL,     ofpact_null,        ofpact, "dec_nsh_ttl")  \
    OFPACT(DELETE_FIELD,    ofpact_delete_field, ofpact, "delete_field") \
                                                                        \
    /* Generic encap & decap */                                         \
    OFPACT(ENCAP,           ofpact_encap,       props, "encap")         \
    OFPACT(DECAP,           ofpact_decap,       ofpact, "decap")        \
                                                                        \
    /* Metadata. */                                                     \
    OFPACT(SET_TUNNEL,      ofpact_tunnel,      ofpact, "set_tunnel")   \
    OFPACT(SET_QUEUE,       ofpact_queue,       ofpact, "set_queue")    \
    OFPACT(POP_QUEUE,       ofpact_null,        ofpact, "pop_queue")    \
    OFPACT(FIN_TIMEOUT,     ofpact_fin_timeout, ofpact, "fin_timeout")  \
                                                                        \
    /* Flow table interaction. */                                       \
    OFPACT(RESUBMIT,        ofpact_resubmit,    ofpact, "resubmit")     \
    OFPACT(LEARN,           ofpact_learn,       specs, "learn")         \
    OFPACT(CONJUNCTION,     ofpact_conjunction, ofpact, "conjunction")  \
                                                                        \
    /* Arithmetic. */                                                   \
    OFPACT(MULTIPATH,       ofpact_multipath,   ofpact, "multipath")    \
                                                                        \
    /* Other. */                                                        \
    OFPACT(NOTE,            ofpact_note,        data, "note")           \
    OFPACT(EXIT,            ofpact_null,        ofpact, "exit")         \
    OFPACT(SAMPLE,          ofpact_sample,      ofpact, "sample")       \
    OFPACT(UNROLL_XLATE,    ofpact_unroll_xlate, ofpact, "unroll_xlate") \
    OFPACT(CT,              ofpact_conntrack,   ofpact, "ct")           \
    OFPACT(CT_CLEAR,        ofpact_null,        ofpact, "ct_clear")     \
    OFPACT(NAT,             ofpact_nat,         ofpact, "nat")          \
    OFPACT(OUTPUT_TRUNC,    ofpact_output_trunc,ofpact, "output_trunc") \
    OFPACT(CLONE,           ofpact_nest,        actions, "clone")       \
    OFPACT(CHECK_PKT_LARGER, ofpact_check_pkt_larger, ofpact,           \
           "check_pkt_larger")                                          \
                                                                        \
    /* Debugging actions.                                               \
     *                                                                  \
     * These are intentionally undocumented, subject to change, and     \
     * only accepted if ovs-vswitchd is started with --enable-dummy. */ \
    OFPACT(DEBUG_RECIRC, ofpact_null,           ofpact, "debug_recirc") \
    OFPACT(DEBUG_SLOW,   ofpact_null,           ofpact, "debug_slow")   \
                                                                        \
    /* Instructions ("meter" is an action in OF1.5+). */                \
    OFPACT(METER,           ofpact_meter,       ofpact, "meter")        \
    OFPACT(CLEAR_ACTIONS,   ofpact_null,        ofpact, "clear_actions") \
    OFPACT(WRITE_ACTIONS,   ofpact_nest,        actions, "write_actions") \
    OFPACT(WRITE_METADATA,  ofpact_metadata,    ofpact, "write_metadata") \
    OFPACT(GOTO_TABLE,      ofpact_goto_table,  ofpact, "goto_table")

/* enum ofpact_type, with a member OFPACT_<ENUM> for each action.
 *
 * Generated from the OFPACTS list: OFPACT() is temporarily defined to emit
 * "OFPACT_<ENUM>," for each entry and is undefined again afterwards. */
enum OVS_PACKED_ENUM ofpact_type {
#define OFPACT(ENUM, STRUCT, MEMBER, NAME) OFPACT_##ENUM,
    OFPACTS
#undef OFPACT
};
从vport收到packet
/* Step 1: receive hook called when a packet arrives on a vport's device.
 *
 * Packets of type PACKET_LOOPBACK are not taken (RX_HANDLER_PASS). Every
 * other packet is handed to netdev_port_receive() -- with the skb's tunnel
 * info when USE_UPSTREAM_TUNNEL is defined, with NULL otherwise -- and
 * reported as RX_HANDLER_CONSUMED. */
static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
{
    struct sk_buff *pkt = *pskb;

    if (unlikely(pkt->pkt_type == PACKET_LOOPBACK))
        return RX_HANDLER_PASS;

#ifdef USE_UPSTREAM_TUNNEL
    netdev_port_receive(pkt, skb_tunnel_info(pkt));
#else
    netdev_port_receive(pkt, NULL);
#endif
    return RX_HANDLER_CONSUMED;
}

/* Step 2: handle a packet received on a netdev port by passing it on to
 * ovs_vport_receive(). The 'error' path frees the skb.
 * (Abridged excerpt: "......" marks code left out of these notes, including
 * the lookup of 'vport' and the jumps to 'error'.) */
void netdev_port_receive(struct sk_buff *skb, struct ip_tunnel_info *tun_info)
{
    ......
    ovs_vport_receive(vport, skb, tun_info);
    return;
error:
    kfree_skb(skb);
}

/* Step 3: ovs_vport_receive() is the entry point for packets into the
 * datapath (the original note adds that ovs_vport_send is the matching exit
 * point). It records the input vport in the skb control block, extracts the
 * flow key from the packet and hands both to ovs_dp_process_packet().
 *
 * Returns 0 on success; if key extraction fails the skb is freed and the
 * error is returned.
 * (Abridged excerpt: "......" marks code left out of these notes.) */
int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
              const struct ip_tunnel_info *tun_info)
{
    struct sw_flow_key key;
    int error;

    OVS_CB(skb)->input_vport = vport;
    OVS_CB(skb)->mru = 0;
    OVS_CB(skb)->cutlen = 0;
    ......
    /* Extract the flow key from the packet. */
    error = ovs_flow_key_extract(tun_info, skb, &key);
    if (unlikely(error)) {
        kfree_skb(skb);
        return error;
    }
    ovs_dp_process_packet(skb, &key);
    return 0;
}

/* Step 4: process a packet by comparing the key extracted from it against the
 * flows in the datapath flow table. On a miss the packet is sent to user
 * space as an OVS_PACKET_CMD_MISS upcall; on a hit the flow's actions are
 * executed.
 * (Abridged excerpt: local declarations, the 'out' label and other code are
 * left out of these notes.) */
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
    /* Look up flow: match the key taken from the packet against the table. */
    flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb),
                     &n_mask_hit);
    if (unlikely(!flow)) {
        struct dp_upcall_info upcall;

        memset(&upcall, 0, sizeof(upcall));
        upcall.cmd = OVS_PACKET_CMD_MISS;
        upcall.portid = ovs_vport_find_upcall_portid(p, skb);
        upcall.mru = OVS_CB(skb)->mru;
        /* No flow found in the kernel: upcall to user space. */
        error = ovs_dp_upcall(dp, skb, key, &upcall, 0);
        if (unlikely(error))
            kfree_skb(skb);
        else
            consume_skb(skb);
        stats_counter = &stats->n_missed;
        goto out;
    }

   /* Flow found: execute the actions attached to it. */
    sf_acts = rcu_dereference(flow->sf_acts);
    error = ovs_execute_actions(dp, skb, sf_acts, key);
    ......
}

/* Step 5: find the flow for 'key'. The part shown is the path taken after a
 * cache miss: a full lookup via flow_lookup(); when it finds a flow, the
 * packet hash is stored in the cache entry 'ce'.
 * (Abridged excerpt: "......" marks code left out of these notes.) */
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
                      const struct sw_flow_key *key,
                      u32 skb_hash,
                      u32 *n_mask_hit)
{
   ......
    /* Cache miss, do full lookup. */
    flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index);
    if (flow)
        ce->skb_hash = skb_hash;

    return flow;
}

/* Step 6: try masks until one yields a flow. For the mask being tried,
 * masked_flow_lookup() is called; on a hit the mask's index is stored in
 * '*index' and the flow is returned. Returns NULL when nothing matched.
 * (Abridged excerpt: the loop over the masks in 'ma' is left out of these
 * notes, so braces are not balanced as shown.) */
static struct sw_flow *flow_lookup(struct flow_table *tbl,
                   struct table_instance *ti,
                   const struct mask_array *ma,
                   const struct sw_flow_key *key,
                   u32 *n_mask_hit,
                   u32 *index)
{
    ......
        flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
        if (flow) { /* Found */
            *index = i;
            return flow;
        }
    }
    return NULL;
}

/* Step 7 (core lookup): find the flow matching 'unmasked' under one 'mask'.
 *
 * The packet key is ANDed with the mask, the masked key is hashed over the
 * mask's range, and the hash selects a bucket. The flows chained in that
 * bucket are scanned for one that uses the same mask, has the same hash and
 * whose masked key compares equal; that flow is returned so the caller can
 * run its actions. '*n_mask_hit' is incremented once per call. Returns NULL
 * when the bucket holds no matching flow.
 *
 * Background from the original note (not visible in this excerpt): each flow
 * table is split into N buckets, flows are spread over the buckets by the
 * hash of their key, each bucket is one memory page in size, and every
 * sw_flow carries a key and actions. */
static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
                      const struct sw_flow_key *unmasked,
                      const struct sw_flow_mask *mask,
                      u32 *n_mask_hit)
{
    struct sw_flow_key masked_key;
    struct hlist_head *bucket;
    struct sw_flow *candidate;
    u32 key_hash;

    ovs_flow_mask_key(&masked_key, unmasked, false, mask);
    key_hash = flow_hash(&masked_key, &mask->range);
    bucket = find_bucket(ti, key_hash);
    (*n_mask_hit)++;

    hlist_for_each_entry_rcu(candidate, bucket, flow_table.node[ti->node_ver]) {
        if (candidate->mask != mask)
            continue;
        if (candidate->flow_table.hash != key_hash)
            continue;
        if (flow_cmp_masked_key(candidate, &masked_key, &mask->range))
            return candidate;
    }
    return NULL;
}

ovs分片和重组

分片:dp收到packet后,在提取3-4层头信息时识别分片包(对非首个分片标记OVS_FRAG_TYPE_LATER并清零4层信息),之后再执行后续的upcall或者ct

/* Builds the flow key for a received packet: sets key->recirc_id (from the TC
 * skb extension's chain when CONFIG_NET_TC_SKB_EXT is enabled, recirc sharing
 * is active and the extension is present, otherwise 0), then extracts the
 * packet headers with key_extract() and, on success, fills in the conntrack
 * fields. Returns the result of key_extract().
 * (Abridged excerpt: "..." marks code left out of these notes.) */
int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 struct sk_buff *skb, struct sw_flow_key *key)
{
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
struct tc_skb_ext *tc_ext;
#endif
int res, err;

/* Extract metadata from packet. */
...

#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
if (static_branch_unlikely(&tc_recirc_sharing_support)) {
	tc_ext = skb_ext_find(skb, TC_SKB_EXT);
	key->recirc_id = tc_ext ? tc_ext->chain : 0;
	} else {
	key->recirc_id = 0;
	}
#else
key->recirc_id = 0;
#endif

/* Extract the packet's header fields (link layer, then L3/L4) into the key. */
err = key_extract(skb, key);
if (!err)
	ovs_ct_fill_key(skb, key);   /* Must be after key_extract(). */
return err;
}

/* Extracts the header fields of 'skb' into 'key': resets the MAC header,
 * handles the link layer, then delegates the network and transport layers to
 * key_extract_l3l4(), whose result is returned.
 * (Abridged excerpt: "..." marks code left out of these notes.) */
static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
{
	struct ethhdr *eth;

	/* Flags are always used as part of stats */
	key->tp.flags = 0;
	
	skb_reset_mac_header(skb);
	
	/* Link layer. */
	clear_vlan(key);
	...
	
	skb_reset_mac_len(skb);
	
	/* Fill out L3/L4 key info, if any */
	/* Extract the packet's layer-3 and layer-4 information. */
	return key_extract_l3l4(skb, key);
}

/* Fills the network- and transport-layer parts of 'key' from 'skb'. Only the
 * beginning of the IPv4 branch is shown here.
 *
 * For IPv4 the header is validated first. If validation fails, the IP parts
 * of the key are zeroed and the error is returned -- except for -EINVAL,
 * which is turned into success after pointing the transport header at the
 * network header. For a valid header the addresses, protocol, TOS and TTL are
 * copied into the key. A packet with a non-zero fragment offset is marked
 * OVS_FRAG_TYPE_LATER, its transport key is zeroed and 0 is returned at once.
 *
 * (Abridged excerpt: "..." marks code left out of these notes, including the
 * rest of the IPv4 branch and its closing brace.) */
static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key)
{
	int error;

	/* Network layer. */
	if (key->eth.type == htons(ETH_P_IP)) {
		struct iphdr *nh;
		__be16 offset;

		error = check_iphdr(skb);
		if (unlikely(error)) {
			/* Header check failed: clear the IP parts of the key. */
			memset(&key->ip, 0, sizeof(key->ip));
			memset(&key->ipv4, 0, sizeof(key->ipv4));
			if (error == -EINVAL) {
				/* -EINVAL is not reported to the caller. */
				skb->transport_header = skb->network_header;
				error = 0;
			}
			return error;
		}

		nh = ip_hdr(skb);
		key->ipv4.addr.src = nh->saddr;
		key->ipv4.addr.dst = nh->daddr;

		key->ip.proto = nh->protocol;
		key->ip.tos = nh->tos;
		key->ip.ttl = nh->ttl;

		/* Fragment handling: a non-zero offset means this is not the
		 * first fragment, so no L4 fields are recorded for it. */
		offset = nh->frag_off & htons(IP_OFFSET);
		if (offset) {
			key->ip.frag = OVS_FRAG_TYPE_LATER;
			memset(&key->tp, 0, sizeof(key->tp));
			return 0;
		}
...
}

ovs-vswitchd重组:后续分片包的4层端口在key中被置为0,所以涉及4层协议字段的flow无法直接match这些分片,需要重组后再匹配

/* Step 1: receive upcall messages. For each datapath upcall the fields are
 * copied into 'upcall', the packet metadata and flow are derived from the
 * packet, and the upcall is handed to process_upcall(). Returns the number of
 * upcalls processed.
 * (Abridged excerpt: "..." marks code left out of these notes, including the
 * enclosing loop and the 'cleanup' label, so 'continue' appears outside a
 * loop as shown.) */
static size_t
recv_upcalls(struct handler *handler)
{ 
...
        upcall->key = dupcall->key;
        upcall->key_len = dupcall->key_len;
        upcall->ufid = &dupcall->ufid;
        upcall->hash = hash;

        upcall->out_tun_key = dupcall->out_tun_key;
        upcall->actions = dupcall->actions;

        pkt_metadata_from_flow(&dupcall->packet.md, flow);
        flow_extract(&dupcall->packet, flow);

		/* Process the upcall message. */
        error = process_upcall(udpif, upcall,
                               &upcall->odp_actions, &upcall->wc);
        if (error) {
            goto cleanup;
        }

        n_upcalls++;
        continue;

    return n_upcalls;
}

/* Step 2: process one upcall message according to its type. For MISS_UPCALL
 * and SLOW_PATH_UPCALL the upcall is translated via upcall_xlate() and 0 is
 * returned.
 * (Abridged excerpt: "..." marks code left out of these notes, including the
 * remaining upcall types.) */
static int
process_upcall(struct udpif *udpif, struct upcall *upcall,
               struct ofpbuf *odp_actions, struct flow_wildcards *wc)
{
    const struct dp_packet *packet = upcall->packet;
    const struct flow *flow = upcall->flow;
    size_t actions_len = 0;

    switch (upcall->type) {
    case MISS_UPCALL:
    case SLOW_PATH_UPCALL:
/* Perform the upcall translation. */
        upcall_xlate(udpif, upcall, odp_actions, wc);
        return 0;
}
...
}

// 3、执行upcall操作
/*
 * upcall_xlate() -- excerpt; "..." marks code omitted from this walkthrough
 * (the setup of `xin` is not shown).
 *
 * Builds per-upcall statistics for a single packet, snapshots the
 * revalidation sequence number and calls xlate_actions(), which writes its
 * result into upcall->xout.
 */
static void
upcall_xlate(struct udpif *udpif, struct upcall *upcall,
             struct ofpbuf *odp_actions, struct flow_wildcards *wc)
{
    struct dpif_flow_stats stats;
    enum xlate_error xerr;
    struct xlate_in xin;
    struct ds output;

    /* Stats describe exactly one packet: the one carried by this upcall. */
    stats.n_packets = 1;
    stats.n_bytes = dp_packet_size(upcall->packet);
    stats.used = time_msec();
    stats.tcp_flags = ntohs(upcall->flow->tcp_flags);

    ...

    upcall->reval_seq = seq_read(udpif->reval_seq);

    /* xerr is assigned here; its use is not shown in this excerpt. */
    xerr = xlate_actions(&xin, &upcall->xout);
}

// 4、继续执行upcall操作
/*
 * xlate_actions() -- excerpt; `ctx` and `flow` are declared in code omitted
 * from this walkthrough, and the return statement is not shown.
 *
 * Visible steps:
 *   - reset *xout (slow = 0, empty recirc refs);
 *   - if the caller supplied no ofpacts and ctx has no rule yet, look the
 *     rule up with rule_dpif_lookup_from_table();
 *   - if resubmit_stats is set, credit it to the rule;
 *   - if an xlate cache is set, add an XC_RULE entry for the rule and call
 *     ofproto_rule_ref() on it;
 *   - report the matched table via xlate_report_table().
 */
enum xlate_error
xlate_actions(struct xlate_in *xin, struct xlate_out *xout)
{
    *xout = (struct xlate_out) {
        .slow = 0,
        .recircs = RECIRC_REFS_EMPTY_INITIALIZER,
    };
	if (!xin->ofpacts && !ctx.rule) {
		// Look up the matching rule in the OpenFlow pipeline
        ctx.rule = rule_dpif_lookup_from_table(
            ctx.xbridge->ofproto, ctx.xin->tables_version, flow, ctx.wc,
            ctx.xin->resubmit_stats, &ctx.table_id,
            flow->in_port.ofp_port, true, true, ctx.xin->xcache);
        if (ctx.xin->resubmit_stats) {
            rule_dpif_credit_stats(ctx.rule, ctx.xin->resubmit_stats, false);
        }
        if (ctx.xin->xcache) {
            struct xc_entry *entry;

            entry = xlate_cache_add_entry(ctx.xin->xcache, XC_RULE);
            entry->rule = ctx.rule;
            ofproto_rule_ref(&ctx.rule->up);
        }

        xlate_report_table(&ctx, ctx.rule, ctx.table_id);
    }
}
// 5、从openflow pipeline匹配规则
/*
 * rule_dpif_lookup_from_table() -- excerpt showing only the IP-fragment
 * pre-processing; the table lookup itself and the return value are omitted
 * from this walkthrough.
 *
 * When the flow is an IP fragment (nw_frag has FLOW_NW_FRAG_ANY) and the
 * bridge's fragment handling is not OFPUTIL_FRAG_NX_MATCH:
 *   - OFPUTIL_FRAG_NORMAL: tp_src and tp_dst are overwritten with 0, so the
 *     flow is treated as if no transport ports were available;
 *   - otherwise the mode must be OFPUTIL_FRAG_DROP (handling not shown).
 *
 * The original tp_src, tp_dst and in_port are saved in the old_* locals
 * before the ports are overwritten.
 */
struct rule_dpif *
rule_dpif_lookup_from_table(struct ofproto_dpif *ofproto,
                            ovs_version_t version, struct flow *flow,
                            struct flow_wildcards *wc,
                            const struct dpif_flow_stats *stats,
                            uint8_t *table_id, ofp_port_t in_port,
                            bool may_packet_in, bool honor_table_miss,
                            struct xlate_cache *xcache)
{
    ovs_be16 old_tp_src = flow->tp_src, old_tp_dst = flow->tp_dst;
    ofp_port_t old_in_port = flow->in_port.ofp_port;
    enum ofputil_table_miss miss_config;
    struct rule_dpif *rule;
    uint8_t next_id;

    /* We always unwildcard nw_frag (for IP), so they
     * need not be unwildcarded here. */
    if (flow->nw_frag & FLOW_NW_FRAG_ANY
        && ofproto->up.frag_handling != OFPUTIL_FRAG_NX_MATCH) {
        if (ofproto->up.frag_handling == OFPUTIL_FRAG_NORMAL) {
            /* Fragments are matched with source and destination port 0. */
            /* We must pretend that transport ports are unavailable. */
            flow->tp_src = htons(0);
            flow->tp_dst = htons(0);
        } else {
            /* Must be OFPUTIL_FRAG_DROP (we don't have OFPUTIL_FRAG_REASM). */
        }
    }
    /* The table lookup and the return statement are omitted here. */
}

ct重组:对所有进入ct的分片包进行重组

// 1、fastpath执行action
/*
 * ovs_execute_actions() -- fast-path entry point that runs a flow's actions
 * on one skb.
 *
 * The nesting depth is tracked in exec_actions_level, which is incremented
 * on entry and decremented on every exit path.  If the depth exceeds
 * OVS_RECURSION_LIMIT the skb is freed, a rate-limited critical message is
 * logged and -ENETDOWN is returned.  Otherwise the original action length is
 * recorded in the skb control block, do_execute_actions() runs the action
 * list, and deferred actions are processed only when the depth is 1.
 *
 * Returns the result of do_execute_actions(), or -ENETDOWN as described.
 */
int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
			const struct sw_flow_actions *acts,
			struct sw_flow_key *key)
{
	int depth = __this_cpu_inc_return(exec_actions_level);
	int err;

	if (unlikely(depth > OVS_RECURSION_LIMIT)) {
		net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
				     ovs_dp_name(dp));
		kfree_skb(skb);
		err = -ENETDOWN;
	} else {
		OVS_CB(skb)->acts_origlen = acts->orig_len;

		/* Execute the action list. */
		err = do_execute_actions(dp, skb, key,
					 acts->actions, acts->actions_len);

		if (depth == 1)
			process_deferred_actions(dp);
	}

	__this_cpu_dec(exec_actions_level);
	return err;
}

// 2、执行action
/*
 * do_execute_actions() -- excerpt; only the OVS_ACTION_ATTR_CT case is
 * shown, and the end of the function is omitted from this walkthrough.
 *
 * Walks the netlink attribute list (attr, len) one action at a time.  For a
 * CT action it refreshes the flow key with ovs_flow_key_update() when the
 * key is not valid, then calls ovs_ct_execute() with the action's payload.
 * A non-zero result ends execution: -EINPROGRESS is reported as 0, any
 * other error is returned unchanged.
 */
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
      struct sw_flow_key *key,
      const struct nlattr *attr, int len)
{
	const struct nlattr *a;
	int rem;
	
	for (a = attr, rem = len; rem > 0;
	     a = nla_next(a, &rem)) {
	int err = 0;
	
	switch (nla_type(a)) {
	case OVS_ACTION_ATTR_CT:
	if (!is_flow_key_valid(key)) {
	err = ovs_flow_key_update(skb, key);
	if (err)
	return err;
	}
	// Hand the packet to the conntrack module
	err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
	     nla_data(a));
	
	/* Hide stolen IP fragments from user space. */
	if (err)
	return err == -EINPROGRESS ? 0 : err;
	break;
	}

}

// 3、匹配ct模块,执行操作
/*
 * ovs_ct_execute() -- run the conntrack action on one skb.
 *
 * The skb is pulled to its network header, trimmed with
 * ovs_skb_network_trim(), and, when the flow key marks it as a fragment
 * (key->ip.frag != OVS_FRAG_TYPE_NONE), passed through handle_fragments().
 * A non-zero result from either step is returned immediately, without
 * pushing the pulled bytes back.
 *
 * Afterwards info->commit selects ovs_ct_commit() or ovs_ct_lookup().  The
 * pulled bytes are then pushed back, and the skb is freed if that call
 * failed.
 *
 * Returns 0 on success or the first non-zero result encountered.
 */
int ovs_ct_execute(struct net *net, struct sk_buff *skb,
		   struct sw_flow_key *key,
		   const struct ovs_conntrack_info *info)
{
	/* The conntrack module expects to be working at L3. */
	const int l3_off = skb_network_offset(skb);
	int rc;

	skb_pull_rcsum(skb, l3_off);

	rc = ovs_skb_network_trim(skb);
	if (rc)
		return rc;

	if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
		/* Fragmented packet: handle reassembly first. */
		rc = handle_fragments(net, key, info->zone.id, skb);
		if (rc)
			return rc;
	}

	rc = info->commit ? ovs_ct_commit(net, key, info, skb)
			  : ovs_ct_lookup(net, key, info, skb);

	skb_push(skb, l3_off);
	skb_postpush_rcsum(skb, skb->data, l3_off);
	if (rc)
		kfree_skb(skb);
	return rc;
}
总结
  • fastpath:仅查找内核中flow table的流表.
  • slowpath:在内核无法查找到流表项的时候,才会到用户态查找用户态的流表。
协议规则
dpctl流表直接下到内核,可直接走fastpath
ofctl流表下到用户态,会在内核态缓存,先走fastpath,查询未果后走slowpath
ct流表下发到用户态和内核态,先走fastpath,查询未果后走slowpath
;