OpenCloudOS-Kernel/net/openvswitch/flow_table.h

/*
 * Copyright (c) 2007-2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#ifndef FLOW_TABLE_H
#define FLOW_TABLE_H 1

#include <linux/kernel.h>
#include <linux/netlink.h>
#include <linux/openvswitch.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/if_ether.h>
#include <linux/in6.h>
#include <linux/jiffies.h>
#include <linux/time.h>
#include <linux/flex_array.h>

#include <net/inet_ecn.h>
#include <net/ip_tunnels.h>

#include "flow.h"

/*
 * One hash-table instance of the flow table.  struct flow_table refers to
 * its current instance through an __rcu-annotated pointer.
 *
 * NOTE(review): the per-member notes below are inferred from member names
 * and types only; the code that uses them is outside this header -- confirm
 * against flow_table.c.
 */
struct table_instance {
	/* Flex array; presumably holds the hash buckets. */
	struct flex_array *buckets;
	/* Presumably the number of buckets in @buckets. */
	unsigned int n_buckets;
	/* RCU callback head; presumably used to free the instance after a grace period. */
	struct rcu_head rcu;
	/* NOTE(review): looks like a selector for which per-flow hash node is in use -- confirm. */
	int node_ver;
	/* Presumably the seed mixed into the bucket hash. */
	u32 hash_seed;
	/* NOTE(review): presumably tells teardown to leave the flows themselves alone -- confirm. */
	bool keep_flows;
};

/*
 * A flow table: a pointer to the current table_instance plus state that is
 * kept outside of that instance.
 */
struct flow_table {
	/* Current hash-table instance; __rcu-annotated, so it is meant to be accessed via the RCU accessors. */
	struct table_instance __rcu *ti;
	/* List head; NOTE(review): presumably links the flow masks counted by ovs_flow_tbl_num_masks() -- confirm. */
	struct list_head mask_list;
	/* NOTE(review): presumably the jiffies value recorded at the last rehash -- confirm. */
	unsigned long last_rehash;
	/* NOTE(review): presumably the number of flows, as reported by ovs_flow_tbl_count() -- confirm. */
	unsigned int count;
};

/*
 * kmem_cache handle, declared here and defined elsewhere.
 * NOTE(review): presumably backs the allocation of per-flow stats
 * instances -- confirm in flow_table.c.
 */
extern struct kmem_cache *flow_stats_cache;

/*
 * Setup and teardown for the flow code.
 * NOTE(review): ovs_flow_init() presumably returns 0 on success and a
 * negative errno on failure -- confirm against the definition.
 */
int ovs_flow_init(void);
void ovs_flow_exit(void);

/*
 * Flow object allocation and release.
 *
 * ovs_flow_alloc() returns a pointer to a struct sw_flow.
 * NOTE(review): the failure convention (NULL vs. ERR_PTR) is not visible in
 * this header -- check the definition before testing the result.
 *
 * ovs_flow_free() takes the flow to release in @flow.
 * NOTE(review): @deferred presumably selects freeing after an RCU grace
 * period rather than immediately -- confirm.
 */
struct sw_flow *ovs_flow_alloc(void);
void ovs_flow_free(struct sw_flow *flow, bool deferred);

/*
 * Flow table lifecycle and size.
 *
 * All four operate on a caller-provided struct flow_table.
 * ovs_flow_tbl_count() takes the table as const and returns an int.
 * NOTE(review): ovs_flow_tbl_init() and ovs_flow_tbl_flush() presumably
 * return 0 on success and a negative errno on failure -- confirm against the
 * definitions.
 */
int ovs_flow_tbl_init(struct flow_table *table);
int ovs_flow_tbl_count(const struct flow_table *table);
void ovs_flow_tbl_destroy(struct flow_table *table);
int ovs_flow_tbl_flush(struct flow_table *flow_table);

/*
 * Insertion and removal of a flow in @table.
 * NOTE(review): ovs_flow_tbl_insert() presumably returns 0 on success and a
 * negative errno on failure -- confirm against the definition.
 */
int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
			const struct sw_flow_mask *mask);
void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);

/* Returns an int; by its name, the number of masks held by @table. */
int ovs_flow_tbl_num_masks(const struct flow_table *table);

/*
 * Iteration over one table instance.  @bucket and @idx are passed by
 * pointer, so the callee can update the caller's cursor.
 * NOTE(review): presumably returns NULL once the table is exhausted --
 * confirm.
 */
struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table,
				       u32 *bucket, u32 *idx);

/*
 * Lookups by flow key.  The _stats variant additionally takes @n_mask_hit
 * as an output pointer.
 * NOTE(review): both presumably return NULL when nothing matches -- confirm.
 */
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
					  const struct sw_flow_key *key,
					  u32 *n_mask_hit);
struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
				    const struct sw_flow_key *key);

/*
 * Lookup and comparison driven by a struct sw_flow_match instead of a bare
 * key.
 */
struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
					  const struct sw_flow_match *match);
bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
			       const struct sw_flow_match *match);

/*
 * Writes into @dst; @src and @mask are const inputs.
 * NOTE(review): presumably stores @src with @mask applied into @dst --
 * confirm against the definition.
 */
void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
		       const struct sw_flow_mask *mask);
#endif /* flow_table.h */
openvswitch: Restructure datapath.c and flow.c Over the time datapath.c and flow.c has became pretty large files. Following patch restructures functionality of component into three different components: flow.c: contains flow extract. flow_netlink.c: netlink flow api. flow_table.c: flow table api. This patch restructures code without changing logic. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 09:16:47 +08:00			`/*`
			`* Copyright (c) 2007-2013 Nicira, Inc.`
			`*`
			`* This program is free software; you can redistribute it and/or`
			`* modify it under the terms of version 2 of the GNU General Public`
			`* License as published by the Free Software Foundation.`
			`*`
			`* This program is distributed in the hope that it will be useful, but`
			`* WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA`
			`* 02110-1301, USA`
			`*/`

			`#ifndef FLOW_TABLE_H`
			`#define FLOW_TABLE_H 1`

			`#include <linux/kernel.h>`
			`#include <linux/netlink.h>`
			`#include <linux/openvswitch.h>`
			`#include <linux/spinlock.h>`
			`#include <linux/types.h>`
			`#include <linux/rcupdate.h>`
			`#include <linux/if_ether.h>`
			`#include <linux/in6.h>`
			`#include <linux/jiffies.h>`
			`#include <linux/time.h>`
			`#include <linux/flex_array.h>`

			`#include <net/inet_ecn.h>`
			`#include <net/ip_tunnels.h>`

			`#include "flow.h"`

openvswitch: Move mega-flow list out of rehashing struct. ovs-flow rehash does not touch mega flow list. Following patch moves it dp struct datapath. Avoid one extra indirection for accessing mega-flow list head on every packet receive. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 15:14:23 +08:00			`struct table_instance {`
openvswitch: Restructure datapath.c and flow.c Over the time datapath.c and flow.c has became pretty large files. Following patch restructures functionality of component into three different components: flow.c: contains flow extract. flow_netlink.c: netlink flow api. flow_table.c: flow table api. This patch restructures code without changing logic. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 09:16:47 +08:00			`struct flex_array *buckets;`
openvswitch: Move mega-flow list out of rehashing struct. ovs-flow rehash does not touch mega flow list. Following patch moves it dp struct datapath. Avoid one extra indirection for accessing mega-flow list head on every packet receive. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 15:14:23 +08:00			`unsigned int n_buckets;`
openvswitch: Restructure datapath.c and flow.c Over the time datapath.c and flow.c has became pretty large files. Following patch restructures functionality of component into three different components: flow.c: contains flow extract. flow_netlink.c: netlink flow api. flow_table.c: flow table api. This patch restructures code without changing logic. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 09:16:47 +08:00			`struct rcu_head rcu;`
			`int node_ver;`
			`u32 hash_seed;`
			`bool keep_flows;`
			`};`

openvswitch: Move mega-flow list out of rehashing struct. ovs-flow rehash does not touch mega flow list. Following patch moves it dp struct datapath. Avoid one extra indirection for accessing mega-flow list head on every packet receive. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 15:14:23 +08:00			`struct flow_table {`
			`struct table_instance __rcu *ti;`
			`struct list_head mask_list;`
			`unsigned long last_rehash;`
			`unsigned int count;`
			`};`

openvswitch: Per NUMA node flow stats. Keep kernel flow stats for each NUMA node rather than each (logical) CPU. This avoids using the per-CPU allocator and removes most of the kernel-side OVS locking overhead otherwise on the top of perf reports and allows OVS to scale better with higher number of threads. With 9 handlers and 4 revalidators netperf TCP_CRR test flow setup rate doubles on a server with two hyper-threaded physical CPUs (16 logical cores each) compared to the current OVS master. Tested with non-trivial flow table with a TCP port match rule forcing all new connections with unique port numbers to OVS userspace. The IP addresses are still wildcarded, so the kernel flows are not considered as exact match 5-tuple flows. This type of flows can be expected to appear in large numbers as the result of more effective wildcarding made possible by improvements in OVS userspace flow classifier. Perf results for this test (master): Events: 305K cycles + 8.43% ovs-vswitchd [kernel.kallsyms] [k] mutex_spin_on_owner + 5.64% ovs-vswitchd [kernel.kallsyms] [k] __ticket_spin_lock + 4.75% ovs-vswitchd ovs-vswitchd [.] find_match_wc + 3.32% ovs-vswitchd libpthread-2.15.so [.] pthread_mutex_lock + 2.61% ovs-vswitchd [kernel.kallsyms] [k] pcpu_alloc_area + 2.19% ovs-vswitchd ovs-vswitchd [.] flow_hash_in_minimask_range + 2.03% swapper [kernel.kallsyms] [k] intel_idle + 1.84% ovs-vswitchd libpthread-2.15.so [.] pthread_mutex_unlock + 1.64% ovs-vswitchd ovs-vswitchd [.] classifier_lookup + 1.58% ovs-vswitchd libc-2.15.so [.] 0x7f4e6 + 1.07% ovs-vswitchd [kernel.kallsyms] [k] memset + 1.03% netperf [kernel.kallsyms] [k] __ticket_spin_lock + 0.92% swapper [kernel.kallsyms] [k] __ticket_spin_lock ... And after this patch: Events: 356K cycles + 6.85% ovs-vswitchd ovs-vswitchd [.] find_match_wc + 4.63% ovs-vswitchd libpthread-2.15.so [.] pthread_mutex_lock + 3.06% ovs-vswitchd [kernel.kallsyms] [k] __ticket_spin_lock + 2.81% ovs-vswitchd ovs-vswitchd [.] 
flow_hash_in_minimask_range + 2.51% ovs-vswitchd libpthread-2.15.so [.] pthread_mutex_unlock + 2.27% ovs-vswitchd ovs-vswitchd [.] classifier_lookup + 1.84% ovs-vswitchd libc-2.15.so [.] 0x15d30f + 1.74% ovs-vswitchd [kernel.kallsyms] [k] mutex_spin_on_owner + 1.47% swapper [kernel.kallsyms] [k] intel_idle + 1.34% ovs-vswitchd ovs-vswitchd [.] flow_hash_in_minimask + 1.33% ovs-vswitchd ovs-vswitchd [.] rule_actions_unref + 1.16% ovs-vswitchd ovs-vswitchd [.] hindex_node_with_hash + 1.16% ovs-vswitchd ovs-vswitchd [.] do_xlate_actions + 1.09% ovs-vswitchd ovs-vswitchd [.] ofproto_rule_ref + 1.01% netperf [kernel.kallsyms] [k] __ticket_spin_lock ... There is a small increase in kernel spinlock overhead due to the same spinlock being shared between multiple cores of the same physical CPU, but that is barely visible in the netperf TCP_CRR test performance (maybe ~1% performance drop, hard to tell exactly due to variance in the test results), when testing for kernel module throughput (with no userspace activity, handful of kernel flows). On flow setup, a single stats instance is allocated (for the NUMA node 0). As CPUs from multiple NUMA nodes start updating stats, new NUMA-node specific stats instances are allocated. This allocation on the packet processing code path is made to never block or look for emergency memory pools, minimizing the allocation latency. If the allocation fails, the existing preallocated stats instance is used. Also, if only CPUs from one NUMA-node are updating the preallocated stats instance, no additional stats instances are allocated. This eliminates the need to pre-allocate stats instances that will not be used, also relieving the stats reader from the burden of reading stats that are never used. Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com> Acked-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2014-03-28 03:42:54 +08:00			`extern struct kmem_cache *flow_stats_cache;`

openvswitch: Restructure datapath.c and flow.c Over the time datapath.c and flow.c has became pretty large files. Following patch restructures functionality of component into three different components: flow.c: contains flow extract. flow_netlink.c: netlink flow api. flow_table.c: flow table api. This patch restructures code without changing logic. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 09:16:47 +08:00			`int ovs_flow_init(void);`
			`void ovs_flow_exit(void);`

openvswitch: Remove 5-tuple optimization. The 5-tuple optimization becomes unnecessary with a later per-NUMA node stats patch. Remove it first to make the changes easier to grasp. Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2014-03-28 03:35:23 +08:00			`struct sw_flow *ovs_flow_alloc(void);`
openvswitch: Restructure datapath.c and flow.c Over the time datapath.c and flow.c has became pretty large files. Following patch restructures functionality of component into three different components: flow.c: contains flow extract. flow_netlink.c: netlink flow api. flow_table.c: flow table api. This patch restructures code without changing logic. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 09:16:47 +08:00			`void ovs_flow_free(struct sw_flow *, bool deferred);`

openvswitch: Move mega-flow list out of rehashing struct. ovs-flow rehash does not touch mega flow list. Following patch moves it dp struct datapath. Avoid one extra indirection for accessing mega-flow list head on every packet receive. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 15:14:23 +08:00			`int ovs_flow_tbl_init(struct flow_table *);`
openvswitch: Constify various function arguments Help produce better optimized code. Signed-off-by: Thomas Graf <tgraf@noironetworks.com> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> 2014-11-06 22:58:52 +08:00			`int ovs_flow_tbl_count(const struct flow_table *table);`
openvswitch: Move table destroy to dp-rcu callback. This simplifies flow-table-destroy API. No need to pass explicit parameter about context. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Acked-by: Thomas Graf <tgraf@redhat.com> 2014-05-07 09:41:20 +08:00			`void ovs_flow_tbl_destroy(struct flow_table *table);`
openvswitch: Move mega-flow list out of rehashing struct. ovs-flow rehash does not touch mega flow list. Following patch moves it dp struct datapath. Avoid one extra indirection for accessing mega-flow list head on every packet receive. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 15:14:23 +08:00			`int ovs_flow_tbl_flush(struct flow_table *flow_table);`
openvswitch: Restructure datapath.c and flow.c Over the time datapath.c and flow.c has became pretty large files. Following patch restructures functionality of component into three different components: flow.c: contains flow extract. flow_netlink.c: netlink flow api. flow_table.c: flow table api. This patch restructures code without changing logic. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 09:16:47 +08:00
openvswitch: Simplify mega-flow APIs. Hides mega-flow implementation in flow_table.c rather than datapath.c. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 15:17:42 +08:00			`int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,`
openvswitch: Constify various function arguments Help produce better optimized code. Signed-off-by: Thomas Graf <tgraf@noironetworks.com> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> 2014-11-06 22:58:52 +08:00			`const struct sw_flow_mask *mask);`
openvswitch: Restructure datapath.c and flow.c Over the time datapath.c and flow.c has became pretty large files. Following patch restructures functionality of component into three different components: flow.c: contains flow extract. flow_netlink.c: netlink flow api. flow_table.c: flow table api. This patch restructures code without changing logic. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 09:16:47 +08:00			`void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);`
openvswitch: collect mega flow mask stats Collect mega flow mask stats. ovs-dpctl show command can be used to display them for debugging and performance tuning. Signed-off-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-23 01:42:46 +08:00			`int ovs_flow_tbl_num_masks(const struct flow_table *table);`
openvswitch: Move mega-flow list out of rehashing struct. ovs-flow rehash does not touch mega flow list. Following patch moves it dp struct datapath. Avoid one extra indirection for accessing mega-flow list head on every packet receive. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 15:14:23 +08:00			`struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table,`
openvswitch: Restructure datapath.c and flow.c Over the time datapath.c and flow.c has became pretty large files. Following patch restructures functionality of component into three different components: flow.c: contains flow extract. flow_netlink.c: netlink flow api. flow_table.c: flow table api. This patch restructures code without changing logic. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 09:16:47 +08:00			`u32 *bucket, u32 *idx);`
openvswitch: Change ovs_flow_tbl_lookup_xx() APIs API changes only for code readability. No functional changes. This patch removes the underscored version. Added a new API ovs_flow_tbl_lookup_stats() that returns the n_mask_hits. Reported by: Ben Pfaff <blp@nicira.com> Reviewed-by: Thomas Graf <tgraf@redhat.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-11-26 02:42:46 +08:00			`struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *,`
openvswitch: collect mega flow mask stats Collect mega flow mask stats. ovs-dpctl show command can be used to display them for debugging and performance tuning. Signed-off-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-23 01:42:46 +08:00			`const struct sw_flow_key *,`
			`u32 *n_mask_hit);`
openvswitch: Change ovs_flow_tbl_lookup_xx() APIs API changes only for code readability. No functional changes. This patch removes the underscored version. Added a new API ovs_flow_tbl_lookup_stats() that returns the n_mask_hits. Reported by: Ben Pfaff <blp@nicira.com> Reviewed-by: Thomas Graf <tgraf@redhat.com> Signed-off-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-11-26 02:42:46 +08:00			`struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *,`
			`const struct sw_flow_key *);`
openvswitch: Use exact lookup for flow_get and flow_del. Due to the race condition in userspace, there is chance that two overlapping megaflows could be installed in datapath. And this causes userspace unable to delete the less inclusive megaflow flow even after it timeout, since the flow_del logic will stop at the first match of masked flow. This commit fixes the bug by making the kernel flow_del and flow_get logic check all masks in that case. Introduced by 03f0d916a (openvswitch: Mega flow implementation). Signed-off-by: Alex Wang <alexw@nicira.com> Acked-by: Andy Zhou <azhou@nicira.com> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> 2014-07-01 11:30:29 +08:00			`struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,`
openvswitch: Constify various function arguments Help produce better optimized code. Signed-off-by: Thomas Graf <tgraf@noironetworks.com> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> 2014-11-06 22:58:52 +08:00			`const struct sw_flow_match *match);`
openvswitch: Restructure datapath.c and flow.c Over the time datapath.c and flow.c has became pretty large files. Following patch restructures functionality of component into three different components: flow.c: contains flow extract. flow_netlink.c: netlink flow api. flow_table.c: flow table api. This patch restructures code without changing logic. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 09:16:47 +08:00			`bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,`
openvswitch: Constify various function arguments Help produce better optimized code. Signed-off-by: Thomas Graf <tgraf@noironetworks.com> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> 2014-11-06 22:58:52 +08:00			`const struct sw_flow_match *match);`
openvswitch: Restructure datapath.c and flow.c Over the time datapath.c and flow.c has became pretty large files. Following patch restructures functionality of component into three different components: flow.c: contains flow extract. flow_netlink.c: netlink flow api. flow_table.c: flow table api. This patch restructures code without changing logic. Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: Jesse Gross <jesse@nicira.com> 2013-10-04 09:16:47 +08:00
			`void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,`
			`const struct sw_flow_mask *mask);`
			`#endif /* flow_table.h */`