2019-05-19 21:51:35 +08:00
// SPDX-License-Identifier: GPL-2.0-or-later
2005-04-17 06:20:36 +08:00
/*
* Copyright ( c ) 1999 - 2004 Intel Corporation . All rights reserved .
*/
# include <linux/skbuff.h>
# include <linux/netdevice.h>
# include <linux/etherdevice.h>
# include <linux/pkt_sched.h>
# include <linux/spinlock.h>
# include <linux/slab.h>
# include <linux/timer.h>
# include <linux/ip.h>
# include <linux/ipv6.h>
# include <linux/if_arp.h>
# include <linux/if_ether.h>
# include <linux/if_bonding.h>
# include <linux/if_vlan.h>
# include <linux/in.h>
# include <net/arp.h>
2008-08-29 03:38:41 +08:00
# include <net/ipv6.h>
2005-04-17 06:20:36 +08:00
# include <asm/byteorder.h>
2014-11-11 02:27:49 +08:00
# include <net/bonding.h>
# include <net/bond_alb.h>
2005-04-17 06:20:36 +08:00
2016-06-30 22:13:41 +08:00
/* MAC address 33:33:00:00:00:01 (named as the IPv6 all-multicast address).
 * The array is declared two bytes longer than ETH_ALEN and __long_aligned;
 * the bytes after the address are implicitly zero.
 * NOTE(review): the padding/alignment presumably exists so the address can
 * be handed to the 64-bit ethernet address compare helpers - confirm.
 */
static const u8 mac_v6_allmcast[ETH_ALEN + 2] __long_aligned = {
	0x33, 0x33, 0x00, 0x00, 0x00, 0x01,
};
2005-04-17 06:20:36 +08:00
/* Length of one ALB timer period: HZ divided by ALB_TIMER_TICKS_PER_SEC. */
static const int alb_delta_in_ticks = HZ / ALB_TIMER_TICKS_PER_SEC ;
# pragma pack(1)
struct learning_pkt {
u8 mac_dst [ ETH_ALEN ] ;
u8 mac_src [ ETH_ALEN ] ;
2007-08-23 08:06:58 +08:00
__be16 type ;
2005-04-17 06:20:36 +08:00
u8 padding [ ETH_ZLEN - ETH_HLEN ] ;
} ;
struct arp_pkt {
2007-08-23 08:06:58 +08:00
__be16 hw_addr_space ;
__be16 prot_addr_space ;
2005-04-17 06:20:36 +08:00
u8 hw_addr_len ;
u8 prot_addr_len ;
2007-08-23 08:06:58 +08:00
__be16 op_code ;
2005-04-17 06:20:36 +08:00
u8 mac_src [ ETH_ALEN ] ; /* sender hardware address */
2007-08-23 08:06:58 +08:00
__be32 ip_src ; /* sender IP address */
2005-04-17 06:20:36 +08:00
u8 mac_dst [ ETH_ALEN ] ; /* target hardware address */
2007-08-23 08:06:58 +08:00
__be32 ip_dst ; /* target IP address */
2005-04-17 06:20:36 +08:00
} ;
# pragma pack()
/* Forward declaration */
2021-10-23 07:20:59 +08:00
static void alb_send_learning_packets ( struct slave * slave , const u8 mac_addr [ ] ,
2014-05-22 01:19:48 +08:00
bool strict_match ) ;
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
In certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
/* Prototypes for the helpers that maintain the rx hash table's
 * reverse (by source IP) lists.
 */
static void rlb_purge_src_ip(struct bonding *bond, struct arp_pkt *arp);
static void rlb_src_unlink(struct bonding *bond, u32 index);
static void rlb_src_link(struct bonding *bond, u32 ip_src_hash,
			 u32 ip_dst_hash);
2005-04-17 06:20:36 +08:00
2007-04-21 13:47:35 +08:00
/* XOR together the first @hash_size bytes at @hash_start and return the
 * resulting single byte. A non-positive @hash_size yields 0.
 */
static inline u8 _simple_hash(const u8 *hash_start, int hash_size)
{
	u8 folded = 0;
	int pos = 0;

	while (pos < hash_size)
		folded ^= hash_start[pos++];

	return folded;
}
/*********************** tlb specific functions ***************************/
static inline void tlb_init_table_entry ( struct tlb_client_info * entry , int save_load )
{
if ( save_load ) {
entry - > load_history = 1 + entry - > tx_bytes /
BOND_TLB_REBALANCE_INTERVAL ;
entry - > tx_bytes = 0 ;
}
entry - > tx_slave = NULL ;
entry - > next = TLB_NULL_INDEX ;
entry - > prev = TLB_NULL_INDEX ;
}
static inline void tlb_init_slave ( struct slave * slave )
{
SLAVE_TLB_INFO ( slave ) . load = 0 ;
SLAVE_TLB_INFO ( slave ) . head = TLB_NULL_INDEX ;
}
2012-01-09 20:01:37 +08:00
static void __tlb_clear_slave ( struct bonding * bond , struct slave * slave ,
int save_load )
2005-04-17 06:20:36 +08:00
{
struct tlb_client_info * tx_hash_table ;
u32 index ;
/* clear slave from tx_hashtbl */
tx_hash_table = BOND_ALB_INFO ( bond ) . tx_hashtbl ;
2008-10-31 08:41:16 +08:00
/* skip this if we've already freed the tx hash table */
if ( tx_hash_table ) {
index = SLAVE_TLB_INFO ( slave ) . head ;
while ( index ! = TLB_NULL_INDEX ) {
u32 next_index = tx_hash_table [ index ] . next ;
2021-05-20 14:18:32 +08:00
2008-10-31 08:41:16 +08:00
tlb_init_table_entry ( & tx_hash_table [ index ] , save_load ) ;
index = next_index ;
}
2005-04-17 06:20:36 +08:00
}
tlb_init_slave ( slave ) ;
2012-01-09 20:01:37 +08:00
}
2006-01-10 04:14:00 +08:00
2012-01-09 20:01:37 +08:00
static void tlb_clear_slave ( struct bonding * bond , struct slave * slave ,
int save_load )
{
2014-09-12 04:49:26 +08:00
spin_lock_bh ( & bond - > mode_lock ) ;
2012-01-09 20:01:37 +08:00
__tlb_clear_slave ( bond , slave , save_load ) ;
2014-09-12 04:49:26 +08:00
spin_unlock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
}
/* Must be called before starting the monitor timer */
static int tlb_initialize ( struct bonding * bond )
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
int size = TLB_HASH_TABLE_SIZE * sizeof ( struct tlb_client_info ) ;
2005-11-10 02:35:30 +08:00
struct tlb_client_info * new_hashtbl ;
2005-04-17 06:20:36 +08:00
int i ;
2007-02-07 06:16:40 +08:00
new_hashtbl = kzalloc ( size , GFP_KERNEL ) ;
2012-01-29 20:56:23 +08:00
if ( ! new_hashtbl )
2016-02-07 13:26:25 +08:00
return - ENOMEM ;
2012-01-29 20:56:23 +08:00
2014-09-12 04:49:26 +08:00
spin_lock_bh ( & bond - > mode_lock ) ;
2005-11-10 02:35:30 +08:00
bond_info - > tx_hashtbl = new_hashtbl ;
2005-04-17 06:20:36 +08:00
2014-02-14 17:15:14 +08:00
for ( i = 0 ; i < TLB_HASH_TABLE_SIZE ; i + + )
2011-04-08 11:40:19 +08:00
tlb_init_table_entry ( & bond_info - > tx_hashtbl [ i ] , 0 ) ;
2005-04-17 06:20:36 +08:00
2014-09-12 04:49:26 +08:00
spin_unlock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
return 0 ;
}
/* Must be called only after all slaves have been released */
static void tlb_deinitialize ( struct bonding * bond )
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
2014-09-12 04:49:26 +08:00
spin_lock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
kfree ( bond_info - > tx_hashtbl ) ;
bond_info - > tx_hashtbl = NULL ;
2014-09-12 04:49:26 +08:00
spin_unlock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
}
2010-05-19 11:26:39 +08:00
static long long compute_gap ( struct slave * slave )
{
return ( s64 ) ( slave - > speed < < 20 ) - /* Convert to Megabit per sec */
( s64 ) ( SLAVE_TLB_INFO ( slave ) . load < < 3 ) ; /* Bytes to bits */
}
2005-04-17 06:20:36 +08:00
static struct slave * tlb_get_least_loaded_slave ( struct bonding * bond )
{
struct slave * slave , * least_loaded ;
2013-09-25 15:20:14 +08:00
struct list_head * iter ;
2010-05-19 11:26:39 +08:00
long long max_gap ;
2005-04-17 06:20:36 +08:00
2010-05-19 11:26:39 +08:00
least_loaded = NULL ;
max_gap = LLONG_MIN ;
2005-04-17 06:20:36 +08:00
/* Find the slave with the largest gap */
2013-10-15 16:28:39 +08:00
bond_for_each_slave_rcu ( bond , slave , iter ) {
2014-05-16 03:39:59 +08:00
if ( bond_slave_can_tx ( slave ) ) {
2010-05-19 11:26:39 +08:00
long long gap = compute_gap ( slave ) ;
2005-04-17 06:20:36 +08:00
if ( max_gap < gap ) {
least_loaded = slave ;
max_gap = gap ;
}
}
}
return least_loaded ;
}
2012-01-09 20:01:37 +08:00
/* Pick the transmit slave for tx hash bucket @hash_index and charge
 * @skb_len bytes to that bucket.
 *
 * If the bucket has no slave yet, the least loaded slave is chosen, the
 * bucket is pushed onto the head of that slave's entry list and the
 * bucket's load_history is added to the slave's load.
 *
 * Returns the slave to use, or NULL when no slave is available (in which
 * case nothing is modified). Does no locking itself; tlb_choose_channel()
 * wraps this call in bond->mode_lock.
 */
static struct slave *__tlb_choose_channel(struct bonding *bond, u32 hash_index,
					  u32 skb_len)
{
	struct tlb_client_info *table = BOND_ALB_INFO(bond).tx_hashtbl;
	struct tlb_client_info *entry = &table[hash_index];
	struct slave *chosen = entry->tx_slave;

	if (!chosen) {
		struct tlb_slave_info *info;
		u32 old_head;

		chosen = tlb_get_least_loaded_slave(bond);
		if (!chosen)
			return NULL;

		info = &SLAVE_TLB_INFO(chosen);
		old_head = info->head;

		/* insert this bucket in front of the slave's current list */
		entry->tx_slave = chosen;
		entry->next = old_head;
		entry->prev = TLB_NULL_INDEX;
		if (old_head != TLB_NULL_INDEX)
			table[old_head].prev = hash_index;
		info->head = hash_index;

		info->load += entry->load_history;
	}

	entry->tx_bytes += skb_len;

	return chosen;
}
2012-01-09 20:01:37 +08:00
/* Locked wrapper around __tlb_choose_channel().
 *
 * Plain spin_lock() is used rather than the _bh variant: softirqs need
 * not be disabled here because tlb_choose_channel() is only called by
 * bond_alb_xmit(), which already runs with softirqs disabled.
 */
static struct slave *tlb_choose_channel(struct bonding *bond, u32 hash_index,
					u32 skb_len)
{
	struct slave *chosen;

	spin_lock(&bond->mode_lock);
	chosen = __tlb_choose_channel(bond, hash_index, skb_len);
	spin_unlock(&bond->mode_lock);

	return chosen;
}
2005-04-17 06:20:36 +08:00
/*********************** rlb specific functions ***************************/
2012-01-09 20:01:37 +08:00
2005-04-17 06:20:36 +08:00
/* when an ARP REPLY is received from a client update its info
* in the rx_hashtbl
*/
static void rlb_update_entry_from_arp ( struct bonding * bond , struct arp_pkt * arp )
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
struct rlb_client_info * client_info ;
u32 hash_index ;
2014-09-12 04:49:26 +08:00
spin_lock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
2014-02-14 17:15:12 +08:00
hash_index = _simple_hash ( ( u8 * ) & ( arp - > ip_src ) , sizeof ( arp - > ip_src ) ) ;
2005-04-17 06:20:36 +08:00
client_info = & ( bond_info - > rx_hashtbl [ hash_index ] ) ;
if ( ( client_info - > assigned ) & &
( client_info - > ip_src = = arp - > ip_dst ) & &
bonding: check if clients MAC addr has changed
When two systems using bonding devices in adaptive load
balancing (ALB) communicates with each other, an endless
ping-pong of ARP replies starts between these two systems.
What happens? In the ALB mode, bonding driver keeps track
of each client connected in a hash table, so it can do the
receive load balancing (RLB). This hash table is updated
when an ARP reply is received, then it scans for the client
entry, updates its MAC address and flag it to be announced
later. Therefore, two seconds later, the alb monitor runs
and send for each updated client entry two ARP replies
updating this specific client. The same process happens on
the receiving system, causing the endless ping-pong of arp
replies.
See more information including the relevant functions below:
System 1 System 2
bond0 bond0
ping <system2>
ARP request --------->
<--------- ARP reply
+->rlb_arp_recv <---------------------+ <--- loop begins
| rlb_update_entry_from_arp |
| client_info->ntt = 1; |
| bond_info->rx_ntt = 1; |
| |
| <communication succeed> |
| |
| bond_alb_monitor |
| rlb_update_rx_clients |
| rlb_update_client |
| arp_create(ARPOP_REPLY) |
| send ARP reply --------------> V
| send ARP reply -------------->
| rlb_arp_recv
| rlb_update_entry_from_arp
| client_info->ntt = 1;
| bond_info->rx_ntt = 1;
| < snipped, same as in system 1>
+------- <-------------- send ARP reply
<-------------- send ARP reply
Besides the unneeded networking traffic, this loop breaks
a cluster because a backup system can't take over the IP
address. There is always one system sending an ARP reply
poisoning the network.
This patch fixes the problem adding a check for the MAC
address before updating it. Thus, if the MAC address didn't
change, there is no need to update neither to announce it later.
Signed-off-by: Flavio Leitner <fleitner@redhat.com>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-29 16:24:39 +08:00
( client_info - > ip_dst = = arp - > ip_src ) & &
net, drivers/net: Convert compare_ether_addr_64bits to ether_addr_equal_64bits
Use the new bool function ether_addr_equal_64bits to add
some clarity and reduce the likelihood for misuse of
compare_ether_addr_64bits for sorting.
Done via cocci script:
$ cat compare_ether_addr_64bits.cocci
@@
expression a,b;
@@
- !compare_ether_addr_64bits(a, b)
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- compare_ether_addr_64bits(a, b)
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !ether_addr_equal_64bits(a, b) == 0
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !ether_addr_equal_64bits(a, b) != 0
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- ether_addr_equal_64bits(a, b) == 0
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- ether_addr_equal_64bits(a, b) != 0
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !!ether_addr_equal_64bits(a, b)
+ ether_addr_equal_64bits(a, b)
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-10 01:04:04 +08:00
( ! ether_addr_equal_64bits ( client_info - > mac_dst , arp - > mac_src ) ) ) {
2005-04-17 06:20:36 +08:00
/* update the clients MAC address */
2014-02-16 08:02:17 +08:00
ether_addr_copy ( client_info - > mac_dst , arp - > mac_src ) ;
2005-04-17 06:20:36 +08:00
client_info - > ntt = 1 ;
bond_info - > rx_ntt = 1 ;
}
2014-09-12 04:49:26 +08:00
spin_unlock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
}
2012-06-12 03:23:07 +08:00
static int rlb_arp_recv ( const struct sk_buff * skb , struct bonding * bond ,
struct slave * slave )
2005-04-17 06:20:36 +08:00
{
2012-06-12 03:23:07 +08:00
struct arp_pkt * arp , _arp ;
2005-04-17 06:20:36 +08:00
2011-04-19 11:48:16 +08:00
if ( skb - > protocol ! = cpu_to_be16 ( ETH_P_ARP ) )
2012-05-14 03:45:13 +08:00
goto out ;
2005-04-17 06:20:36 +08:00
2012-06-12 03:23:07 +08:00
arp = skb_header_pointer ( skb , 0 , sizeof ( _arp ) , & _arp ) ;
if ( ! arp )
2012-05-14 03:45:13 +08:00
goto out ;
2005-04-17 06:20:36 +08:00
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
/* We received an ARP from arp->ip_src.
* We might have used this IP address previously ( on the bonding host
* itself or on a system that is bridged together with the bond ) .
* However , if arp - > mac_src is different than what is stored in
* rx_hashtbl , some other host is now using the IP and we must prevent
* sending out client updates with this IP address and the old MAC
* address .
* Clean up all hash table entries that have this address as ip_src but
* have a different mac_src .
*/
rlb_purge_src_ip ( bond , arp ) ;
2005-04-17 06:20:36 +08:00
if ( arp - > op_code = = htons ( ARPOP_REPLY ) ) {
/* update rx hash table for this ARP */
rlb_update_entry_from_arp ( bond , arp ) ;
2019-06-07 22:59:31 +08:00
slave_dbg ( bond - > dev , slave - > dev , " Server received an ARP Reply from client \n " ) ;
2005-04-17 06:20:36 +08:00
}
2012-05-14 03:45:13 +08:00
out :
return RX_HANDLER_ANOTHER ;
2005-04-17 06:20:36 +08:00
}
2014-09-15 23:19:33 +08:00
/* Caller must hold rcu_read_lock() */
static struct slave * __rlb_next_rx_slave ( struct bonding * bond )
2005-04-17 06:20:36 +08:00
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
2013-09-25 15:20:17 +08:00
struct slave * before = NULL , * rx_slave = NULL , * slave ;
struct list_head * iter ;
bool found = false ;
2005-04-17 06:20:36 +08:00
2014-09-15 23:19:33 +08:00
bond_for_each_slave_rcu ( bond , slave , iter ) {
2014-05-16 03:39:59 +08:00
if ( ! bond_slave_can_tx ( slave ) )
2013-09-25 15:20:17 +08:00
continue ;
if ( ! found ) {
if ( ! before | | before - > speed < slave - > speed )
before = slave ;
} else {
if ( ! rx_slave | | rx_slave - > speed < slave - > speed )
2005-04-17 06:20:36 +08:00
rx_slave = slave ;
}
2013-09-25 15:20:17 +08:00
if ( slave = = bond_info - > rx_slave )
found = true ;
2005-04-17 06:20:36 +08:00
}
2013-09-25 15:20:17 +08:00
/* we didn't find anything after the current or we have something
* better before and up to the current slave
*/
if ( ! rx_slave | | ( before & & rx_slave - > speed < before - > speed ) )
rx_slave = before ;
2005-04-17 06:20:36 +08:00
2013-09-25 15:20:17 +08:00
if ( rx_slave )
bond_info - > rx_slave = rx_slave ;
2005-04-17 06:20:36 +08:00
return rx_slave ;
}
2014-09-15 23:19:33 +08:00
/* RTNL-context wrapper for __rlb_next_rx_slave().
 *
 * Caller must hold RTNL (asserted). The RCU read lock is taken only to
 * silence checkers.
 */
static struct slave *rlb_next_rx_slave(struct bonding *bond)
{
	struct slave *next;

	ASSERT_RTNL();

	rcu_read_lock();
	next = __rlb_next_rx_slave(bond);
	rcu_read_unlock();

	return next;
}
2005-04-17 06:20:36 +08:00
/* teach the switch the mac of a disabled slave
* on the primary for fault tolerance
*
2014-09-12 04:49:23 +08:00
* Caller must hold RTNL
2005-04-17 06:20:36 +08:00
*/
2021-10-23 07:20:59 +08:00
static void rlb_teach_disabled_mac_on_primary ( struct bonding * bond ,
const u8 addr [ ] )
2005-04-17 06:20:36 +08:00
{
2014-09-12 04:49:24 +08:00
struct slave * curr_active = rtnl_dereference ( bond - > curr_active_slave ) ;
2014-07-15 21:56:55 +08:00
if ( ! curr_active )
2005-04-17 06:20:36 +08:00
return ;
if ( ! bond - > alb_info . primary_is_promisc ) {
2014-07-15 21:56:55 +08:00
if ( ! dev_set_promiscuity ( curr_active - > dev , 1 ) )
2008-07-15 11:51:36 +08:00
bond - > alb_info . primary_is_promisc = 1 ;
else
bond - > alb_info . primary_is_promisc = 0 ;
2005-04-17 06:20:36 +08:00
}
bond - > alb_info . rlb_promisc_timeout_counter = 0 ;
2014-07-15 21:56:55 +08:00
alb_send_learning_packets ( curr_active , addr , true ) ;
2005-04-17 06:20:36 +08:00
}
/* slave being removed should not be active at this point
*
bonding: remove the no effect lock for bond_select_active_slave()
The bond slave list was no longer protected by bond lock and only
protected by RTNL or RCU, so anywhere that use bond lock to protect
slave list is meaningless.
remove the release and acquire bond lock for bond_select_active_slave().
The curr_active_slave could only be changed in 3 place:
1. enslave slave.
2. release slave.
3. change_active_slave.
all above place were holding bond lock, RTNL and curr_slave_lock
together, it is tedious and meaningless, obviously bond lock is no
need here, but RTNL or curr_slave_lock is needed, so if you want
to access active slave, you have to choose one lock, RTNL or
curr_slave_lock, if RTNL is exist, no need to add curr_slave_lock,
otherwise curr_slave_lock is better, because of the performance.
there are several place calling bond_select_active_slave() and
bond_change_active_slave(), the next step I will clean these place
and remove the no effect lock.
there are some document changed together when update the function.
Suggested-by: Jay Vosburgh <fubar@us.ibm.com>
Suggested-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-12-13 10:19:32 +08:00
* Caller must hold rtnl .
2005-04-17 06:20:36 +08:00
*/
static void rlb_clear_slave ( struct bonding * bond , struct slave * slave )
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
struct rlb_client_info * rx_hash_table ;
u32 index , next_index ;
/* clear slave from rx_hashtbl */
2014-09-12 04:49:26 +08:00
spin_lock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
rx_hash_table = bond_info - > rx_hashtbl ;
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
index = bond_info - > rx_hashtbl_used_head ;
2005-04-17 06:20:36 +08:00
for ( ; index ! = RLB_NULL_INDEX ; index = next_index ) {
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
next_index = rx_hash_table [ index ] . used_next ;
2005-04-17 06:20:36 +08:00
if ( rx_hash_table [ index ] . slave = = slave ) {
struct slave * assigned_slave = rlb_next_rx_slave ( bond ) ;
if ( assigned_slave ) {
rx_hash_table [ index ] . slave = assigned_slave ;
2018-05-15 02:48:08 +08:00
if ( is_valid_ether_addr ( rx_hash_table [ index ] . mac_dst ) ) {
2005-04-17 06:20:36 +08:00
bond_info - > rx_hashtbl [ index ] . ntt = 1 ;
bond_info - > rx_ntt = 1 ;
/* A slave has been removed from the
* table because it is either disabled
* or being released . We must retry the
* update to avoid clients from not
* being updated & disconnecting when
* there is stress
*/
bond_info - > rlb_update_retry_counter =
RLB_UPDATE_RETRY ;
}
} else { /* there is no active slave */
rx_hash_table [ index ] . slave = NULL ;
}
}
}
2014-09-12 04:49:26 +08:00
spin_unlock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
2014-09-12 04:49:24 +08:00
if ( slave ! = rtnl_dereference ( bond - > curr_active_slave ) )
2005-04-17 06:20:36 +08:00
rlb_teach_disabled_mac_on_primary ( bond , slave - > dev - > dev_addr ) ;
}
static void rlb_update_client ( struct rlb_client_info * client_info )
{
int i ;
2018-05-10 07:32:10 +08:00
if ( ! client_info - > slave | | ! is_valid_ether_addr ( client_info - > mac_dst ) )
2005-04-17 06:20:36 +08:00
return ;
for ( i = 0 ; i < RLB_ARP_BURST_SIZE ; i + + ) {
struct sk_buff * skb ;
skb = arp_create ( ARPOP_REPLY , ETH_P_ARP ,
client_info - > ip_dst ,
client_info - > slave - > dev ,
client_info - > ip_src ,
client_info - > mac_dst ,
client_info - > slave - > dev - > dev_addr ,
client_info - > mac_dst ) ;
if ( ! skb ) {
2019-06-07 22:59:31 +08:00
slave_err ( client_info - > slave - > bond - > dev ,
client_info - > slave - > dev ,
" failed to create an ARP packet \n " ) ;
2005-04-17 06:20:36 +08:00
continue ;
}
skb - > dev = client_info - > slave - > dev ;
2013-08-30 05:38:57 +08:00
if ( client_info - > vlan_id ) {
2014-11-19 21:04:57 +08:00
__vlan_hwaccel_put_tag ( skb , htons ( ETH_P_8021Q ) ,
client_info - > vlan_id ) ;
2005-04-17 06:20:36 +08:00
}
arp_xmit ( skb ) ;
}
}
/* sends ARP REPLIES that update the clients that need updating */
static void rlb_update_rx_clients ( struct bonding * bond )
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
struct rlb_client_info * client_info ;
u32 hash_index ;
2014-09-12 04:49:26 +08:00
spin_lock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
hash_index = bond_info - > rx_hashtbl_used_head ;
for ( ; hash_index ! = RLB_NULL_INDEX ;
hash_index = client_info - > used_next ) {
2005-04-17 06:20:36 +08:00
client_info = & ( bond_info - > rx_hashtbl [ hash_index ] ) ;
if ( client_info - > ntt ) {
rlb_update_client ( client_info ) ;
2014-02-14 17:15:15 +08:00
if ( bond_info - > rlb_update_retry_counter = = 0 )
2005-04-17 06:20:36 +08:00
client_info - > ntt = 0 ;
}
}
2009-10-16 21:20:49 +08:00
/* do not update the entries again until this counter is zero so that
2005-04-17 06:20:36 +08:00
* not to confuse the clients .
*/
bond_info - > rlb_update_delay_counter = RLB_UPDATE_DELAY ;
2014-09-12 04:49:26 +08:00
spin_unlock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
}
/* The slave was assigned a new mac address - update the clients */
static void rlb_req_update_slave_clients ( struct bonding * bond , struct slave * slave )
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
struct rlb_client_info * client_info ;
int ntt = 0 ;
u32 hash_index ;
2014-09-12 04:49:26 +08:00
spin_lock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
hash_index = bond_info - > rx_hashtbl_used_head ;
for ( ; hash_index ! = RLB_NULL_INDEX ;
hash_index = client_info - > used_next ) {
2005-04-17 06:20:36 +08:00
client_info = & ( bond_info - > rx_hashtbl [ hash_index ] ) ;
if ( ( client_info - > slave = = slave ) & &
2018-05-15 02:48:08 +08:00
is_valid_ether_addr ( client_info - > mac_dst ) ) {
2005-04-17 06:20:36 +08:00
client_info - > ntt = 1 ;
ntt = 1 ;
}
}
2014-02-14 17:15:13 +08:00
/* update the team's flag only after the whole iteration */
2005-04-17 06:20:36 +08:00
if ( ntt ) {
bond_info - > rx_ntt = 1 ;
2014-02-14 17:15:13 +08:00
/* fasten the change */
2005-04-17 06:20:36 +08:00
bond_info - > rlb_update_retry_counter = RLB_UPDATE_RETRY ;
}
2014-09-12 04:49:26 +08:00
spin_unlock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
}
/* mark all clients using src_ip to be updated */
2007-08-23 08:06:58 +08:00
static void rlb_req_update_subnet_clients ( struct bonding * bond , __be32 src_ip )
2005-04-17 06:20:36 +08:00
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
struct rlb_client_info * client_info ;
u32 hash_index ;
2014-09-12 04:49:26 +08:00
spin_lock ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
hash_index = bond_info - > rx_hashtbl_used_head ;
for ( ; hash_index ! = RLB_NULL_INDEX ;
hash_index = client_info - > used_next ) {
2005-04-17 06:20:36 +08:00
client_info = & ( bond_info - > rx_hashtbl [ hash_index ] ) ;
if ( ! client_info - > slave ) {
2014-07-16 01:36:02 +08:00
netdev_err ( bond - > dev , " found a client with no channel in the client's hash table \n " ) ;
2005-04-17 06:20:36 +08:00
continue ;
}
2014-09-15 23:19:34 +08:00
/* update all clients using this src_ip, that are not assigned
2005-04-17 06:20:36 +08:00
* to the team ' s address ( curr_active_slave ) and have a known
* unicast mac address .
*/
if ( ( client_info - > ip_src = = src_ip ) & &
net, drivers/net: Convert compare_ether_addr_64bits to ether_addr_equal_64bits
Use the new bool function ether_addr_equal_64bits to add
some clarity and reduce the likelihood for misuse of
compare_ether_addr_64bits for sorting.
Done via cocci script:
$ cat compare_ether_addr_64bits.cocci
@@
expression a,b;
@@
- !compare_ether_addr_64bits(a, b)
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- compare_ether_addr_64bits(a, b)
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !ether_addr_equal_64bits(a, b) == 0
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !ether_addr_equal_64bits(a, b) != 0
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- ether_addr_equal_64bits(a, b) == 0
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- ether_addr_equal_64bits(a, b) != 0
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !!ether_addr_equal_64bits(a, b)
+ ether_addr_equal_64bits(a, b)
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-10 01:04:04 +08:00
! ether_addr_equal_64bits ( client_info - > slave - > dev - > dev_addr ,
bond - > dev - > dev_addr ) & &
2018-05-15 02:48:08 +08:00
is_valid_ether_addr ( client_info - > mac_dst ) ) {
2005-04-17 06:20:36 +08:00
client_info - > ntt = 1 ;
bond_info - > rx_ntt = 1 ;
}
}
2014-09-12 04:49:26 +08:00
spin_unlock ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
}
2020-03-05 01:32:16 +08:00
static struct slave * rlb_choose_channel ( struct sk_buff * skb ,
struct bonding * bond ,
const struct arp_pkt * arp )
2005-04-17 06:20:36 +08:00
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
2013-10-15 16:28:39 +08:00
struct slave * assigned_slave , * curr_active_slave ;
2005-04-17 06:20:36 +08:00
struct rlb_client_info * client_info ;
u32 hash_index = 0 ;
2014-09-12 04:49:26 +08:00
spin_lock ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
2013-10-15 16:28:39 +08:00
curr_active_slave = rcu_dereference ( bond - > curr_active_slave ) ;
2011-02-28 07:34:28 +08:00
hash_index = _simple_hash ( ( u8 * ) & arp - > ip_dst , sizeof ( arp - > ip_dst ) ) ;
2005-04-17 06:20:36 +08:00
client_info = & ( bond_info - > rx_hashtbl [ hash_index ] ) ;
if ( client_info - > assigned ) {
if ( ( client_info - > ip_src = = arp - > ip_src ) & &
( client_info - > ip_dst = = arp - > ip_dst ) ) {
/* the entry is already assigned to this client */
2018-05-15 02:48:08 +08:00
if ( ! is_broadcast_ether_addr ( arp - > mac_dst ) ) {
2005-04-17 06:20:36 +08:00
/* update mac address from arp */
2014-02-16 08:02:17 +08:00
ether_addr_copy ( client_info - > mac_dst , arp - > mac_dst ) ;
2005-04-17 06:20:36 +08:00
}
2014-02-16 08:02:17 +08:00
ether_addr_copy ( client_info - > mac_src , arp - > mac_src ) ;
2005-04-17 06:20:36 +08:00
assigned_slave = client_info - > slave ;
if ( assigned_slave ) {
2014-09-12 04:49:26 +08:00
spin_unlock ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
return assigned_slave ;
}
} else {
/* the entry is already assigned to some other client,
* move the old client to primary ( curr_active_slave ) so
* that the new client can be assigned to this entry .
*/
2014-07-15 21:56:55 +08:00
if ( curr_active_slave & &
2013-10-15 16:28:39 +08:00
client_info - > slave ! = curr_active_slave ) {
client_info - > slave = curr_active_slave ;
2005-04-17 06:20:36 +08:00
rlb_update_client ( client_info ) ;
}
}
}
/* assign a new slave */
2013-10-15 16:28:39 +08:00
assigned_slave = __rlb_next_rx_slave ( bond ) ;
2005-04-17 06:20:36 +08:00
if ( assigned_slave ) {
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
if ( ! ( client_info - > assigned & &
client_info - > ip_src = = arp - > ip_src ) ) {
/* ip_src is going to be updated,
* fix the src hash list
*/
u32 hash_src = _simple_hash ( ( u8 * ) & arp - > ip_src ,
sizeof ( arp - > ip_src ) ) ;
rlb_src_unlink ( bond , hash_index ) ;
rlb_src_link ( bond , hash_src , hash_index ) ;
}
2005-04-17 06:20:36 +08:00
client_info - > ip_src = arp - > ip_src ;
client_info - > ip_dst = arp - > ip_dst ;
2021-05-21 11:31:35 +08:00
/* arp->mac_dst is broadcast for arp requests.
2005-04-17 06:20:36 +08:00
* will be updated with clients actual unicast mac address
* upon receiving an arp reply .
*/
2014-02-16 08:02:17 +08:00
ether_addr_copy ( client_info - > mac_dst , arp - > mac_dst ) ;
ether_addr_copy ( client_info - > mac_src , arp - > mac_src ) ;
2005-04-17 06:20:36 +08:00
client_info - > slave = assigned_slave ;
2018-05-15 02:48:08 +08:00
if ( is_valid_ether_addr ( client_info - > mac_dst ) ) {
2005-04-17 06:20:36 +08:00
client_info - > ntt = 1 ;
bond - > alb_info . rx_ntt = 1 ;
} else {
client_info - > ntt = 0 ;
}
2014-03-12 17:31:59 +08:00
if ( vlan_get_tag ( skb , & client_info - > vlan_id ) )
2013-08-30 05:38:57 +08:00
client_info - > vlan_id = 0 ;
2005-04-17 06:20:36 +08:00
if ( ! client_info - > assigned ) {
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
u32 prev_tbl_head = bond_info - > rx_hashtbl_used_head ;
2021-05-20 14:18:32 +08:00
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
bond_info - > rx_hashtbl_used_head = hash_index ;
client_info - > used_next = prev_tbl_head ;
2005-04-17 06:20:36 +08:00
if ( prev_tbl_head ! = RLB_NULL_INDEX ) {
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
bond_info - > rx_hashtbl [ prev_tbl_head ] . used_prev =
2005-04-17 06:20:36 +08:00
hash_index ;
}
client_info - > assigned = 1 ;
}
}
2014-09-12 04:49:26 +08:00
spin_unlock ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
return assigned_slave ;
}
/* chooses (and returns) transmit channel for arp reply
* does not choose channel for other arp types since they are
* sent on the curr_active_slave
*/
static struct slave * rlb_arp_xmit ( struct sk_buff * skb , struct bonding * bond )
{
struct slave * tx_slave = NULL ;
2020-03-05 01:32:16 +08:00
struct arp_pkt * arp ;
if ( ! pskb_network_may_pull ( skb , sizeof ( * arp ) ) )
return NULL ;
arp = ( struct arp_pkt * ) skb_network_header ( skb ) ;
2005-04-17 06:20:36 +08:00
2012-11-28 07:57:04 +08:00
/* Don't modify or load balance ARPs that do not originate locally
* ( e . g . , arrive via a bridge ) .
*/
2014-06-05 04:23:38 +08:00
if ( ! bond_slave_has_mac_rx ( bond , arp - > mac_src ) )
2012-11-28 07:57:04 +08:00
return NULL ;
2008-09-02 22:08:08 +08:00
if ( arp - > op_code = = htons ( ARPOP_REPLY ) ) {
2014-09-15 23:19:34 +08:00
/* the arp must be sent on the selected rx channel */
2020-03-05 01:32:16 +08:00
tx_slave = rlb_choose_channel ( skb , bond , arp ) ;
2014-02-14 17:15:15 +08:00
if ( tx_slave )
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
bond_hw_addr_copy ( arp - > mac_src , tx_slave - > dev - > dev_addr ,
tx_slave - > dev - > addr_len ) ;
2019-06-07 22:59:31 +08:00
netdev_dbg ( bond - > dev , " (slave %s): Server sent ARP Reply packet \n " ,
tx_slave ? tx_slave - > dev - > name : " NULL " ) ;
2008-09-02 22:08:08 +08:00
} else if ( arp - > op_code = = htons ( ARPOP_REQUEST ) ) {
2005-04-17 06:20:36 +08:00
/* Create an entry in the rx_hashtbl for this client as a
* place holder .
* When the arp reply is received the entry will be updated
* with the correct unicast address of the client .
*/
2020-03-05 01:32:16 +08:00
tx_slave = rlb_choose_channel ( skb , bond , arp ) ;
2005-04-17 06:20:36 +08:00
2011-04-11 08:16:32 +08:00
/* The ARP reply packets must be delayed so that
2005-04-17 06:20:36 +08:00
* they can cancel out the influence of the ARP request .
*/
bond - > alb_info . rlb_update_delay_counter = RLB_UPDATE_DELAY ;
/* arp requests are broadcast and are sent on the primary
* the arp request will collapse all clients on the subnet to
* the primary slave . We must register these clients to be
* updated with their assigned mac .
*/
rlb_req_update_subnet_clients ( bond , arp - > ip_src ) ;
2019-06-07 22:59:31 +08:00
netdev_dbg ( bond - > dev , " (slave %s): Server sent ARP Request packet \n " ,
tx_slave ? tx_slave - > dev - > name : " NULL " ) ;
2005-04-17 06:20:36 +08:00
}
return tx_slave ;
}
static void rlb_rebalance ( struct bonding * bond )
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
struct slave * assigned_slave ;
struct rlb_client_info * client_info ;
int ntt ;
u32 hash_index ;
2014-09-12 04:49:26 +08:00
spin_lock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
ntt = 0 ;
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
hash_index = bond_info - > rx_hashtbl_used_head ;
for ( ; hash_index ! = RLB_NULL_INDEX ;
hash_index = client_info - > used_next ) {
2005-04-17 06:20:36 +08:00
client_info = & ( bond_info - > rx_hashtbl [ hash_index ] ) ;
2013-12-13 10:19:45 +08:00
assigned_slave = __rlb_next_rx_slave ( bond ) ;
2005-04-17 06:20:36 +08:00
if ( assigned_slave & & ( client_info - > slave ! = assigned_slave ) ) {
client_info - > slave = assigned_slave ;
2018-05-15 02:48:07 +08:00
if ( ! is_zero_ether_addr ( client_info - > mac_dst ) ) {
client_info - > ntt = 1 ;
ntt = 1 ;
}
2005-04-17 06:20:36 +08:00
}
}
/* update the team's flag only after the whole iteration */
2014-02-14 17:15:15 +08:00
if ( ntt )
2005-04-17 06:20:36 +08:00
bond_info - > rx_ntt = 1 ;
2014-09-12 04:49:26 +08:00
spin_unlock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
}
2014-09-15 23:19:34 +08:00
/* Caller must hold mode_lock */
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
In certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
static void rlb_init_table_entry_dst ( struct rlb_client_info * entry )
{
entry - > used_next = RLB_NULL_INDEX ;
entry - > used_prev = RLB_NULL_INDEX ;
entry - > assigned = 0 ;
entry - > slave = NULL ;
2013-08-30 05:38:57 +08:00
entry - > vlan_id = 0 ;
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
}
static void rlb_init_table_entry_src ( struct rlb_client_info * entry )
{
entry - > src_first = RLB_NULL_INDEX ;
entry - > src_prev = RLB_NULL_INDEX ;
entry - > src_next = RLB_NULL_INDEX ;
}
2005-04-17 06:20:36 +08:00
/* Bring a hash table slot to its pristine state.
 *
 * The whole slot is zeroed first; the dst-side and src-side link fields are
 * then re-initialised by their dedicated helpers (the src helper sets them
 * to RLB_NULL_INDEX rather than leaving them at 0).
 */
static void rlb_init_table_entry(struct rlb_client_info *entry)
{
	memset(entry, 0, sizeof(*entry));

	rlb_init_table_entry_dst(entry);
	rlb_init_table_entry_src(entry);
}
/* Take rx_hashtbl[index] off the list of used entries.
 *
 * Only the list head and the two neighbours are patched here; the
 * used_next / used_prev fields of the removed entry itself are left as
 * they were.
 */
static void rlb_delete_table_entry_dst(struct bonding *bond, u32 index)
{
	struct alb_bond_info *info = &(BOND_ALB_INFO(bond));
	struct rlb_client_info *tbl = info->rx_hashtbl;
	u32 succ = tbl[index].used_next;
	u32 pred = tbl[index].used_prev;

	/* removing the head: the successor becomes the new head */
	if (info->rx_hashtbl_used_head == index)
		info->rx_hashtbl_used_head = succ;

	if (pred != RLB_NULL_INDEX)
		tbl[pred].used_next = succ;

	if (succ != RLB_NULL_INDEX)
		tbl[succ].used_prev = pred;
}
/* Detach rx_hashtbl[index] from the ip_src list it is linked on.
 *
 * The predecessor recorded in src_prev is either an ordinary list member,
 * whose src_next must be patched, or the slot heading the list, whose
 * src_first must be patched.
 */
static void rlb_src_unlink(struct bonding *bond, u32 index)
{
	struct alb_bond_info *info = &(BOND_ALB_INFO(bond));
	struct rlb_client_info *tbl = info->rx_hashtbl;
	struct rlb_client_info *pred;
	u32 succ_idx = tbl[index].src_next;
	u32 pred_idx = tbl[index].src_prev;

	/* the entry itself no longer points anywhere */
	tbl[index].src_next = RLB_NULL_INDEX;
	tbl[index].src_prev = RLB_NULL_INDEX;

	if (succ_idx != RLB_NULL_INDEX)
		tbl[succ_idx].src_prev = pred_idx;

	/* no predecessor to patch */
	if (pred_idx == RLB_NULL_INDEX)
		return;

	pred = &tbl[pred_idx];
	if (pred->src_first == index)
		pred->src_first = succ_idx;	/* pred heads this list */
	else
		pred->src_next = succ_idx;	/* pred is a plain member */
}
/* Remove rx_hashtbl[index] from the table: drop it from the used list,
 * re-initialise its dst-side fields, then unlink it from its ip_src list.
 */
static void rlb_delete_table_entry(struct bonding *bond, u32 index)
{
	struct alb_bond_info *info = &(BOND_ALB_INFO(bond));
	struct rlb_client_info *victim = info->rx_hashtbl + index;

	/* used-list neighbours must be fixed up before the links are reset */
	rlb_delete_table_entry_dst(bond, index);
	rlb_init_table_entry_dst(victim);

	rlb_src_unlink(bond, index);
}
/* Push rx_hashtbl[ip_dst_hash] onto the front of the list headed by
 * rx_hashtbl[ip_src_hash].src_first, i.e. the list of entries sharing the
 * same ip_src hash.
 *
 * The new first element records the heading slot (ip_src_hash) in its
 * src_prev.
 */
static void rlb_src_link(struct bonding *bond, u32 ip_src_hash, u32 ip_dst_hash)
{
	struct alb_bond_info *info = &(BOND_ALB_INFO(bond));
	struct rlb_client_info *tbl = info->rx_hashtbl;
	struct rlb_client_info *bucket = &tbl[ip_src_hash];
	struct rlb_client_info *node = &tbl[ip_dst_hash];
	u32 old_first = bucket->src_first;

	node->src_prev = ip_src_hash;
	node->src_next = old_first;

	/* the previous first element now follows the new one */
	if (old_first != RLB_NULL_INDEX)
		tbl[old_first].src_prev = ip_dst_hash;

	bucket->src_first = ip_dst_hash;
}
2014-09-15 23:19:34 +08:00
/* deletes all rx_hashtbl entries with arp->ip_src if their mac_src does
* not match arp - > mac_src
*/
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
static void rlb_purge_src_ip ( struct bonding * bond , struct arp_pkt * arp )
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
2014-02-14 17:15:12 +08:00
u32 ip_src_hash = _simple_hash ( ( u8 * ) & ( arp - > ip_src ) , sizeof ( arp - > ip_src ) ) ;
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
u32 index ;
2014-09-12 04:49:26 +08:00
spin_lock_bh ( & bond - > mode_lock ) ;
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
index = bond_info - > rx_hashtbl [ ip_src_hash ] . src_first ;
while ( index ! = RLB_NULL_INDEX ) {
struct rlb_client_info * entry = & ( bond_info - > rx_hashtbl [ index ] ) ;
u32 next_index = entry - > src_next ;
2021-05-20 14:18:32 +08:00
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
if ( entry - > ip_src = = arp - > ip_src & &
! ether_addr_equal_64bits ( arp - > mac_src , entry - > mac_src ) )
2021-05-20 14:18:33 +08:00
rlb_delete_table_entry ( bond , index ) ;
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
index = next_index ;
}
2014-09-12 04:49:26 +08:00
spin_unlock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
}
static int rlb_initialize ( struct bonding * bond )
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
2005-11-10 02:35:30 +08:00
struct rlb_client_info * new_hashtbl ;
2005-04-17 06:20:36 +08:00
int size = RLB_HASH_TABLE_SIZE * sizeof ( struct rlb_client_info ) ;
int i ;
2005-11-10 02:35:30 +08:00
new_hashtbl = kmalloc ( size , GFP_KERNEL ) ;
2012-01-29 20:56:23 +08:00
if ( ! new_hashtbl )
2005-04-17 06:20:36 +08:00
return - 1 ;
2012-01-29 20:56:23 +08:00
2014-09-12 04:49:26 +08:00
spin_lock_bh ( & bond - > mode_lock ) ;
2005-11-10 02:35:30 +08:00
bond_info - > rx_hashtbl = new_hashtbl ;
2005-04-17 06:20:36 +08:00
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
bond_info - > rx_hashtbl_used_head = RLB_NULL_INDEX ;
2005-04-17 06:20:36 +08:00
2014-02-14 17:15:15 +08:00
for ( i = 0 ; i < RLB_HASH_TABLE_SIZE ; i + + )
2005-04-17 06:20:36 +08:00
rlb_init_table_entry ( bond_info - > rx_hashtbl + i ) ;
2014-09-12 04:49:26 +08:00
spin_unlock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
/* register to receive ARPs */
2011-04-19 11:48:16 +08:00
bond - > recv_probe = rlb_arp_recv ;
2005-04-17 06:20:36 +08:00
return 0 ;
}
static void rlb_deinitialize ( struct bonding * bond )
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
2014-09-12 04:49:26 +08:00
spin_lock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
kfree ( bond_info - > rx_hashtbl ) ;
bond_info - > rx_hashtbl = NULL ;
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
bond_info - > rx_hashtbl_used_head = RLB_NULL_INDEX ;
2005-04-17 06:20:36 +08:00
2014-09-12 04:49:26 +08:00
spin_unlock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
}
static void rlb_clear_vlan ( struct bonding * bond , unsigned short vlan_id )
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
u32 curr_index ;
2014-09-12 04:49:26 +08:00
spin_lock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
curr_index = bond_info - > rx_hashtbl_used_head ;
2005-04-17 06:20:36 +08:00
while ( curr_index ! = RLB_NULL_INDEX ) {
struct rlb_client_info * curr = & ( bond_info - > rx_hashtbl [ curr_index ] ) ;
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
u32 next_index = bond_info - > rx_hashtbl [ curr_index ] . used_next ;
2005-04-17 06:20:36 +08:00
2013-08-30 05:38:57 +08:00
if ( curr - > vlan_id = = vlan_id )
bonding: delete migrated IP addresses from the rlb hash table
Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).
At certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.
The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.
This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.
The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).
I added three new members in struct rlb_client_info:
rx_hashtbl[x].src_first will point to the start of a list of
entries for which hash(ip_src) == x.
The list is linked with src_next and src_prev.
When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.
To avoid confusion, I renamed these existing fields of struct
rlb_client_info:
next -> used_next
prev -> used_prev
rx_hashtbl_head -> rx_hashtbl_used_head
(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-28 12:42:14 +08:00
rlb_delete_table_entry ( bond , curr_index ) ;
2005-04-17 06:20:36 +08:00
curr_index = next_index ;
}
2014-09-12 04:49:26 +08:00
spin_unlock_bh ( & bond - > mode_lock ) ;
2005-04-17 06:20:36 +08:00
}
/*********************** tlb/rlb shared functions *********************/
2021-10-23 07:20:59 +08:00
static void alb_send_lp_vid ( struct slave * slave , const u8 mac_addr [ ] ,
2014-05-21 23:24:39 +08:00
__be16 vlan_proto , u16 vid )
2005-04-17 06:20:36 +08:00
{
struct learning_pkt pkt ;
2013-08-29 05:25:13 +08:00
struct sk_buff * skb ;
2005-04-17 06:20:36 +08:00
int size = sizeof ( struct learning_pkt ) ;
memset ( & pkt , 0 , size ) ;
2014-02-16 08:02:17 +08:00
ether_addr_copy ( pkt . mac_dst , mac_addr ) ;
ether_addr_copy ( pkt . mac_src , mac_addr ) ;
2014-03-13 19:41:58 +08:00
pkt . type = cpu_to_be16 ( ETH_P_LOOPBACK ) ;
2005-04-17 06:20:36 +08:00
2013-08-29 05:25:13 +08:00
skb = dev_alloc_skb ( size ) ;
if ( ! skb )
return ;
2005-04-17 06:20:36 +08:00
2017-06-18 22:52:04 +08:00
skb_put_data ( skb , & pkt , size ) ;
2013-08-29 05:25:13 +08:00
skb_reset_mac_header ( skb ) ;
skb - > network_header = skb - > mac_header + ETH_HLEN ;
skb - > protocol = pkt . type ;
skb - > priority = TC_PRIO_CONTROL ;
skb - > dev = slave - > dev ;
2019-06-07 22:59:31 +08:00
slave_dbg ( slave - > bond - > dev , slave - > dev ,
" Send learning packet: mac %pM vlan %d \n " , mac_addr , vid ) ;
2018-05-10 07:32:11 +08:00
2014-11-19 21:04:57 +08:00
if ( vid )
__vlan_hwaccel_put_tag ( skb , vlan_proto , vid ) ;
2013-08-29 05:25:13 +08:00
dev_queue_xmit ( skb ) ;
}
2005-04-17 06:20:36 +08:00
2016-10-18 10:15:45 +08:00
struct alb_walk_data {
struct bonding * bond ;
struct slave * slave ;
2021-10-23 07:20:59 +08:00
const u8 * mac_addr ;
2016-10-18 10:15:45 +08:00
bool strict_match ;
} ;
2020-09-26 02:13:12 +08:00
static int alb_upper_dev_walk ( struct net_device * upper ,
struct netdev_nested_priv * priv )
2016-10-18 10:15:45 +08:00
{
2020-09-26 02:13:12 +08:00
struct alb_walk_data * data = ( struct alb_walk_data * ) priv - > data ;
2016-10-18 10:15:45 +08:00
bool strict_match = data - > strict_match ;
2021-10-23 07:20:59 +08:00
const u8 * mac_addr = data - > mac_addr ;
2016-10-18 10:15:45 +08:00
struct bonding * bond = data - > bond ;
struct slave * slave = data - > slave ;
struct bond_vlan_tag * tags ;
2018-05-10 07:32:11 +08:00
if ( is_vlan_dev ( upper ) & &
2019-10-22 02:47:58 +08:00
bond - > dev - > lower_level = = upper - > lower_level - 1 ) {
2018-05-10 07:32:11 +08:00
if ( upper - > addr_assign_type = = NET_ADDR_STOLEN ) {
2016-10-18 10:15:45 +08:00
alb_send_lp_vid ( slave , mac_addr ,
vlan_dev_vlan_proto ( upper ) ,
vlan_dev_vlan_id ( upper ) ) ;
2018-05-10 07:32:11 +08:00
} else {
2016-10-18 10:15:45 +08:00
alb_send_lp_vid ( slave , upper - > dev_addr ,
vlan_dev_vlan_proto ( upper ) ,
vlan_dev_vlan_id ( upper ) ) ;
}
}
/* If this is a macvlan device, then only send updates
* when strict_match is turned off .
*/
if ( netif_is_macvlan ( upper ) & & ! strict_match ) {
tags = bond_verify_device_path ( bond - > dev , upper , 0 ) ;
if ( IS_ERR_OR_NULL ( tags ) )
BUG ( ) ;
alb_send_lp_vid ( slave , upper - > dev_addr ,
tags [ 0 ] . vlan_proto , tags [ 0 ] . vlan_id ) ;
kfree ( tags ) ;
}
return 0 ;
}
2021-10-23 07:20:59 +08:00
static void alb_send_learning_packets ( struct slave * slave , const u8 mac_addr [ ] ,
2014-05-22 01:19:48 +08:00
bool strict_match )
2013-08-29 05:25:13 +08:00
{
struct bonding * bond = bond_get_bond_by_slave ( slave ) ;
2020-09-26 02:13:12 +08:00
struct netdev_nested_priv priv ;
2016-10-18 10:15:45 +08:00
struct alb_walk_data data = {
. strict_match = strict_match ,
. mac_addr = mac_addr ,
. slave = slave ,
. bond = bond ,
} ;
2013-08-29 05:25:14 +08:00
2020-09-26 02:13:12 +08:00
priv . data = ( void * ) & data ;
2013-08-29 05:25:14 +08:00
/* send untagged */
2014-05-21 23:24:39 +08:00
alb_send_lp_vid ( slave , mac_addr , 0 , 0 ) ;
2013-08-29 05:25:14 +08:00
2014-06-05 04:23:38 +08:00
/* loop through all devices and see if we need to send a packet
* for that device .
*/
2013-08-29 05:25:14 +08:00
rcu_read_lock ( ) ;
2020-09-26 02:13:12 +08:00
netdev_walk_all_upper_dev_rcu ( bond - > dev , alb_upper_dev_walk , & priv ) ;
2013-08-29 05:25:14 +08:00
rcu_read_unlock ( ) ;
2005-04-17 06:20:36 +08:00
}
2021-10-23 07:20:59 +08:00
static int alb_set_slave_mac_addr ( struct slave * slave , const u8 addr [ ] ,
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
unsigned int len )
2005-04-17 06:20:36 +08:00
{
struct net_device * dev = slave - > dev ;
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
struct sockaddr_storage ss ;
2005-04-17 06:20:36 +08:00
2014-05-16 03:39:55 +08:00
if ( BOND_MODE ( slave - > bond ) = = BOND_MODE_TLB ) {
2021-10-23 07:20:59 +08:00
__dev_addr_set ( dev , addr , len ) ;
2005-04-17 06:20:36 +08:00
return 0 ;
}
2014-09-15 23:19:34 +08:00
/* for rlb each slave must have a unique hw mac addresses so that
* each slave will receive packets destined to a different mac
*/
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
memcpy ( ss . __data , addr , len ) ;
ss . ss_family = dev - > type ;
2018-12-13 19:54:30 +08:00
if ( dev_set_mac_address ( dev , ( struct sockaddr * ) & ss , NULL ) ) {
2019-06-07 22:59:31 +08:00
slave_err ( slave - > bond - > dev , dev , " dev_set_mac_address on slave failed! ALB mode requires that the base driver support setting the hw address also when the network device's interface is open \n " ) ;
2005-04-17 06:20:36 +08:00
return - EOPNOTSUPP ;
}
return 0 ;
}
2014-09-15 23:19:34 +08:00
/* Swap MAC addresses between two slaves.
2007-10-18 08:37:49 +08:00
*
* Called with RTNL held , and no other locks .
*/
2013-05-28 07:14:51 +08:00
static void alb_swap_mac_addr ( struct slave * slave1 , struct slave * slave2 )
2005-04-17 06:20:36 +08:00
{
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
u8 tmp_mac_addr [ MAX_ADDR_LEN ] ;
2005-04-17 06:20:36 +08:00
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
bond_hw_addr_copy ( tmp_mac_addr , slave1 - > dev - > dev_addr ,
slave1 - > dev - > addr_len ) ;
alb_set_slave_mac_addr ( slave1 , slave2 - > dev - > dev_addr ,
slave2 - > dev - > addr_len ) ;
alb_set_slave_mac_addr ( slave2 , tmp_mac_addr ,
slave1 - > dev - > addr_len ) ;
2005-04-17 06:20:36 +08:00
2007-10-18 08:37:49 +08:00
}
2014-09-15 23:19:34 +08:00
/* Send learning packets after MAC address swap.
2007-10-18 08:37:49 +08:00
*
2008-01-18 08:24:59 +08:00
* Called with RTNL and no other locks
2007-10-18 08:37:49 +08:00
*/
static void alb_fasten_mac_swap ( struct bonding * bond , struct slave * slave1 ,
struct slave * slave2 )
{
2014-05-16 03:39:59 +08:00
int slaves_state_differ = ( bond_slave_can_tx ( slave1 ) ! = bond_slave_can_tx ( slave2 ) ) ;
2007-10-18 08:37:49 +08:00
struct slave * disabled_slave = NULL ;
2008-01-18 08:24:59 +08:00
ASSERT_RTNL ( ) ;
2005-04-17 06:20:36 +08:00
/* fasten the change in the switch */
2014-05-16 03:39:59 +08:00
if ( bond_slave_can_tx ( slave1 ) ) {
2014-05-22 01:19:48 +08:00
alb_send_learning_packets ( slave1 , slave1 - > dev - > dev_addr , false ) ;
2005-04-17 06:20:36 +08:00
if ( bond - > alb_info . rlb_enabled ) {
/* inform the clients that the mac address
* has changed
*/
rlb_req_update_slave_clients ( bond , slave1 ) ;
}
} else {
disabled_slave = slave1 ;
}
2014-05-16 03:39:59 +08:00
if ( bond_slave_can_tx ( slave2 ) ) {
2014-05-22 01:19:48 +08:00
alb_send_learning_packets ( slave2 , slave2 - > dev - > dev_addr , false ) ;
2005-04-17 06:20:36 +08:00
if ( bond - > alb_info . rlb_enabled ) {
/* inform the clients that the mac address
* has changed
*/
rlb_req_update_slave_clients ( bond , slave2 ) ;
}
} else {
disabled_slave = slave2 ;
}
if ( bond - > alb_info . rlb_enabled & & slaves_state_differ ) {
/* A disabled slave was assigned an active mac addr */
rlb_teach_disabled_mac_on_primary ( bond ,
disabled_slave - > dev - > dev_addr ) ;
}
}
/**
* alb_change_hw_addr_on_detach
* @ bond : bonding we ' re working on
* @ slave : the slave that was just detached
*
* We assume that @ slave was already detached from the slave list .
*
* If @ slave ' s permanent hw address is different both from its current
* address and from @ bond ' s address , then somewhere in the bond there ' s
* a slave that has @ slave ' s permanet address as its current address .
2021-03-30 15:27:54 +08:00
* We ' ll make sure that slave no longer uses @ slave ' s permanent address .
2005-04-17 06:20:36 +08:00
*
2008-01-18 08:24:59 +08:00
* Caller must hold RTNL and no other locks
2005-04-17 06:20:36 +08:00
*/
static void alb_change_hw_addr_on_detach ( struct bonding * bond , struct slave * slave )
{
int perm_curr_diff ;
int perm_bond_diff ;
2013-06-18 19:44:52 +08:00
struct slave * found_slave ;
2005-04-17 06:20:36 +08:00
net, drivers/net: Convert compare_ether_addr_64bits to ether_addr_equal_64bits
Use the new bool function ether_addr_equal_64bits to add
some clarity and reduce the likelihood for misuse of
compare_ether_addr_64bits for sorting.
Done via cocci script:
$ cat compare_ether_addr_64bits.cocci
@@
expression a,b;
@@
- !compare_ether_addr_64bits(a, b)
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- compare_ether_addr_64bits(a, b)
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !ether_addr_equal_64bits(a, b) == 0
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !ether_addr_equal_64bits(a, b) != 0
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- ether_addr_equal_64bits(a, b) == 0
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- ether_addr_equal_64bits(a, b) != 0
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !!ether_addr_equal_64bits(a, b)
+ ether_addr_equal_64bits(a, b)
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-10 01:04:04 +08:00
perm_curr_diff = ! ether_addr_equal_64bits ( slave - > perm_hwaddr ,
slave - > dev - > dev_addr ) ;
perm_bond_diff = ! ether_addr_equal_64bits ( slave - > perm_hwaddr ,
bond - > dev - > dev_addr ) ;
2005-04-17 06:20:36 +08:00
if ( perm_curr_diff & & perm_bond_diff ) {
2013-06-18 19:44:52 +08:00
found_slave = bond_slave_has_mac ( bond , slave - > perm_hwaddr ) ;
2005-04-17 06:20:36 +08:00
2013-06-18 19:44:52 +08:00
if ( found_slave ) {
alb_swap_mac_addr ( slave , found_slave ) ;
alb_fasten_mac_swap ( bond , slave , found_slave ) ;
2005-04-17 06:20:36 +08:00
}
}
}
/**
* alb_handle_addr_collision_on_attach
* @ bond : bonding we ' re working on
* @ slave : the slave that was just attached
*
* checks uniqueness of slave ' s mac address and handles the case the
* new slave uses the bonds mac address .
*
* If the permanent hw address of @ slave is @ bond ' s hw address , we need to
* find a different hw address to give @ slave , that isn ' t in use by any other
2011-04-11 08:16:32 +08:00
* slave in the bond . This address must be , of course , one of the permanent
2005-04-17 06:20:36 +08:00
* addresses of the other slaves .
*
* We go over the slave list , and for each slave there we compare its
* permanent hw address with the current address of all the other slaves .
* If no match was found , then we ' ve found a slave with a permanent address
* that isn ' t used by any other slave in the bond , so we can assign it to
* @ slave .
*
* assumption : this function is called before @ slave is attached to the
2013-06-18 01:30:35 +08:00
* bond slave list .
2005-04-17 06:20:36 +08:00
*/
static int alb_handle_addr_collision_on_attach ( struct bonding * bond , struct slave * slave )
{
2014-07-15 21:56:55 +08:00
struct slave * has_bond_addr = rcu_access_pointer ( bond - > curr_active_slave ) ;
2013-09-25 15:20:14 +08:00
struct slave * tmp_slave1 , * free_mac_slave = NULL ;
struct list_head * iter ;
2005-04-17 06:20:36 +08:00
2013-09-25 15:20:21 +08:00
if ( ! bond_has_slaves ( bond ) ) {
2005-04-17 06:20:36 +08:00
/* this is the first slave */
return 0 ;
}
/* if slave's mac address differs from bond's mac address
* check uniqueness of slave ' s mac address against the other
* slaves in the bond .
*/
net, drivers/net: Convert compare_ether_addr_64bits to ether_addr_equal_64bits
Use the new bool function ether_addr_equal_64bits to add
some clarity and reduce the likelihood for misuse of
compare_ether_addr_64bits for sorting.
Done via cocci script:
$ cat compare_ether_addr_64bits.cocci
@@
expression a,b;
@@
- !compare_ether_addr_64bits(a, b)
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- compare_ether_addr_64bits(a, b)
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !ether_addr_equal_64bits(a, b) == 0
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !ether_addr_equal_64bits(a, b) != 0
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- ether_addr_equal_64bits(a, b) == 0
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- ether_addr_equal_64bits(a, b) != 0
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !!ether_addr_equal_64bits(a, b)
+ ether_addr_equal_64bits(a, b)
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-10 01:04:04 +08:00
if ( ! ether_addr_equal_64bits ( slave - > perm_hwaddr , bond - > dev - > dev_addr ) ) {
2013-06-18 01:30:35 +08:00
if ( ! bond_slave_has_mac ( bond , slave - > dev - > dev_addr ) )
2005-07-29 03:00:15 +08:00
return 0 ;
2005-04-17 06:20:36 +08:00
2005-07-29 03:00:15 +08:00
/* Try setting slave mac to bond address and fall-through
2014-09-15 23:19:34 +08:00
* to code handling that situation below . . .
*/
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
alb_set_slave_mac_addr ( slave , bond - > dev - > dev_addr ,
bond - > dev - > addr_len ) ;
2005-04-17 06:20:36 +08:00
}
/* The slave's address is equal to the address of the bond.
* Search for a spare address in the bond for this slave .
*/
2013-09-25 15:20:14 +08:00
bond_for_each_slave ( bond , tmp_slave1 , iter ) {
2013-06-18 01:30:35 +08:00
if ( ! bond_slave_has_mac ( bond , tmp_slave1 - > perm_hwaddr ) ) {
2005-04-17 06:20:36 +08:00
/* no slave has tmp_slave1's perm addr
* as its curr addr
*/
free_mac_slave = tmp_slave1 ;
break ;
}
if ( ! has_bond_addr ) {
net, drivers/net: Convert compare_ether_addr_64bits to ether_addr_equal_64bits
Use the new bool function ether_addr_equal_64bits to add
some clarity and reduce the likelihood for misuse of
compare_ether_addr_64bits for sorting.
Done via cocci script:
$ cat compare_ether_addr_64bits.cocci
@@
expression a,b;
@@
- !compare_ether_addr_64bits(a, b)
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- compare_ether_addr_64bits(a, b)
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !ether_addr_equal_64bits(a, b) == 0
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !ether_addr_equal_64bits(a, b) != 0
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- ether_addr_equal_64bits(a, b) == 0
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- ether_addr_equal_64bits(a, b) != 0
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !!ether_addr_equal_64bits(a, b)
+ ether_addr_equal_64bits(a, b)
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-10 01:04:04 +08:00
if ( ether_addr_equal_64bits ( tmp_slave1 - > dev - > dev_addr ,
bond - > dev - > dev_addr ) ) {
2005-04-17 06:20:36 +08:00
has_bond_addr = tmp_slave1 ;
}
}
}
if ( free_mac_slave ) {
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
alb_set_slave_mac_addr ( slave , free_mac_slave - > perm_hwaddr ,
free_mac_slave - > dev - > addr_len ) ;
2005-04-17 06:20:36 +08:00
2019-06-07 22:59:31 +08:00
slave_warn ( bond - > dev , slave - > dev , " the slave hw address is in use by the bond; giving it the hw address of %s \n " ,
free_mac_slave - > dev - > name ) ;
2005-04-17 06:20:36 +08:00
} else if ( has_bond_addr ) {
2019-06-07 22:59:31 +08:00
slave_err ( bond - > dev , slave - > dev , " the slave hw address is in use by the bond; couldn't find a slave with a free hw address to give it (this should not have happened) \n " ) ;
2005-04-17 06:20:36 +08:00
return - EFAULT ;
}
return 0 ;
}
/**
* alb_set_mac_address
2020-08-14 19:39:07 +08:00
* @ bond : bonding we ' re working on
* @ addr : MAC address to set
2005-04-17 06:20:36 +08:00
*
* In TLB mode all slaves are configured to the bond ' s hw address , but set
* their dev_addr field to different addresses ( based on their permanent hw
* addresses ) .
*
* For each slave , this function sets the interface to the new address and then
* changes its dev_addr field to its previous value .
*
* Unwinding assumes bond ' s mac address has not yet changed .
*/
static int alb_set_mac_address ( struct bonding * bond , void * addr )
{
2013-09-25 15:20:13 +08:00
struct slave * slave , * rollback_slave ;
2013-09-25 15:20:14 +08:00
struct list_head * iter ;
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
struct sockaddr_storage ss ;
char tmp_addr [ MAX_ADDR_LEN ] ;
2005-04-17 06:20:36 +08:00
int res ;
2013-08-01 22:54:47 +08:00
if ( bond - > alb_info . rlb_enabled )
2005-04-17 06:20:36 +08:00
return 0 ;
2013-09-25 15:20:14 +08:00
bond_for_each_slave ( bond , slave , iter ) {
2005-04-17 06:20:36 +08:00
/* save net_device's current hw address */
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
bond_hw_addr_copy ( tmp_addr , slave - > dev - > dev_addr ,
slave - > dev - > addr_len ) ;
2005-04-17 06:20:36 +08:00
2018-12-13 19:54:30 +08:00
res = dev_set_mac_address ( slave - > dev , addr , NULL ) ;
2005-04-17 06:20:36 +08:00
/* restore net_device's hw address */
2021-10-23 07:20:59 +08:00
dev_addr_set ( slave - > dev , tmp_addr ) ;
2005-04-17 06:20:36 +08:00
2008-11-20 13:56:05 +08:00
if ( res )
2005-04-17 06:20:36 +08:00
goto unwind ;
}
return 0 ;
unwind :
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
memcpy ( ss . __data , bond - > dev - > dev_addr , bond - > dev - > addr_len ) ;
ss . ss_family = bond - > dev - > type ;
2005-04-17 06:20:36 +08:00
/* unwind from head to the slave that failed */
2013-09-25 15:20:14 +08:00
bond_for_each_slave ( bond , rollback_slave , iter ) {
2013-09-25 15:20:13 +08:00
if ( rollback_slave = = slave )
break ;
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
bond_hw_addr_copy ( tmp_addr , rollback_slave - > dev - > dev_addr ,
rollback_slave - > dev - > addr_len ) ;
dev_set_mac_address ( rollback_slave - > dev ,
2018-12-13 19:54:30 +08:00
( struct sockaddr * ) & ss , NULL ) ;
2021-10-23 07:20:59 +08:00
dev_addr_set ( rollback_slave - > dev , tmp_addr ) ;
2005-04-17 06:20:36 +08:00
}
return res ;
}
2021-05-21 11:31:35 +08:00
/************************ exported alb functions ************************/
2005-04-17 06:20:36 +08:00
int bond_alb_initialize ( struct bonding * bond , int rlb_enabled )
{
int res ;
res = tlb_initialize ( bond ) ;
2014-02-14 17:15:15 +08:00
if ( res )
2005-04-17 06:20:36 +08:00
return res ;
if ( rlb_enabled ) {
bond - > alb_info . rlb_enabled = 1 ;
res = rlb_initialize ( bond ) ;
if ( res ) {
tlb_deinitialize ( bond ) ;
return res ;
}
2005-11-10 02:35:35 +08:00
} else {
bond - > alb_info . rlb_enabled = 0 ;
2005-04-17 06:20:36 +08:00
}
return 0 ;
}
void bond_alb_deinitialize ( struct bonding * bond )
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
tlb_deinitialize ( bond ) ;
2014-02-14 17:15:16 +08:00
if ( bond_info - > rlb_enabled )
2005-04-17 06:20:36 +08:00
rlb_deinitialize ( bond ) ;
}
2018-05-11 17:53:10 +08:00
static netdev_tx_t bond_do_alb_xmit ( struct sk_buff * skb , struct bonding * bond ,
struct slave * tx_slave )
2014-04-23 07:30:18 +08:00
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
struct ethhdr * eth_data = eth_hdr ( skb ) ;
if ( ! tx_slave ) {
/* unbalanced or unassigned, send through primary */
tx_slave = rcu_dereference ( bond - > curr_active_slave ) ;
2014-04-23 07:30:22 +08:00
if ( bond - > params . tlb_dynamic_lb )
bond_info - > unbalanced_load + = skb - > len ;
2014-04-23 07:30:18 +08:00
}
2014-05-16 03:39:59 +08:00
if ( tx_slave & & bond_slave_can_tx ( tx_slave ) ) {
2014-08-17 18:21:45 +08:00
if ( tx_slave ! = rcu_access_pointer ( bond - > curr_active_slave ) ) {
2014-04-23 07:30:18 +08:00
ether_addr_copy ( eth_data - > h_source ,
tx_slave - > dev - > dev_addr ) ;
}
2020-05-08 00:32:22 +08:00
return bond_dev_queue_xmit ( bond , skb , tx_slave - > dev ) ;
2014-04-23 07:30:18 +08:00
}
2014-04-23 07:30:22 +08:00
if ( tx_slave & & bond - > params . tlb_dynamic_lb ) {
2014-09-12 04:49:26 +08:00
spin_lock ( & bond - > mode_lock ) ;
2014-04-23 07:30:18 +08:00
__tlb_clear_slave ( bond , tx_slave , 0 ) ;
2014-09-12 04:49:26 +08:00
spin_unlock ( & bond - > mode_lock ) ;
2014-04-23 07:30:18 +08:00
}
/* no suitable interface, frame not sent */
2020-05-08 00:32:22 +08:00
return bond_tx_drop ( bond - > dev , skb ) ;
2014-04-23 07:30:18 +08:00
}
2020-05-01 03:21:34 +08:00
struct slave * bond_xmit_tlb_slave_get ( struct bonding * bond ,
struct sk_buff * skb )
2014-04-23 07:30:20 +08:00
{
struct slave * tx_slave = NULL ;
2020-05-01 03:21:34 +08:00
struct ethhdr * eth_data ;
2014-04-23 07:30:20 +08:00
u32 hash_index ;
skb_reset_mac_header ( skb ) ;
eth_data = eth_hdr ( skb ) ;
/* Do not TX balance any multicast or broadcast */
if ( ! is_multicast_ether_addr ( eth_data - > h_dest ) ) {
switch ( skb - > protocol ) {
case htons ( ETH_P_IP ) :
case htons ( ETH_P_IPV6 ) :
hash_index = bond_xmit_hash ( bond , skb ) ;
2014-04-23 07:30:22 +08:00
if ( bond - > params . tlb_dynamic_lb ) {
tx_slave = tlb_choose_channel ( bond ,
hash_index & 0xFF ,
skb - > len ) ;
} else {
2014-10-05 08:45:01 +08:00
struct bond_up_slave * slaves ;
unsigned int count ;
2014-04-23 07:30:22 +08:00
2020-05-01 03:21:33 +08:00
slaves = rcu_dereference ( bond - > usable_slaves ) ;
locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE()
Please do not apply this to mainline directly, instead please re-run the
coccinelle script shown below and apply its output.
For several reasons, it is desirable to use {READ,WRITE}_ONCE() in
preference to ACCESS_ONCE(), and new code is expected to use one of the
former. So far, there's been no reason to change most existing uses of
ACCESS_ONCE(), as these aren't harmful, and changing them results in
churn.
However, for some features, the read/write distinction is critical to
correct operation. To distinguish these cases, separate read/write
accessors must be used. This patch migrates (most) remaining
ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following
coccinelle script:
----
// Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and
// WRITE_ONCE()
// $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch
virtual patch
@ depends on patch @
expression E1, E2;
@@
- ACCESS_ONCE(E1) = E2
+ WRITE_ONCE(E1, E2)
@ depends on patch @
expression E;
@@
- ACCESS_ONCE(E)
+ READ_ONCE(E)
----
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: davem@davemloft.net
Cc: linux-arch@vger.kernel.org
Cc: mpe@ellerman.id.au
Cc: shuah@kernel.org
Cc: snitzer@redhat.com
Cc: thor.thayer@linux.intel.com
Cc: tj@kernel.org
Cc: viro@zeniv.linux.org.uk
Cc: will.deacon@arm.com
Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-24 05:07:29 +08:00
count = slaves ? READ_ONCE ( slaves - > count ) : 0 ;
2014-10-05 08:45:01 +08:00
if ( likely ( count ) )
2014-07-17 02:10:36 +08:00
tx_slave = slaves - > arr [ hash_index %
2014-10-05 08:45:01 +08:00
count ] ;
2014-04-23 07:30:22 +08:00
}
2014-04-23 07:30:20 +08:00
break ;
}
}
2020-05-01 03:21:34 +08:00
return tx_slave ;
2014-04-23 07:30:20 +08:00
}
2020-05-01 03:21:34 +08:00
/* TLB mode transmit entry point: choose a slave for @skb and send it. */
netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);

	return bond_do_alb_xmit(skb, bond, bond_xmit_tlb_slave_get(bond, skb));
}
struct slave * bond_xmit_alb_slave_get ( struct bonding * bond ,
struct sk_buff * skb )
{
2005-04-17 06:20:36 +08:00
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
2007-08-23 08:06:58 +08:00
static const __be32 ip_bcast = htonl ( 0xffffffff ) ;
2020-05-01 03:21:34 +08:00
struct slave * tx_slave = NULL ;
const u8 * hash_start = NULL ;
2014-04-23 07:30:18 +08:00
bool do_tx_balance = true ;
2020-05-01 03:21:34 +08:00
struct ethhdr * eth_data ;
2005-04-17 06:20:36 +08:00
u32 hash_index = 0 ;
2020-05-01 03:21:34 +08:00
int hash_size = 0 ;
2005-04-17 06:20:36 +08:00
2007-03-20 06:30:44 +08:00
skb_reset_mac_header ( skb ) ;
2005-04-17 06:20:36 +08:00
eth_data = eth_hdr ( skb ) ;
switch ( ntohs ( skb - > protocol ) ) {
2007-04-21 13:47:35 +08:00
case ETH_P_IP : {
2020-02-05 11:26:05 +08:00
const struct iphdr * iph ;
2007-04-21 13:47:35 +08:00
2018-05-15 02:48:08 +08:00
if ( is_broadcast_ether_addr ( eth_data - > h_dest ) | |
2020-02-05 11:26:05 +08:00
! pskb_network_may_pull ( skb , sizeof ( * iph ) ) ) {
do_tx_balance = false ;
break ;
}
iph = ip_hdr ( skb ) ;
if ( iph - > daddr = = ip_bcast | | iph - > protocol = = IPPROTO_IGMP ) {
2014-04-23 07:30:18 +08:00
do_tx_balance = false ;
2005-04-17 06:20:36 +08:00
break ;
}
2007-04-21 13:47:35 +08:00
hash_start = ( char * ) & ( iph - > daddr ) ;
hash_size = sizeof ( iph - > daddr ) ;
2005-04-17 06:20:36 +08:00
break ;
2020-02-05 11:26:05 +08:00
}
case ETH_P_IPV6 : {
const struct ipv6hdr * ip6hdr ;
2008-08-29 03:38:41 +08:00
/* IPv6 doesn't really use broadcast mac address, but leave
* that here just in case .
*/
2018-05-15 02:48:08 +08:00
if ( is_broadcast_ether_addr ( eth_data - > h_dest ) ) {
2014-04-23 07:30:18 +08:00
do_tx_balance = false ;
2005-04-17 06:20:36 +08:00
break ;
2008-08-29 03:38:41 +08:00
}
/* IPv6 uses all-nodes multicast as an equivalent to
* broadcasts in IPv4 .
*/
net, drivers/net: Convert compare_ether_addr_64bits to ether_addr_equal_64bits
Use the new bool function ether_addr_equal_64bits to add
some clarity and reduce the likelihood for misuse of
compare_ether_addr_64bits for sorting.
Done via cocci script:
$ cat compare_ether_addr_64bits.cocci
@@
expression a,b;
@@
- !compare_ether_addr_64bits(a, b)
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- compare_ether_addr_64bits(a, b)
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !ether_addr_equal_64bits(a, b) == 0
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !ether_addr_equal_64bits(a, b) != 0
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- ether_addr_equal_64bits(a, b) == 0
+ !ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- ether_addr_equal_64bits(a, b) != 0
+ ether_addr_equal_64bits(a, b)
@@
expression a,b;
@@
- !!ether_addr_equal_64bits(a, b)
+ ether_addr_equal_64bits(a, b)
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-05-10 01:04:04 +08:00
if ( ether_addr_equal_64bits ( eth_data - > h_dest , mac_v6_allmcast ) ) {
2014-04-23 07:30:18 +08:00
do_tx_balance = false ;
2008-08-29 03:38:41 +08:00
break ;
}
2020-02-05 11:26:05 +08:00
if ( ! pskb_network_may_pull ( skb , sizeof ( * ip6hdr ) ) ) {
do_tx_balance = false ;
break ;
}
/* Additionally, DAD probes should not be tx-balanced as that
2008-08-29 03:38:41 +08:00
* will lead to false positives for duplicate addresses and
* prevent address configuration from working .
*/
ip6hdr = ipv6_hdr ( skb ) ;
if ( ipv6_addr_any ( & ip6hdr - > saddr ) ) {
2014-04-23 07:30:18 +08:00
do_tx_balance = false ;
2008-08-29 03:38:41 +08:00
break ;
2005-04-17 06:20:36 +08:00
}
2020-02-05 11:26:05 +08:00
hash_start = ( char * ) & ip6hdr - > daddr ;
hash_size = sizeof ( ip6hdr - > daddr ) ;
2005-04-17 06:20:36 +08:00
break ;
2020-02-05 11:26:05 +08:00
}
2005-04-17 06:20:36 +08:00
case ETH_P_ARP :
2014-04-23 07:30:18 +08:00
do_tx_balance = false ;
2014-02-14 17:15:16 +08:00
if ( bond_info - > rlb_enabled )
2005-04-17 06:20:36 +08:00
tx_slave = rlb_arp_xmit ( skb , bond ) ;
break ;
default :
2014-04-23 07:30:18 +08:00
do_tx_balance = false ;
2005-04-17 06:20:36 +08:00
break ;
}
if ( do_tx_balance ) {
2018-05-15 02:48:09 +08:00
if ( bond - > params . tlb_dynamic_lb ) {
hash_index = _simple_hash ( hash_start , hash_size ) ;
tx_slave = tlb_choose_channel ( bond , hash_index , skb - > len ) ;
} else {
/*
* do_tx_balance means we are free to select the tx_slave
* So we do exactly what tlb would do for hash selection
*/
struct bond_up_slave * slaves ;
unsigned int count ;
2020-05-01 03:21:33 +08:00
slaves = rcu_dereference ( bond - > usable_slaves ) ;
2018-05-15 02:48:09 +08:00
count = slaves ? READ_ONCE ( slaves - > count ) : 0 ;
if ( likely ( count ) )
tx_slave = slaves - > arr [ bond_xmit_hash ( bond , skb ) %
count ] ;
}
2005-04-17 06:20:36 +08:00
}
2020-05-01 03:21:34 +08:00
return tx_slave ;
}
/* ALB mode transmit entry point: choose a slave for @skb and send it. */
netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);

	return bond_do_alb_xmit(skb, bond, bond_xmit_alb_slave_get(bond, skb));
}
2007-10-18 08:37:45 +08:00
void bond_alb_monitor ( struct work_struct * work )
2005-04-17 06:20:36 +08:00
{
2007-10-18 08:37:45 +08:00
struct bonding * bond = container_of ( work , struct bonding ,
alb_work . work ) ;
2005-04-17 06:20:36 +08:00
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
2013-09-25 15:20:14 +08:00
struct list_head * iter ;
2005-04-17 06:20:36 +08:00
struct slave * slave ;
2013-09-25 15:20:21 +08:00
if ( ! bond_has_slaves ( bond ) ) {
2021-12-03 10:27:18 +08:00
atomic_set ( & bond_info - > tx_rebalance_counter , 0 ) ;
2005-04-17 06:20:36 +08:00
bond_info - > lp_counter = 0 ;
goto re_arm ;
}
2013-12-13 10:19:45 +08:00
rcu_read_lock ( ) ;
2021-12-03 10:27:18 +08:00
atomic_inc ( & bond_info - > tx_rebalance_counter ) ;
2005-04-17 06:20:36 +08:00
bond_info - > lp_counter + + ;
/* send learning packets */
2013-09-13 23:05:33 +08:00
if ( bond_info - > lp_counter > = BOND_ALB_LP_TICKS ( bond ) ) {
2014-05-22 01:19:48 +08:00
bool strict_match ;
bond_for_each_slave_rcu ( bond , slave , iter ) {
/* If updating current_active, use all currently
2021-05-21 11:31:35 +08:00
* user mac addresses ( ! strict_match ) . Otherwise , only
2014-05-22 01:19:48 +08:00
* use mac of the slave device .
2014-06-05 04:23:38 +08:00
* In RLB mode , we always use strict matches .
2014-05-22 01:19:48 +08:00
*/
2014-07-15 21:56:55 +08:00
strict_match = ( slave ! = rcu_access_pointer ( bond - > curr_active_slave ) | |
2014-06-05 04:23:38 +08:00
bond_info - > rlb_enabled ) ;
2014-05-22 01:19:48 +08:00
alb_send_learning_packets ( slave , slave - > dev - > dev_addr ,
strict_match ) ;
}
2005-04-17 06:20:36 +08:00
bond_info - > lp_counter = 0 ;
}
/* rebalance tx traffic */
2021-12-03 10:27:18 +08:00
if ( atomic_read ( & bond_info - > tx_rebalance_counter ) > = BOND_TLB_REBALANCE_TICKS ) {
2013-12-13 10:19:45 +08:00
bond_for_each_slave_rcu ( bond , slave , iter ) {
2005-04-17 06:20:36 +08:00
tlb_clear_slave ( bond , slave , 1 ) ;
2014-07-15 21:56:55 +08:00
if ( slave = = rcu_access_pointer ( bond - > curr_active_slave ) ) {
2005-04-17 06:20:36 +08:00
SLAVE_TLB_INFO ( slave ) . load =
bond_info - > unbalanced_load /
BOND_TLB_REBALANCE_INTERVAL ;
bond_info - > unbalanced_load = 0 ;
}
}
2021-12-03 10:27:18 +08:00
atomic_set ( & bond_info - > tx_rebalance_counter , 0 ) ;
2005-04-17 06:20:36 +08:00
}
if ( bond_info - > rlb_enabled ) {
if ( bond_info - > primary_is_promisc & &
( + + bond_info - > rlb_promisc_timeout_counter > = RLB_PROMISC_TIMEOUT ) ) {
2014-09-15 23:19:34 +08:00
/* dev_set_promiscuity requires rtnl and
2013-10-28 12:11:22 +08:00
* nothing else . Avoid race with bond_close .
*/
2013-12-13 10:19:45 +08:00
rcu_read_unlock ( ) ;
if ( ! rtnl_trylock ( ) )
2013-10-28 12:11:22 +08:00
goto re_arm ;
2005-04-17 06:20:36 +08:00
bond_info - > rlb_promisc_timeout_counter = 0 ;
/* If the primary was set to promiscuous mode
* because a slave was disabled then
* it can now leave promiscuous mode .
*/
2014-07-15 21:56:55 +08:00
dev_set_promiscuity ( rtnl_dereference ( bond - > curr_active_slave ) - > dev ,
- 1 ) ;
2005-04-17 06:20:36 +08:00
bond_info - > primary_is_promisc = 0 ;
2013-10-28 12:11:22 +08:00
rtnl_unlock ( ) ;
2013-12-13 10:19:45 +08:00
rcu_read_lock ( ) ;
2007-10-18 08:37:51 +08:00
}
2005-04-17 06:20:36 +08:00
if ( bond_info - > rlb_rebalance ) {
bond_info - > rlb_rebalance = 0 ;
rlb_rebalance ( bond ) ;
}
/* check if clients need updating */
if ( bond_info - > rx_ntt ) {
if ( bond_info - > rlb_update_delay_counter ) {
- - bond_info - > rlb_update_delay_counter ;
} else {
rlb_update_rx_clients ( bond ) ;
2014-02-14 17:15:16 +08:00
if ( bond_info - > rlb_update_retry_counter )
2005-04-17 06:20:36 +08:00
- - bond_info - > rlb_update_retry_counter ;
2014-02-14 17:15:16 +08:00
else
2005-04-17 06:20:36 +08:00
bond_info - > rx_ntt = 0 ;
}
}
}
2013-12-13 10:19:45 +08:00
rcu_read_unlock ( ) ;
2005-04-17 06:20:36 +08:00
re_arm :
bonding: eliminate bond_close race conditions
This patch resolves two sets of race conditions.
Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com> reported the
first, as follows:
The bond_close() calls cancel_delayed_work() to cancel delayed works.
It, however, cannot cancel works that were already queued in workqueue.
The bond_open() initializes work->data, and proccess_one_work() refers
get_work_cwq(work)->wq->flags. The get_work_cwq() returns NULL when
work->data has been initialized. Thus, a panic occurs.
He included a patch that converted the cancel_delayed_work calls
in bond_close to flush_delayed_work_sync, which eliminated the above
problem.
His patch is incorporated, at least in principle, into this
patch. In this patch, we use cancel_delayed_work_sync in place of
flush_delayed_work_sync, and also convert bond_uninit in addition to
bond_close.
This conversion to _sync, however, opens new races between
bond_close and three periodically executing workqueue functions:
bond_mii_monitor, bond_alb_monitor and bond_activebackup_arp_mon.
The race occurs because bond_close and bond_uninit are always
called with RTNL held, and these workqueue functions may acquire RTNL to
perform failover-related activities. If bond_close or bond_uninit is
waiting in cancel_delayed_work_sync, deadlock occurs.
These deadlocks are resolved by having the workqueue functions
acquire RTNL conditionally. If the rtnl_trylock() fails, the functions
reschedule and return immediately. For the cases that are attempting to
perform link failover, a delay of 1 is used; for the other cases, the
normal interval is used (as those activities are not as time critical).
Additionally, the bond_mii_monitor function now stores the delay
in a variable (mimicing the structure of activebackup_arp_mon).
Lastly, all of the above renders the kill_timers sentinel moot,
and therefore it has been removed.
Tested-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2011-10-28 23:42:50 +08:00
queue_delayed_work ( bond - > wq , & bond - > alb_work , alb_delta_in_ticks ) ;
2005-04-17 06:20:36 +08:00
}
/* assumption: called before the slave is attached to the bond
* and not locked by the bond lock
*/
int bond_alb_init_slave ( struct bonding * bond , struct slave * slave )
{
int res ;
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
res = alb_set_slave_mac_addr ( slave , slave - > perm_hwaddr ,
slave - > dev - > addr_len ) ;
2014-02-14 17:15:16 +08:00
if ( res )
2005-04-17 06:20:36 +08:00
return res ;
res = alb_handle_addr_collision_on_attach ( bond , slave ) ;
2014-02-14 17:15:16 +08:00
if ( res )
2005-04-17 06:20:36 +08:00
return res ;
tlb_init_slave ( slave ) ;
/* order a rebalance ASAP */
2021-12-03 10:27:18 +08:00
atomic_set ( & bond - > alb_info . tx_rebalance_counter ,
BOND_TLB_REBALANCE_TICKS ) ;
2005-04-17 06:20:36 +08:00
2014-02-14 17:15:16 +08:00
if ( bond - > alb_info . rlb_enabled )
2005-04-17 06:20:36 +08:00
bond - > alb_info . rlb_rebalance = 1 ;
return 0 ;
}
2014-09-15 23:19:34 +08:00
/* Remove slave from tlb and rlb hash tables, and fix up MAC addresses
2008-01-18 08:24:59 +08:00
* if necessary .
*
* Caller must hold RTNL and no other locks
*/
2005-04-17 06:20:36 +08:00
void bond_alb_deinit_slave ( struct bonding * bond , struct slave * slave )
{
2013-09-25 15:20:21 +08:00
if ( bond_has_slaves ( bond ) )
2005-04-17 06:20:36 +08:00
alb_change_hw_addr_on_detach ( bond , slave ) ;
tlb_clear_slave ( bond , slave , 0 ) ;
if ( bond - > alb_info . rlb_enabled ) {
2013-09-25 15:20:17 +08:00
bond - > alb_info . rx_slave = NULL ;
2005-04-17 06:20:36 +08:00
rlb_clear_slave ( bond , slave ) ;
}
2014-07-17 02:10:36 +08:00
2005-04-17 06:20:36 +08:00
}
/* React to a link state change of @slave.
*
* BOND_LINK_DOWN: clear the tlb state of the slave and, when rlb is
* enabled, its rlb state as well.
* BOND_LINK_UP: set tx_rebalance_counter to BOND_TLB_REBALANCE_TICKS and,
* when rlb is enabled, set the rlb_rebalance flag.
* Any other @link value takes neither branch.
*
* In every case, if bond_is_nondyn_tlb() is true the slave array is rebuilt
* through bond_update_slave_arr(); a non-zero result there is only logged.
*
* @bond: bonding device the slave belongs to
* @slave: slave whose link state changed
* @link: new link state, compared against BOND_LINK_DOWN and BOND_LINK_UP
*/
void bond_alb_handle_link_change ( struct bonding * bond , struct slave * slave , char link )
{
struct alb_bond_info * bond_info = & ( BOND_ALB_INFO ( bond ) ) ;
if ( link = = BOND_LINK_DOWN ) {
tlb_clear_slave ( bond , slave , 0 ) ;
2014-02-14 17:15:17 +08:00
if ( bond - > alb_info . rlb_enabled )
2005-04-17 06:20:36 +08:00
rlb_clear_slave ( bond , slave ) ;
} else if ( link = = BOND_LINK_UP ) {
/* order a rebalance ASAP */
2021-12-03 10:27:18 +08:00
atomic_set ( & bond_info - > tx_rebalance_counter ,
BOND_TLB_REBALANCE_TICKS ) ;
2005-04-17 06:20:36 +08:00
if ( bond - > alb_info . rlb_enabled ) {
bond - > alb_info . rlb_rebalance = 1 ;
/* If the updelay module parameter is smaller than the
* forwarding delay of the switch the rebalance will
* not work because the rebalance arp replies will
* not be forwarded to the clients.
*/
}
}
2014-07-17 02:10:36 +08:00
/* the slave array is refreshed regardless of which branch ran above */
if ( bond_is_nondyn_tlb ( bond ) ) {
2014-10-05 08:45:01 +08:00
if ( bond_update_slave_arr ( bond , NULL ) )
2014-07-17 02:10:36 +08:00
pr_err ( " Failed to build slave-array for TLB mode. \n " ) ;
}
2005-04-17 06:20:36 +08:00
}
/**
* bond_alb_handle_active_change - assign new curr_active_slave
* @bond: our bonding struct
* @new_slave: new slave to assign
*
* Set the bond->curr_active_slave to @new_slave and handle
* mac address swapping and promiscuity changes as needed.
*
* Returns early without doing anything when @new_slave already is the
* current active slave. Otherwise the promiscuity bookkeeping of the old
* active slave is undone (if primary_is_promisc was set) and the pointer
* is published with rcu_assign_pointer(). The MAC handling that follows is
* skipped when @new_slave is NULL or the bond has no slaves.
*
* The MAC handling either swaps addresses between @new_slave and a swap
* partner (the old active slave or, failing that, whatever
* bond_slave_has_mac() returns for the bond address), or, without a swap
* partner, sets the bond address on @new_slave and sends learning packets.
*
2014-09-12 04:49:23 +08:00
* Caller must hold RTNL
2005-04-17 06:20:36 +08:00
*/
void bond_alb_handle_active_change ( struct bonding * bond , struct slave * new_slave )
{
struct slave * swap_slave ;
2014-07-15 21:56:55 +08:00
struct slave * curr_active ;
2005-04-17 06:20:36 +08:00
2014-09-12 04:49:23 +08:00
curr_active = rtnl_dereference ( bond - > curr_active_slave ) ;
2014-07-15 21:56:55 +08:00
/* nothing to do when the active slave does not change */
if ( curr_active = = new_slave )
2005-04-17 06:20:36 +08:00
return ;
2014-07-15 21:56:55 +08:00
/* undo the promiscuity change on the outgoing active slave and reset
* the related flag and timeout counter
*/
if ( curr_active & & bond - > alb_info . primary_is_promisc ) {
dev_set_promiscuity ( curr_active - > dev , - 1 ) ;
2005-04-17 06:20:36 +08:00
bond - > alb_info . primary_is_promisc = 0 ;
bond - > alb_info . rlb_promisc_timeout_counter = 0 ;
}
2014-07-15 21:56:55 +08:00
/* by default the swap partner is the previous active slave (may be NULL) */
swap_slave = curr_active ;
bonding: initial RCU conversion
This patch does the initial bonding conversion to RCU. After it the
following modes are protected by RCU alone: roundrobin, active-backup,
broadcast and xor. Modes ALB/TLB and 3ad still acquire bond->lock for
reading, and will be dealt with later. curr_active_slave needs to be
dereferenced via rcu in the converted modes because the only thing
protecting the slave after this patch is rcu_read_lock, so we need the
proper barrier for weakly ordered archs and to make sure we don't have
stale pointer. It's not tagged with __rcu yet because there's still work
to be done to remove the curr_slave_lock, so sparse will complain when
rcu_assign_pointer and rcu_dereference are used, but the alternative to use
rcu_dereference_protected would've created much bigger code churn which is
more difficult to test and review. That will be converted in time.
1. Active-backup mode
1.1 Perf recording while doing iperf -P 4
- old bonding: iperf spent 0.55% in bonding, system spent 0.29% CPU
in bonding
- new bonding: iperf spent 0.29% in bonding, system spent 0.15% CPU
in bonding
1.2. Bandwidth measurements
- old bonding: 16.1 gbps consistently
- new bonding: 17.5 gbps consistently
2. Round-robin mode
2.1 Perf recording while doing iperf -P 4
- old bonding: iperf spent 0.51% in bonding, system spent 0.24% CPU
in bonding
- new bonding: iperf spent 0.16% in bonding, system spent 0.11% CPU
in bonding
2.2 Bandwidth measurements
- old bonding: 8 gbps (variable due to packet reorderings)
- new bonding: 10 gbps (variable due to packet reorderings)
Of course the latency has improved in all converted modes, and moreover
while
doing enslave/release (since it doesn't affect tx anymore).
Also I've stress tested all modes doing enslave/release in a loop while
transmitting traffic.
Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-08-01 22:54:51 +08:00
rcu_assign_pointer ( bond - > curr_active_slave , new_slave ) ;
2005-04-17 06:20:36 +08:00
2013-09-25 15:20:21 +08:00
/* no MAC handling when there is no new active slave or no slaves at all */
if ( ! new_slave | | ! bond_has_slaves ( bond ) )
2005-04-17 06:20:36 +08:00
return ;
/* set the new curr_active_slave to the bonds mac address
* i.e. swap mac addresses of old curr_active_slave and new curr_active_slave
*/
2013-06-18 19:44:52 +08:00
/* no previous active slave: fall back to the result of
* bond_slave_has_mac() for the bond address (presumably the slave
* currently carrying that MAC - confirm against the helper)
*/
if ( ! swap_slave )
swap_slave = bond_slave_has_mac ( bond , bond - > dev - > dev_addr ) ;
2005-04-17 06:20:36 +08:00
2014-09-15 23:19:34 +08:00
/* Arrange for swap_slave and new_slave to temporarily be
2007-10-18 08:37:49 +08:00
* ignored so we can mess with their MAC addresses without
* fear of interference from transmit activity.
*/
2013-08-01 22:54:47 +08:00
if ( swap_slave )
2007-10-18 08:37:49 +08:00
tlb_clear_slave ( bond , swap_slave , 1 ) ;
tlb_clear_slave ( bond , new_slave , 1 ) ;
2013-10-07 15:17:20 +08:00
/* in TLB mode, the slave might flip down/up with the old dev_addr,
* and thus filter bond - > dev_addr ' s packets , so force bond ' s mac
*
* Sequence used below: the current address of new_slave is saved in
* tmp_addr, the bond address is pushed to the device through
* dev_set_mac_address() with the result ignored, and the saved
* address is then written back with dev_addr_set().
*/
2014-05-16 03:39:55 +08:00
if ( BOND_MODE ( bond ) = = BOND_MODE_TLB ) {
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
struct sockaddr_storage ss ;
u8 tmp_addr [ MAX_ADDR_LEN ] ;
2013-10-07 15:17:20 +08:00
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
/* keep a copy of the current address of new_slave for the write-back below */
bond_hw_addr_copy ( tmp_addr , new_slave - > dev - > dev_addr ,
new_slave - > dev - > addr_len ) ;
2013-10-07 15:17:20 +08:00
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
bond_hw_addr_copy ( ss . __data , bond - > dev - > dev_addr ,
bond - > dev - > addr_len ) ;
ss . ss_family = bond - > dev - > type ;
2013-10-07 15:17:20 +08:00
/* we don't care if it can't change its mac, best effort */
2018-12-13 19:54:30 +08:00
dev_set_mac_address ( new_slave - > dev , ( struct sockaddr * ) & ss ,
NULL ) ;
2013-10-07 15:17:20 +08:00
2021-10-23 07:20:59 +08:00
dev_addr_set ( new_slave - > dev , tmp_addr ) ;
2013-10-07 15:17:20 +08:00
}
2005-04-17 06:20:36 +08:00
/* curr_active_slave must be set before calling alb_swap_mac_addr */
if ( swap_slave ) {
/* swap mac address */
2013-05-28 07:14:51 +08:00
alb_swap_mac_addr ( swap_slave , new_slave ) ;
2007-10-18 08:37:49 +08:00
alb_fasten_mac_swap ( bond , swap_slave , new_slave ) ;
} else {
2013-06-18 19:44:52 +08:00
/* set the new_slave to the bond mac address */
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
alb_set_slave_mac_addr ( new_slave , bond - > dev - > dev_addr ,
bond - > dev - > addr_len ) ;
2014-05-22 01:19:48 +08:00
/* send learning packets for the bond address from new_slave, with strict_match false */
alb_send_learning_packets ( new_slave , bond - > dev - > dev_addr ,
false ) ;
2005-04-17 06:20:36 +08:00
}
}
2014-09-10 05:16:59 +08:00
/* Change the MAC address of the bond device while ALB state is in use.
*
* @bond_dev: bond net_device whose address is being changed
* @addr: new address; read here as a struct sockaddr_storage
*
* The address is validated with is_valid_ether_addr(), handed to
* alb_set_mac_address(), and then stored on @bond_dev with dev_addr_set().
* If there is a current active slave it is then given the new address:
* either by swapping with the slave that bond_slave_has_mac() returns, or,
* when that returns NULL, by setting the address directly, sending learning
* packets and, when rlb is enabled, requesting a client update.
*
* Returns -EADDRNOTAVAIL if validation fails, the non-zero result of
* alb_set_mac_address() if that call fails, and 0 otherwise.
*
* Called with RTNL
*/
2005-04-17 06:20:36 +08:00
int bond_alb_set_mac_address ( struct net_device * bond_dev , void * addr )
{
2008-11-13 15:37:49 +08:00
struct bonding * bond = netdev_priv ( bond_dev ) ;
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
struct sockaddr_storage * ss = addr ;
2014-07-15 21:56:55 +08:00
struct slave * curr_active ;
2013-06-18 19:44:52 +08:00
struct slave * swap_slave ;
2005-04-17 06:20:36 +08:00
int res ;
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
/* NOTE(review): validation is Ethernet-specific although the address
* arrives in a sockaddr_storage - confirm this is intended for devices
* with longer hardware addresses
*/
if ( ! is_valid_ether_addr ( ss - > __data ) )
2005-04-17 06:20:36 +08:00
return - EADDRNOTAVAIL ;
/* presumably pushes the address to the slaves; any failure aborts
* before the bond device address is touched - verify in the helper
*/
res = alb_set_mac_address ( bond , addr ) ;
2014-02-14 17:15:17 +08:00
if ( res )
2005-04-17 06:20:36 +08:00
return res ;
2021-10-23 07:20:59 +08:00
/* record the new address on the bond device itself */
dev_addr_set ( bond_dev , ss - > __data ) ;
2005-04-17 06:20:36 +08:00
/* If there is no curr_active_slave there is nothing else to do.
* Otherwise we ' ll need to pass the new address to it and handle
* duplications .
*/
2014-07-15 21:56:55 +08:00
curr_active = rtnl_dereference ( bond - > curr_active_slave ) ;
if ( ! curr_active )
2005-04-17 06:20:36 +08:00
return 0 ;
2013-06-18 19:44:52 +08:00
/* bond_dev->dev_addr already holds the new address at this point */
swap_slave = bond_slave_has_mac ( bond , bond_dev - > dev_addr ) ;
2005-04-17 06:20:36 +08:00
/* a slave was returned for the new address: swap with the active
* slave; otherwise set the address on the active slave directly
*/
if ( swap_slave ) {
2014-07-15 21:56:55 +08:00
alb_swap_mac_addr ( swap_slave , curr_active ) ;
alb_fasten_mac_swap ( bond , swap_slave , curr_active ) ;
2005-04-17 06:20:36 +08:00
} else {
bonding: attempt to better support longer hw addresses
People are using bonding over Infiniband IPoIB connections, and who knows
what else. Infiniband has a hardware address length of 20 octets
(INFINIBAND_ALEN), and the network core defines a MAX_ADDR_LEN of 32.
Various places in the bonding code are currently hard-wired to 6 octets
(ETH_ALEN), such as the 3ad code, which I've left untouched here. Besides,
only alb is currently possible on Infiniband links right now anyway, due
to commit 1533e7731522, so the alb code is where most of the changes are.
One major component of this change is the addition of a bond_hw_addr_copy
function that takes a length argument, instead of using ether_addr_copy
everywhere that hardware addresses need to be copied about. The other
major component of this change is converting the bonding code from using
struct sockaddr for address storage to struct sockaddr_storage, as the
former has an address storage space of only 14, while the latter is 128
minus a few, which is necessary to support bonding over device with up to
MAX_ADDR_LEN octet hardware addresses. Additionally, this probably fixes
up some memory corruption issues with the current code, where it's
possible to write an infiniband hardware address into a sockaddr declared
on the stack.
Lightly tested on a dual mlx4 IPoIB setup, which properly shows a 20-octet
hardware address now:
$ cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: fault-tolerance (active-backup) (fail_over_mac active)
Primary Slave: mlx4_ib0 (primary_reselect always)
Currently Active Slave: mlx4_ib0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 100
Down Delay (ms): 100
Slave Interface: mlx4_ib0
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:08:fe:80:00:00:00:00:00:00:e4:1d:2d:03:00:1d:67:01
Slave queue ID: 0
Slave Interface: mlx4_ib1
MII Status: up
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr:
80:00:02:09:fe:80:00:00:00:00:00:01:e4:1d:2d:03:00:1d:67:02
Slave queue ID: 0
Also tested with a standard 1Gbps NIC bonding setup (with a mix of
e1000 and e1000e cards), running LNST's bonding tests.
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-04-05 05:32:42 +08:00
alb_set_slave_mac_addr ( curr_active , bond_dev - > dev_addr ,
bond_dev - > addr_len ) ;
2005-04-17 06:20:36 +08:00
2014-07-15 21:56:55 +08:00
alb_send_learning_packets ( curr_active ,
2014-05-22 01:19:48 +08:00
bond_dev - > dev_addr , false ) ;
2005-04-17 06:20:36 +08:00
if ( bond - > alb_info . rlb_enabled ) {
/* inform clients mac address has changed */
2014-07-15 21:56:55 +08:00
rlb_req_update_slave_clients ( bond , curr_active ) ;
2005-04-17 06:20:36 +08:00
}
}
return 0 ;
}
/* Forward a VLAN removal to the rlb code.
*
* Calls rlb_clear_vlan() for @vlan_id only when rlb is enabled on @bond;
* otherwise this is a no-op.
*
* @bond: bonding device whose alb_info.rlb_enabled flag is checked
* @vlan_id: VLAN id passed through to rlb_clear_vlan()
*/
void bond_alb_clear_vlan ( struct bonding * bond , unsigned short vlan_id )
{
2014-02-14 17:15:17 +08:00
if ( bond - > alb_info . rlb_enabled )
2005-04-17 06:20:36 +08:00
rlb_clear_vlan ( bond , vlan_id ) ;
}