Merge branch 'smc-RDMA-net-namespace'

Tony Lu says:

====================
RDMA device net namespace support for SMC

This patch set introduces net namespace support for linkgroups.

Path 1 is the main approach to implement net ns support.

Path 2 - 4 are the additional modifications to let us know the netns.
Also, I will submit changes of smc-tools to github later.

Currently, smc doesn't support net namespace isolation. The ibdevs
registered to smc are shared for all linkgroups and connections. When
running applications in different net namespaces, such as container
environment, applications should only use the ibdevs that belongs to the
same net namespace.

This adds a new field, net, in smc linkgroup struct. During first
contact, it checks and find the linkgroup has same net namespace, if
not, it is going to create and initialized the net field with first
link's ibdev net namespace. When finding the rdma devices, it also checks
the sk net device's and ibdev's net namespaces. After net namespace
destroyed, the net device and ibdev move to root net namespace,
linkgroups won't be matched, and wait for lgr free.

If rdma net namespace exclusive mode is not enabled, it behaves as
before.

Steps to enable and test net namespaces:

1. enable RDMA device net namespace exclusive support
	rdma system set netns exclusive # default is shared

2. create new net namespace, move and initialize them
	ip netns add test1
	rdma dev set mlx5_1 netns test1
	ip link set dev eth2 netns test1
	ip netns exec test1 ip link set eth2 up
	ip netns exec test1 ip addr add ${HOST_IP}/26 dev eth2

3. setup server and client, connect N <-> M
	ip netns exec test1 smc_run sockperf server --tcp # server
	ip netns exec test1 smc_run sockperf pp --tcp -i ${SERVER_IP} # client

4. netns isolated linkgroups (2 * 2 mesh) with their own linkgroups
  - server
LG-ID    LG-Role  LG-Type  VLAN  #Conns  PNET-ID
00000100 SERV     SINGLE      0       0
00000200 SERV     SINGLE      0       0
00000300 SERV     SINGLE      0       0
00000400 SERV     SINGLE      0       0

  - client
LG-ID    LG-Role  LG-Type  VLAN  #Conns  PNET-ID
00000100 CLNT     SINGLE      0       0
00000200 CLNT     SINGLE      0       0
00000300 CLNT     SINGLE      0       0
00000400 CLNT     SINGLE      0       0
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2022-01-02 12:07:39 +00:00
commit ab6dd952b2
9 changed files with 92 additions and 40 deletions

View File

@ -119,6 +119,8 @@ enum {
SMC_NLA_LGR_R_CONNS_NUM, /* u32 */
SMC_NLA_LGR_R_V2_COMMON, /* nest */
SMC_NLA_LGR_R_V2, /* nest */
SMC_NLA_LGR_R_NET_COOKIE, /* u64 */
SMC_NLA_LGR_R_PAD, /* flag */
__SMC_NLA_LGR_R_MAX,
SMC_NLA_LGR_R_MAX = __SMC_NLA_LGR_R_MAX - 1
};

View File

@ -84,11 +84,12 @@ struct smc_diag_conninfo {
/* SMC_DIAG_LINKINFO */
struct smc_diag_linkinfo {
__u8 link_id; /* link identifier */
__u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */
__u8 ibport; /* RDMA device port number */
__u8 gid[40]; /* local GID */
__u8 peer_gid[40]; /* peer GID */
__u8 link_id; /* link identifier */
__u8 ibname[IB_DEVICE_NAME_MAX]; /* name of the RDMA device */
__u8 ibport; /* RDMA device port number */
__u8 gid[40]; /* local GID */
__u8 peer_gid[40]; /* peer GID */
__aligned_u64 net_cookie; /* RDMA device net namespace */
};
struct smc_diag_lgrinfo {

View File

@ -348,6 +348,9 @@ static int smc_nl_fill_lgr(struct smc_link_group *lgr,
goto errattr;
if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id))
goto errattr;
if (nla_put_u64_64bit(skb, SMC_NLA_LGR_R_NET_COOKIE,
lgr->net->net_cookie, SMC_NLA_LGR_R_PAD))
goto errattr;
memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN);
smc_target[SMC_MAX_PNETID_LEN] = 0;
if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target))
@ -897,6 +900,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
smc_wr_free_lgr_mem(lgr);
goto free_wq;
}
lgr->net = smc_ib_net(lnk->smcibdev);
lgr_list = &smc_lgr_list.list;
lgr_lock = &smc_lgr_list.lock;
atomic_inc(&lgr_cnt);
@ -1548,9 +1552,9 @@ void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type)
lgr_type = "ASYMMETRIC_LOCAL";
break;
}
pr_warn_ratelimited("smc: SMC-R lg %*phN state changed: "
pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu state changed: "
"%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id,
lgr_type, lgr->pnet_id);
lgr->net->net_cookie, lgr_type, lgr->pnet_id);
}
/* set new lgr type and tag a link as asymmetric */
@ -1585,7 +1589,8 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport)
if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
SMC_MAX_PNETID_LEN) ||
lgr->type == SMC_LGR_SYMMETRIC ||
lgr->type == SMC_LGR_ASYMMETRIC_PEER)
lgr->type == SMC_LGR_ASYMMETRIC_PEER ||
!rdma_dev_access_netns(smcibdev->ibdev, lgr->net))
continue;
/* trigger local add link processing */
@ -1743,8 +1748,10 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version,
u8 peer_systemid[],
u8 peer_gid[],
u8 peer_mac_v1[],
enum smc_lgr_role role, u32 clcqpn)
enum smc_lgr_role role, u32 clcqpn,
struct net *net)
{
struct smc_link *lnk;
int i;
if (memcmp(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN) ||
@ -1752,12 +1759,17 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version,
return false;
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
if (!smc_link_active(&lgr->lnk[i]))
lnk = &lgr->lnk[i];
if (!smc_link_active(lnk))
continue;
if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) &&
!memcmp(lgr->lnk[i].peer_gid, peer_gid, SMC_GID_SIZE) &&
/* use verbs API to check netns, instead of lgr->net */
if (!rdma_dev_access_netns(lnk->smcibdev->ibdev, net))
return false;
if ((lgr->role == SMC_SERV || lnk->peer_qpn == clcqpn) &&
!memcmp(lnk->peer_gid, peer_gid, SMC_GID_SIZE) &&
(smcr_version == SMC_V2 ||
!memcmp(lgr->lnk[i].peer_mac, peer_mac_v1, ETH_ALEN)))
!memcmp(lnk->peer_mac, peer_mac_v1, ETH_ALEN)))
return true;
}
return false;
@ -1773,6 +1785,7 @@ static bool smcd_lgr_match(struct smc_link_group *lgr,
int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
{
struct smc_connection *conn = &smc->conn;
struct net *net = sock_net(&smc->sk);
struct list_head *lgr_list;
struct smc_link_group *lgr;
enum smc_lgr_role role;
@ -1799,7 +1812,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
smcr_lgr_match(lgr, ini->smcr_version,
ini->peer_systemid,
ini->peer_gid, ini->peer_mac, role,
ini->ib_clcqpn)) &&
ini->ib_clcqpn, net)) &&
!lgr->sync_err &&
(ini->smcd_version == SMC_V2 ||
lgr->vlan_id == ini->vlan_id) &&

View File

@ -306,6 +306,8 @@ struct smc_link_group {
u8 nexthop_mac[ETH_ALEN];
u8 uses_gateway;
__be32 saddr;
/* net namespace */
struct net *net;
};
struct { /* SMC-D */
u64 peer_gid;

View File

@ -145,19 +145,21 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
if (smc->conn.lgr && !smc->conn.lgr->is_smcd &&
(req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) &&
!list_empty(&smc->conn.lgr->list)) {
struct smc_link *link = smc->conn.lnk;
struct net *net = read_pnet(&link->smcibdev->ibdev->coredev.rdma_net);
struct smc_diag_lgrinfo linfo = {
.role = smc->conn.lgr->role,
.lnk[0].ibport = smc->conn.lnk->ibport,
.lnk[0].link_id = smc->conn.lnk->link_id,
.lnk[0].ibport = link->ibport,
.lnk[0].link_id = link->link_id,
.lnk[0].net_cookie = net->net_cookie,
};
memcpy(linfo.lnk[0].ibname,
smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
sizeof(smc->conn.lnk->smcibdev->ibdev->name));
smc_gid_be16_convert(linfo.lnk[0].gid,
smc->conn.lnk->gid);
smc_gid_be16_convert(linfo.lnk[0].peer_gid,
smc->conn.lnk->peer_gid);
sizeof(link->smcibdev->ibdev->name));
smc_gid_be16_convert(linfo.lnk[0].gid, link->gid);
smc_gid_be16_convert(linfo.lnk[0].peer_gid, link->peer_gid);
if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
goto errout;

View File

@ -69,6 +69,13 @@ static inline __be32 smc_ib_gid_to_ipv4(u8 gid[SMC_GID_SIZE])
return cpu_to_be32(INADDR_NONE);
}
static inline struct net *smc_ib_net(struct smc_ib_device *smcibdev)
{
if (smcibdev && smcibdev->ibdev)
return read_pnet(&smcibdev->ibdev->coredev.rdma_net);
return NULL;
}
struct smc_init_info_smcrv2;
struct smc_buf_desc;
struct smc_link;

View File

@ -242,9 +242,10 @@ static void smc_llc_flow_parallel(struct smc_link_group *lgr, u8 flow_type,
}
/* drop parallel or already-in-progress llc requests */
if (flow_type != msg_type)
pr_warn_once("smc: SMC-R lg %*phN dropped parallel "
pr_warn_once("smc: SMC-R lg %*phN net %llu dropped parallel "
"LLC msg: msg %d flow %d role %d\n",
SMC_LGR_ID_SIZE, &lgr->id,
lgr->net->net_cookie,
qentry->msg.raw.hdr.common.type,
flow_type, lgr->role);
kfree(qentry);
@ -359,9 +360,10 @@ struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr,
smc_llc_flow_qentry_clr(flow));
return NULL;
}
pr_warn_once("smc: SMC-R lg %*phN dropped unexpected LLC msg: "
pr_warn_once("smc: SMC-R lg %*phN net %llu dropped unexpected LLC msg: "
"msg %d exp %d flow %d role %d flags %x\n",
SMC_LGR_ID_SIZE, &lgr->id, rcv_msg, exp_msg,
SMC_LGR_ID_SIZE, &lgr->id, lgr->net->net_cookie,
rcv_msg, exp_msg,
flow->type, lgr->role,
flow->qentry->msg.raw.hdr.flags);
smc_llc_flow_qentry_del(flow);
@ -1816,8 +1818,9 @@ finish:
static void smc_llc_protocol_violation(struct smc_link_group *lgr, u8 type)
{
pr_warn_ratelimited("smc: SMC-R lg %*phN LLC protocol violation: "
"llc_type %d\n", SMC_LGR_ID_SIZE, &lgr->id, type);
pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu LLC protocol violation: "
"llc_type %d\n", SMC_LGR_ID_SIZE, &lgr->id,
lgr->net->net_cookie, type);
smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_PROT_VIOL);
smc_lgr_terminate_sched(lgr);
}
@ -2146,9 +2149,10 @@ int smc_llc_link_init(struct smc_link *link)
void smc_llc_link_active(struct smc_link *link)
{
pr_warn_ratelimited("smc: SMC-R lg %*phN link added: id %*phN, "
pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu link added: id %*phN, "
"peerid %*phN, ibdev %s, ibport %d\n",
SMC_LGR_ID_SIZE, &link->lgr->id,
link->lgr->net->net_cookie,
SMC_LGR_ID_SIZE, &link->link_uid,
SMC_LGR_ID_SIZE, &link->peer_link_uid,
link->smcibdev->ibdev->name, link->ibport);
@ -2164,9 +2168,10 @@ void smc_llc_link_active(struct smc_link *link)
void smc_llc_link_clear(struct smc_link *link, bool log)
{
if (log)
pr_warn_ratelimited("smc: SMC-R lg %*phN link removed: id %*phN"
pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu link removed: id %*phN"
", peerid %*phN, ibdev %s, ibport %d\n",
SMC_LGR_ID_SIZE, &link->lgr->id,
link->lgr->net->net_cookie,
SMC_LGR_ID_SIZE, &link->link_uid,
SMC_LGR_ID_SIZE, &link->peer_link_uid,
link->smcibdev->ibdev->name, link->ibport);

View File

@ -977,14 +977,16 @@ static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i,
/* find a roce device for the given pnetid */
static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id,
struct smc_init_info *ini,
struct smc_ib_device *known_dev)
struct smc_ib_device *known_dev,
struct net *net)
{
struct smc_ib_device *ibdev;
int i;
mutex_lock(&smc_ib_devices.mutex);
list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
if (ibdev == known_dev)
if (ibdev == known_dev ||
!rdma_dev_access_netns(ibdev->ibdev, net))
continue;
for (i = 1; i <= SMC_MAX_PORTS; i++) {
if (!rdma_is_port_valid(ibdev->ibdev, i))
@ -1001,12 +1003,14 @@ out:
mutex_unlock(&smc_ib_devices.mutex);
}
/* find alternate roce device with same pnet_id and vlan_id */
/* find alternate roce device with same pnet_id, vlan_id and net namespace */
void smc_pnet_find_alt_roce(struct smc_link_group *lgr,
struct smc_init_info *ini,
struct smc_ib_device *known_dev)
{
_smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev);
struct net *net = lgr->net;
_smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev, net);
}
/* if handshake network device belongs to a roce device, return its
@ -1015,6 +1019,7 @@ void smc_pnet_find_alt_roce(struct smc_link_group *lgr,
static void smc_pnet_find_rdma_dev(struct net_device *netdev,
struct smc_init_info *ini)
{
struct net *net = dev_net(netdev);
struct smc_ib_device *ibdev;
mutex_lock(&smc_ib_devices.mutex);
@ -1022,6 +1027,10 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev,
struct net_device *ndev;
int i;
/* check rdma net namespace */
if (!rdma_dev_access_netns(ibdev->ibdev, net))
continue;
for (i = 1; i <= SMC_MAX_PORTS; i++) {
if (!rdma_is_port_valid(ibdev->ibdev, i))
continue;
@ -1052,15 +1061,17 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
struct smc_init_info *ini)
{
u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
struct net *net;
ndev = pnet_find_base_ndev(ndev);
net = dev_net(ndev);
if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
ndev_pnetid) &&
smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) {
smc_pnet_find_rdma_dev(ndev, ini);
return; /* pnetid could not be determined */
}
_smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL);
_smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net);
}
static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,

View File

@ -22,6 +22,7 @@ TRACE_EVENT(smc_switch_to_fallback,
TP_STRUCT__entry(
__field(const void *, sk)
__field(const void *, clcsk)
__field(u64, net_cookie)
__field(int, fallback_rsn)
),
@ -31,11 +32,13 @@ TRACE_EVENT(smc_switch_to_fallback,
__entry->sk = sk;
__entry->clcsk = clcsk;
__entry->net_cookie = sock_net(sk)->net_cookie;
__entry->fallback_rsn = fallback_rsn;
),
TP_printk("sk=%p clcsk=%p fallback_rsn=%d",
__entry->sk, __entry->clcsk, __entry->fallback_rsn)
TP_printk("sk=%p clcsk=%p net=%llu fallback_rsn=%d",
__entry->sk, __entry->clcsk,
__entry->net_cookie, __entry->fallback_rsn)
);
DECLARE_EVENT_CLASS(smc_msg_event,
@ -46,19 +49,23 @@ DECLARE_EVENT_CLASS(smc_msg_event,
TP_STRUCT__entry(
__field(const void *, smc)
__field(u64, net_cookie)
__field(size_t, len)
__string(name, smc->conn.lnk->ibname)
),
TP_fast_assign(
const struct sock *sk = &smc->sk;
__entry->smc = smc;
__entry->net_cookie = sock_net(sk)->net_cookie;
__entry->len = len;
__assign_str(name, smc->conn.lnk->ibname);
),
TP_printk("smc=%p len=%zu dev=%s",
__entry->smc, __entry->len,
__get_str(name))
TP_printk("smc=%p net=%llu len=%zu dev=%s",
__entry->smc, __entry->net_cookie,
__entry->len, __get_str(name))
);
DEFINE_EVENT(smc_msg_event, smc_tx_sendmsg,
@ -84,6 +91,7 @@ TRACE_EVENT(smcr_link_down,
TP_STRUCT__entry(
__field(const void *, lnk)
__field(const void *, lgr)
__field(u64, net_cookie)
__field(int, state)
__string(name, lnk->ibname)
__field(void *, location)
@ -94,13 +102,14 @@ TRACE_EVENT(smcr_link_down,
__entry->lnk = lnk;
__entry->lgr = lgr;
__entry->net_cookie = lgr->net->net_cookie;
__entry->state = lnk->state;
__assign_str(name, lnk->ibname);
__entry->location = location;
),
TP_printk("lnk=%p lgr=%p state=%d dev=%s location=%pS",
__entry->lnk, __entry->lgr,
TP_printk("lnk=%p lgr=%p net=%llu state=%d dev=%s location=%pS",
__entry->lnk, __entry->lgr, __entry->net_cookie,
__entry->state, __get_str(name),
__entry->location)
);