RDMA/core: Implement compat device/sysfs tree in net namespace

Implement a compatibility layer of ib_core sysfs entries so that
non-init_net net namespaces can also discover rdma devices. Each
non-init_net net namespace gets its own ib_core_device, whose sysfs
tree mirrors the rdma devices found in the init_net namespace. This
allows rdma devices to be discovered via sysfs in multiple
non-init_net net namespaces, which is helpful to rdma-core userspace.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
parent 62dfa7955e
commit 4e0f7b9070
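What this buys userspace, in concrete terms: a process in a non-init_net
network namespace that mounts its own sysfs instance will find the compat
device tree under class/infiniband there. A minimal sketch (hypothetical
mount point, assumes root and a kernel with this patch applied):

#define _GNU_SOURCE
#include <dirent.h>
#include <sched.h>
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        struct dirent *de;
        DIR *d;

        /* Move into a fresh, non-init_net network namespace. */
        if (unshare(CLONE_NEWNET)) {
                perror("unshare");
                return 1;
        }
        /* sysfs instances are tagged with the mounting task's netns, so a
         * new mount reflects this namespace; /tmp/sysfs must exist. */
        if (mount("none", "/tmp/sysfs", "sysfs", 0, NULL)) {
                perror("mount");
                return 1;
        }
        /* The compat ib_core_device entries added by this patch show up
         * here; before it, this directory was empty outside init_net. */
        d = opendir("/tmp/sysfs/class/infiniband");
        if (!d) {
                perror("opendir");
                return 1;
        }
        while ((de = readdir(d)))
                printf("%s\n", de->d_name);
        closedir(d);
        return 0;
}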
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -38,6 +38,8 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/hashtable.h>
@@ -101,6 +103,30 @@ static DECLARE_RWSEM(clients_rwsem);
  * be registered.
  */
 #define CLIENT_DATA_REGISTERED XA_MARK_1
+
+/**
+ * struct rdma_dev_net - rdma net namespace metadata for a net
+ * @net: Pointer to owner net namespace
+ * @id:  xarray id to identify the net namespace.
+ */
+struct rdma_dev_net {
+        possible_net_t net;
+        u32 id;
+};
+
+static unsigned int rdma_dev_net_id;
+
+/*
+ * A list of net namespaces is maintained in an xarray. This is necessary
+ * because we can't get the locking right using the existing net ns list. We
+ * would require an init_net callback after the list is updated.
+ */
+static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
+/*
+ * rwsem to protect accessing the rdma_nets xarray entries.
+ */
+static DECLARE_RWSEM(rdma_nets_rwsem);
+
 /*
  * xarray has this behavior where it won't iterate over NULL values stored in
  * allocated arrays. So we need our own iterator to see all values stored in
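The comment about NULL values deserves a note: in an allocating xarray an
index can be occupied yet hold NULL, either after xa_reserve() or after NULL
is stored to retire an entry (as rdma_dev_exit_net() does further down), and
xa_for_each() silently skips such slots while xa_load() still sees them as
NULL. An illustrative kernel-context fragment, not part of the patch:

static DEFINE_XARRAY_FLAGS(demo_xa, XA_FLAGS_ALLOC);

static void demo_null_entries(void)
{
        unsigned long index;
        void *entry;

        /* Reserve index 0: the slot is allocated but holds NULL. */
        xa_reserve(&demo_xa, 0, GFP_KERNEL);
        WARN_ON(xa_load(&demo_xa, 0) != NULL);   /* loads observe NULL */
        xa_for_each (&demo_xa, index, entry)     /* iteration skips the slot */
                pr_info("visited %lu\n", index); /* never printed */
}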
@@ -268,6 +294,26 @@ struct ib_device *ib_device_get_by_name(const char *name,
 }
 EXPORT_SYMBOL(ib_device_get_by_name);
 
+static int rename_compat_devs(struct ib_device *device)
+{
+        struct ib_core_device *cdev;
+        unsigned long index;
+        int ret = 0;
+
+        mutex_lock(&device->compat_devs_mutex);
+        xa_for_each (&device->compat_devs, index, cdev) {
+                ret = device_rename(&cdev->dev, dev_name(&device->dev));
+                if (ret) {
+                        dev_warn(&cdev->dev,
+                                 "Fail to rename compatdev to new name %s\n",
+                                 dev_name(&device->dev));
+                        break;
+                }
+        }
+        mutex_unlock(&device->compat_devs_mutex);
+        return ret;
+}
+
 int ib_device_rename(struct ib_device *ibdev, const char *name)
 {
         int ret;
@@ -287,6 +333,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
         if (ret)
                 goto out;
         strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
+        ret = rename_compat_devs(ibdev);
 out:
         up_write(&devices_rwsem);
         return ret;
@@ -336,6 +383,7 @@ static void ib_device_release(struct device *device)
         WARN_ON(refcount_read(&dev->refcount));
         ib_cache_release_one(dev);
         ib_security_release_port_pkey_list(dev);
+        xa_destroy(&dev->compat_devs);
         xa_destroy(&dev->client_data);
         if (dev->port_data)
                 kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
@@ -359,7 +407,10 @@ static int ib_device_uevent(struct device *device,
 
 static const void *net_namespace(struct device *d)
 {
-        return &init_net;
+        struct ib_core_device *coredev =
+                        container_of(d, struct ib_core_device, dev);
+
+        return read_pnet(&coredev->rdma_net);
 }
 
 static struct class ib_class = {
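For context, net_namespace() is wired up as ib_class's ->namespace()
callback (unchanged by this patch; reproduced here from memory of device.c,
so treat the member list as approximate). With a netns-aware kobj_ns_type,
the driver core shows each registered device only in sysfs mounts whose
network namespace matches the pointer this callback returns, which is what
makes the per-namespace compat devices below visible in the right place:

static struct class ib_class = {
        .name           = "infiniband",
        .dev_release    = ib_device_release,
        .dev_uevent     = ib_device_uevent,
        .ns_type        = &net_ns_type_operations,
        .namespace      = net_namespace,
};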
@@ -371,7 +422,7 @@ static struct class ib_class = {
 };
 
 static void rdma_init_coredev(struct ib_core_device *coredev,
-                              struct ib_device *dev)
+                              struct ib_device *dev, struct net *net)
 {
         /* This BUILD_BUG_ON is intended to catch layout change
          * of union of ib_core_device and device.
@@ -387,6 +438,7 @@ static void rdma_init_coredev(struct ib_core_device *coredev,
         device_initialize(&coredev->dev);
         coredev->owner = dev;
         INIT_LIST_HEAD(&coredev->port_list);
+        write_pnet(&coredev->rdma_net, net);
 }
 
 /**
@@ -416,7 +468,7 @@ struct ib_device *_ib_alloc_device(size_t size)
         }
 
         device->groups[0] = &ib_dev_attr_group;
-        rdma_init_coredev(&device->coredev, device);
+        rdma_init_coredev(&device->coredev, device, &init_net);
 
         INIT_LIST_HEAD(&device->event_handler_list);
         spin_lock_init(&device->event_handler_lock);
|
@ -427,6 +479,8 @@ struct ib_device *_ib_alloc_device(size_t size)
|
||||||
*/
|
*/
|
||||||
xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
|
xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
|
||||||
init_rwsem(&device->client_data_rwsem);
|
init_rwsem(&device->client_data_rwsem);
|
||||||
|
xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
|
||||||
|
mutex_init(&device->compat_devs_mutex);
|
||||||
init_completion(&device->unreg_completion);
|
init_completion(&device->unreg_completion);
|
||||||
INIT_WORK(&device->unregistration_work, ib_unregister_work);
|
INIT_WORK(&device->unregistration_work, ib_unregister_work);
|
||||||
|
|
||||||
|
@@ -459,6 +513,7 @@ void ib_dealloc_device(struct ib_device *device)
         /* Expedite releasing netdev references */
         free_netdevs(device);
 
+        WARN_ON(!xa_empty(&device->compat_devs));
         WARN_ON(!xa_empty(&device->client_data));
         WARN_ON(refcount_read(&device->refcount));
         rdma_restrack_clean(device);
|
@ -667,6 +722,180 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,
|
||||||
return NOTIFY_OK;
|
return NOTIFY_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void compatdev_release(struct device *dev)
|
||||||
|
{
|
||||||
|
struct ib_core_device *cdev =
|
||||||
|
container_of(dev, struct ib_core_device, dev);
|
||||||
|
|
||||||
|
kfree(cdev);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int add_one_compat_dev(struct ib_device *device,
|
||||||
|
struct rdma_dev_net *rnet)
|
||||||
|
{
|
||||||
|
struct ib_core_device *cdev;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Create and add compat device in all namespaces other than where it
|
||||||
|
* is currently bound to.
|
||||||
|
*/
|
||||||
|
if (net_eq(read_pnet(&rnet->net),
|
||||||
|
read_pnet(&device->coredev.rdma_net)))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The first of init_net() or ib_register_device() to take the
|
||||||
|
* compat_devs_mutex wins and gets to add the device. Others will wait
|
||||||
|
* for completion here.
|
||||||
|
*/
|
||||||
|
mutex_lock(&device->compat_devs_mutex);
|
||||||
|
cdev = xa_load(&device->compat_devs, rnet->id);
|
||||||
|
if (cdev) {
|
||||||
|
ret = 0;
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
|
||||||
|
if (ret)
|
||||||
|
goto done;
|
||||||
|
|
||||||
|
cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
|
||||||
|
if (!cdev) {
|
||||||
|
ret = -ENOMEM;
|
||||||
|
goto cdev_err;
|
||||||
|
}
|
||||||
|
|
||||||
|
cdev->dev.parent = device->dev.parent;
|
||||||
|
rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
|
||||||
|
cdev->dev.release = compatdev_release;
|
||||||
|
dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));
|
||||||
|
|
||||||
|
ret = device_add(&cdev->dev);
|
||||||
|
if (ret)
|
||||||
|
goto add_err;
|
||||||
|
|
||||||
|
ret = xa_err(xa_store(&device->compat_devs, rnet->id,
|
||||||
|
cdev, GFP_KERNEL));
|
||||||
|
if (ret)
|
||||||
|
goto insert_err;
|
||||||
|
|
||||||
|
mutex_unlock(&device->compat_devs_mutex);
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
insert_err:
|
||||||
|
device_del(&cdev->dev);
|
||||||
|
add_err:
|
||||||
|
put_device(&cdev->dev);
|
||||||
|
cdev_err:
|
||||||
|
xa_release(&device->compat_devs, rnet->id);
|
||||||
|
done:
|
||||||
|
mutex_unlock(&device->compat_devs_mutex);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void remove_one_compat_dev(struct ib_device *device, u32 id)
|
||||||
|
{
|
||||||
|
struct ib_core_device *cdev;
|
||||||
|
|
||||||
|
mutex_lock(&device->compat_devs_mutex);
|
||||||
|
cdev = xa_erase(&device->compat_devs, id);
|
||||||
|
mutex_unlock(&device->compat_devs_mutex);
|
||||||
|
if (cdev) {
|
||||||
|
device_del(&cdev->dev);
|
||||||
|
put_device(&cdev->dev);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void remove_compat_devs(struct ib_device *device)
|
||||||
|
{
|
||||||
|
struct ib_core_device *cdev;
|
||||||
|
unsigned long index;
|
||||||
|
|
||||||
|
xa_for_each (&device->compat_devs, index, cdev)
|
||||||
|
remove_one_compat_dev(device, index);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int add_compat_devs(struct ib_device *device)
|
||||||
|
{
|
||||||
|
struct rdma_dev_net *rnet;
|
||||||
|
unsigned long index;
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
down_read(&rdma_nets_rwsem);
|
||||||
|
xa_for_each (&rdma_nets, index, rnet) {
|
||||||
|
ret = add_one_compat_dev(device, rnet);
|
||||||
|
if (ret)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
up_read(&rdma_nets_rwsem);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void rdma_dev_exit_net(struct net *net)
|
||||||
|
{
|
||||||
|
struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
|
||||||
|
struct ib_device *dev;
|
||||||
|
unsigned long index;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
down_write(&rdma_nets_rwsem);
|
||||||
|
/*
|
||||||
|
* Prevent the ID from being re-used and hide the id from xa_for_each.
|
||||||
|
*/
|
||||||
|
ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
|
||||||
|
WARN_ON(ret);
|
||||||
|
up_write(&rdma_nets_rwsem);
|
||||||
|
|
||||||
|
down_read(&devices_rwsem);
|
||||||
|
xa_for_each (&devices, index, dev) {
|
||||||
|
get_device(&dev->dev);
|
||||||
|
/*
|
||||||
|
* Release the devices_rwsem so that pontentially blocking
|
||||||
|
* device_del, doesn't hold the devices_rwsem for too long.
|
||||||
|
*/
|
||||||
|
up_read(&devices_rwsem);
|
||||||
|
|
||||||
|
remove_one_compat_dev(dev, rnet->id);
|
||||||
|
|
||||||
|
put_device(&dev->dev);
|
||||||
|
down_read(&devices_rwsem);
|
||||||
|
}
|
||||||
|
up_read(&devices_rwsem);
|
||||||
|
|
||||||
|
xa_erase(&rdma_nets, rnet->id);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __net_init int rdma_dev_init_net(struct net *net)
|
||||||
|
{
|
||||||
|
struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
|
||||||
|
unsigned long index;
|
||||||
|
struct ib_device *dev;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
/* No need to create any compat devices in default init_net. */
|
||||||
|
if (net_eq(net, &init_net))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
write_pnet(&rnet->net, net);
|
||||||
|
|
||||||
|
ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
|
||||||
|
if (ret)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
down_read(&devices_rwsem);
|
||||||
|
xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
|
||||||
|
ret = add_one_compat_dev(dev, rnet);
|
||||||
|
if (ret)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
up_read(&devices_rwsem);
|
||||||
|
|
||||||
|
if (ret)
|
||||||
|
rdma_dev_exit_net(net);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Assign the unique string device name and the unique device index. This is
|
* Assign the unique string device name and the unique device index. This is
|
||||||
* undone by ib_dealloc_device.
|
* undone by ib_dealloc_device.
|
||||||
|
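Worth calling out in add_one_compat_dev() above: the compat_devs slot is
reserved before device_add() so that the final publish cannot hit an
allocation failure after the device has already become visible, and the
error unwind mirrors setup order (device_del() before put_device(), then
xa_release()). The same idiom, distilled under hypothetical names (struct
obj and obj_create() are placeholders):

static int publish_object(struct xarray *xa, u32 id)
{
        struct obj *o;          /* hypothetical object type */
        int ret;

        ret = xa_reserve(xa, id, GFP_KERNEL);   /* allocate the slot early */
        if (ret)
                return ret;

        o = obj_create();                       /* slow, may fail */
        if (!o) {
                xa_release(xa, id);             /* undo the reservation */
                return -ENOMEM;
        }

        /* Storing into a reserved slot needs no fresh allocation, so the
         * publish step cannot fail with -ENOMEM once 'o' is live. */
        return xa_err(xa_store(xa, id, o, GFP_KERNEL));
}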
@@ -788,6 +1017,13 @@ static void disable_device(struct ib_device *device)
         ib_device_put(device);
         wait_for_completion(&device->unreg_completion);
 
+        /*
+         * compat devices must be removed after device refcount drops to zero.
+         * Otherwise init_net() may add more compatdevs after removing compat
+         * devices and before device is disabled.
+         */
+        remove_compat_devs(device);
+
         /* Expedite removing unregistered pointers from the hash table */
         free_netdevs(device);
 }
@@ -830,7 +1066,8 @@ static int enable_device_and_get(struct ib_device *device)
                         break;
         }
         up_read(&clients_rwsem);
-
+        if (!ret)
+                ret = add_compat_devs(device);
 out:
         up_read(&devices_rwsem);
         return ret;
@@ -1061,6 +1298,13 @@ void ib_unregister_device_queued(struct ib_device *ib_dev)
 }
 EXPORT_SYMBOL(ib_unregister_device_queued);
 
+static struct pernet_operations rdma_dev_net_ops = {
+        .init = rdma_dev_init_net,
+        .exit = rdma_dev_exit_net,
+        .id = &rdma_dev_net_id,
+        .size = sizeof(struct rdma_dev_net),
+};
+
 static int assign_client_id(struct ib_client *client)
 {
         int ret;
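The .id/.size pair tells the pernet core to allocate a zeroed struct
rdma_dev_net for every network namespace, retrievable via net_generic();
.init and .exit then run as namespaces come and go, and
register_pernet_device() also invokes .init for namespaces that already
exist, which is how pre-existing namespaces pick up compat trees when
ib_core loads. A minimal illustrative user of the same idiom (not from the
patch):

#include <net/net_namespace.h>
#include <net/netns/generic.h>

struct demo_net {                       /* per-namespace private data */
        u32 counter;
};

static unsigned int demo_net_id;

static __net_init int demo_init_net(struct net *net)
{
        struct demo_net *dn = net_generic(net, demo_net_id);

        dn->counter = 0;        /* storage arrives zeroed; set for clarity */
        return 0;
}

static struct pernet_operations demo_net_ops = {
        .init = demo_init_net,
        .id   = &demo_net_id,
        .size = sizeof(struct demo_net),
};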
@@ -1926,12 +2170,20 @@ static int __init ib_core_init(void)
                 goto err_sa;
         }
 
+        ret = register_pernet_device(&rdma_dev_net_ops);
+        if (ret) {
+                pr_warn("Couldn't init compat dev. ret %d\n", ret);
+                goto err_compat;
+        }
+
         nldev_init();
         rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
         roce_gid_mgmt_init();
 
         return 0;
 
+err_compat:
+        unregister_lsm_notifier(&ibdev_lsm_nb);
 err_sa:
         ib_sa_cleanup();
 err_mad:
@@ -1956,6 +2208,7 @@ static void __exit ib_core_cleanup(void)
         roce_gid_mgmt_cleanup();
         nldev_exit();
         rdma_nl_unregister(RDMA_NL_LS);
+        unregister_pernet_device(&rdma_dev_net_ops);
         unregister_lsm_notifier(&ibdev_lsm_nb);
         ib_sa_cleanup();
         ib_mad_cleanup();
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2559,6 +2559,7 @@ struct ib_core_device {
          * union of ib_core_device and device exists in ib_device.
          */
         struct device dev;
+        possible_net_t rdma_net;
         struct kobject *ports_kobj;
         struct list_head port_list;
         struct ib_device *owner; /* reach back to owner ib_device */
@@ -2636,6 +2637,11 @@ struct ib_device {
         struct work_struct unregistration_work;
 
         const struct rdma_link_ops *link_ops;
+
+        /* Protects compat_devs xarray modifications */
+        struct mutex compat_devs_mutex;
+        /* Maintains compat devices for each net namespace */
+        struct xarray compat_devs;
 };
 
 struct ib_client {
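Appending rdma_net after dev here is safe only because of the layout
contract: ib_device overlays struct device and struct ib_core_device in a
union, and dev must stay the first member of ib_core_device so that
container_of() conversions (as in net_namespace() above) work both for the
real device and for standalone compat copies. The guard in
rdma_init_coredev() reads, roughly:

        BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
                     offsetof(struct ib_device, dev));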