From 6ee51a4e866bbb0921180b457ed16cd172859346 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 12 Mar 2014 12:00:37 +0200 Subject: [PATCH 1/7] mlx4: Adjust QP1 multiplexing for RoCE/SRIOV This requires the following modifications: 1. Fix build_mlx4_header to properly fill in the ETH fields 2. Adjust mux and demux QP1 flow to support RoCE. This commit still assumes only one GID per slave for RoCE. The commit enabling multiple GIDs is a subsequent commit, and is done separately because of its complexity. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/cm.c | 8 +++- drivers/infiniband/hw/mlx4/mad.c | 50 ++++++++++++++++++++--- drivers/infiniband/hw/mlx4/qp.c | 29 +++++++------ drivers/net/ethernet/mellanox/mlx4/mlx4.h | 5 +++ drivers/net/ethernet/mellanox/mlx4/port.c | 34 +++++++++++++++ include/linux/mlx4/device.h | 4 ++ 6 files changed, 110 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c index d1f5f1dd77b0..b8d911543783 100644 --- a/drivers/infiniband/hw/mlx4/cm.c +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -315,7 +315,7 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id } int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, - struct ib_mad *mad) + struct ib_mad *mad) { u32 pv_cm_id; struct id_map_entry *id; @@ -323,6 +323,9 @@ int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID) { union ib_gid gid; + if (!slave) + return 0; + gid = gid_from_req_msg(ibdev, mad); *slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id); if (*slave < 0) { @@ -341,7 +344,8 @@ int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, return -ENOENT; } - *slave = id->slave_id; + if (slave) + *slave = id->slave_id; set_remote_comm_id(mad, id->sl_cm_id); if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index f2a3f48107e7..c2e9879a5a34 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -467,6 +467,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, int ret = 0; u16 tun_pkey_ix; u16 cached_pkey; + u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; if (dest_qpt > IB_QPT_GSI) return -EINVAL; @@ -509,6 +510,12 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, * The driver will set the force loopback bit in post_send */ memset(&attr, 0, sizeof attr); attr.port_num = port; + if (is_eth) { + ret = mlx4_get_roce_gid_from_slave(dev->dev, port, slave, attr.grh.dgid.raw); + if (ret) + return ret; + attr.ah_flags = IB_AH_GRH; + } ah = ib_create_ah(tun_ctx->pd, &attr); if (IS_ERR(ah)) return -ENOMEM; @@ -580,6 +587,41 @@ static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, int err; int slave; u8 *slave_id; + int is_eth = 0; + + if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) + is_eth = 0; + else + is_eth = 1; + + if (is_eth) { + if (!(wc->wc_flags & IB_WC_GRH)) { + mlx4_ib_warn(ibdev, "RoCE grh not present.\n"); + return -EINVAL; + } + if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) { + mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n"); + return -EINVAL; + } + if (mlx4_get_slave_from_roce_gid(dev->dev, port, grh->dgid.raw, &slave)) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + if (slave >= dev->dev->caps.sqp_demux) { + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", + slave, dev->dev->caps.sqp_demux); + return -ENOENT; + } + + if (mlx4_ib_demux_cm_handler(ibdev, port, NULL, mad)) + return 0; + + err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad); + if (err) + pr_debug("failed sending to slave %d via tunnel qp (%d)\n", + slave, err); + return 0; + } /* Initially assume that this mad is for us */ slave = mlx4_master_func_num(dev->dev); @@ -1260,12 +1302,8 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av)); ah.ibah.device = ctx->ib_dev; mlx4_ib_query_ah(&ah.ibah, &ah_attr); - if ((ah_attr.ah_flags & IB_AH_GRH) && - (ah_attr.grh.sgid_index != slave)) { - mlx4_ib_warn(ctx->ib_dev, "slave:%d accessed invalid sgid_index:%d\n", - slave, ah_attr.grh.sgid_index); - return; - } + if (ah_attr.ah_flags & IB_AH_GRH) + ah_attr.grh.sgid_index = slave; mlx4_ib_send_to_wire(dev, slave, ctx->port, is_proxy_qp0(dev, wc->src_qp, slave) ? diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index d8f4d1fe8494..c6ef2e7e3045 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1842,9 +1842,9 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, { struct ib_device *ib_dev = sqp->qp.ibqp.device; struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_ctrl_seg *ctrl = wqe; struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); - struct net_device *ndev; union ib_gid sgid; u16 pkey; int send_size; @@ -1868,12 +1868,11 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, /* When multi-function is enabled, the ib_core gid * indexes don't necessarily match the hw ones, so * we must use our own cache */ - sgid.global.subnet_prefix = - to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. - subnet_prefix; - sgid.global.interface_id = - to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. - guid_cache[ah->av.ib.gid_index]; + err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid.raw[0]); + if (err) + return err; } else { err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, @@ -1902,6 +1901,9 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, sqp->ud_header.grh.flow_label = ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; + if (is_eth) + memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16); + else { if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { /* When multi-function is enabled, the ib_core gid * indexes don't necessarily match the hw ones, so @@ -1917,6 +1919,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, be32_to_cpu(ah->av.ib.port_pd) >> 24, ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid); + } memcpy(sqp->ud_header.grh.destination_gid.raw, ah->av.ib.dgid, 16); } @@ -1948,17 +1951,19 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, } if (is_eth) { - u8 *smac; + u8 smac[6]; + struct in6_addr in6; + u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; mlx->sched_prio = cpu_to_be16(pcp); memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); /* FIXME: cache smac value? */ - ndev = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]; - if (!ndev) - return -ENODEV; - smac = ndev->dev_addr; + memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); + memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); + memcpy(&in6, sgid.raw, sizeof(in6)); + rdma_get_ll_mac(&in6, smac); memcpy(sqp->ud_header.eth.smac_h, smac, 6); if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 7aec6c833973..da829f4ef938 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -788,6 +788,10 @@ enum { MLX4_USE_RR = 1, }; +struct mlx4_roce_gid_entry { + u8 raw[16]; +}; + struct mlx4_priv { struct mlx4_dev dev; @@ -834,6 +838,7 @@ struct mlx4_priv { int fs_hash_mode; u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; __be64 slave_node_guids[MLX4_MFUNC_MAX]; + struct mlx4_roce_gid_entry roce_gids[MLX4_MAX_PORTS][MLX4_ROCE_MAX_GIDS]; atomic_t opreq_count; struct work_struct opreq_task; diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c index a58bcbf1b806..9c063d6122b3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/port.c +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -927,3 +927,37 @@ void mlx4_set_stats_bitmap(struct mlx4_dev *dev, u64 *stats_bitmap) *stats_bitmap |= MLX4_STATS_ERROR_COUNTERS_MASK; } EXPORT_SYMBOL(mlx4_set_stats_bitmap); + +int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, int *slave_id) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i, found_ix = -1; + + if (!mlx4_is_mfunc(dev)) + return -EINVAL; + + for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { + if (!memcmp(priv->roce_gids[port - 1][i].raw, gid, 16)) { + found_ix = i; + break; + } + } + + if (found_ix >= 0) + *slave_id = found_ix; + + return (found_ix >= 0) ? 0 : -EINVAL; +} +EXPORT_SYMBOL(mlx4_get_slave_from_roce_gid); + +int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, u8 *gid) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + if (!mlx4_is_master(dev)) + return -EINVAL; + + memcpy(gid, priv->roce_gids[port - 1][slave_id].raw, 16); + return 0; +} +EXPORT_SYMBOL(mlx4_get_roce_gid_from_slave); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 5edd2c68274d..fbe6cda00ba7 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -48,6 +48,8 @@ #define MSIX_LEGACY_SZ 4 #define MIN_MSIX_P_PORT 5 +#define MLX4_ROCE_MAX_GIDS 128 + enum { MLX4_FLAG_MSI_X = 1 << 0, MLX4_FLAG_OLD_PORT_CMDS = 1 << 1, @@ -1182,6 +1184,8 @@ int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, int void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid); __be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave); +int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, int *slave_id); +int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, u8 *gid); int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn, u32 max_range_qpn); From 9cd593529c8652785bc9962acc79b6b176741f99 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 12 Mar 2014 12:00:38 +0200 Subject: [PATCH 2/7] mlx4_core: For RoCE, allow slaves to set the GID entry at that slave's index For IB transport, the host determines the slave GIDs. For ETH (RoCE), however, the slave's GID is determined by the IP address that the slave itself assigns to the ETH device used by RoCE. In this case, the slave must be able to write its GIDs to the HCA gid table (at the GID indices that slave "owns"). This commit adds processing for the SET_PORT_GID_TABLE opcode modifier for the SET_PORT command wrapper (so that slaves may modify their GIDS for RoCE). Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/port.c | 33 ++++++++++++++++++++--- include/linux/mlx4/device.h | 7 +++-- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c index 9c063d6122b3..591740b06043 100644 --- a/drivers/net/ethernet/mellanox/mlx4/port.c +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -505,6 +505,7 @@ int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) mlx4_free_cmd_mailbox(dev, outmailbox); return err; } +static struct mlx4_roce_gid_entry zgid_entry; static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, u8 op_mod, struct mlx4_cmd_mailbox *inbox) @@ -515,6 +516,7 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, struct mlx4_slave_state *slave_st = &master->slave_state[slave]; struct mlx4_set_port_rqp_calc_context *qpn_context; struct mlx4_set_port_general_context *gen_context; + struct mlx4_roce_gid_entry *gid_entry; int reset_qkey_viols; int port; int is_eth; @@ -535,7 +537,8 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, /* Slaves cannot perform SET_PORT operations except changing MTU */ if (is_eth) { if (slave != dev->caps.function && - in_modifier != MLX4_SET_PORT_GENERAL) { + in_modifier != MLX4_SET_PORT_GENERAL && + in_modifier != MLX4_SET_PORT_GID_TABLE) { mlx4_warn(dev, "denying SET_PORT for slave:%d\n", slave); return -EINVAL; @@ -581,6 +584,28 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, gen_context->mtu = cpu_to_be16(master->max_mtu[port]); break; + case MLX4_SET_PORT_GID_TABLE: + gid_entry = (struct mlx4_roce_gid_entry *)(inbox->buf); + /* check that do not have duplicates */ + if (memcmp(gid_entry->raw, zgid_entry.raw, 16)) { + for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { + if (slave != i && + !memcmp(gid_entry->raw, priv->roce_gids[port - 1][i].raw, 16)) { + mlx4_warn(dev, "requested gid entry for slave:%d " + "is a duplicate of slave %d\n", + slave, i); + return -EEXIST; + } + } + } + /* insert slave GID at proper index */ + memcpy(priv->roce_gids[port - 1][slave].raw, gid_entry->raw, 16); + /* rewrite roce port gids table to FW */ + for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { + memcpy(gid_entry->raw, priv->roce_gids[port - 1][i].raw, 16); + gid_entry++; + } + break; } return mlx4_cmd(dev, inbox->dma, in_mod, op_mod, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, @@ -928,7 +953,8 @@ void mlx4_set_stats_bitmap(struct mlx4_dev *dev, u64 *stats_bitmap) } EXPORT_SYMBOL(mlx4_set_stats_bitmap); -int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, int *slave_id) +int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, + int *slave_id) { struct mlx4_priv *priv = mlx4_priv(dev); int i, found_ix = -1; @@ -950,7 +976,8 @@ int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, int *s } EXPORT_SYMBOL(mlx4_get_slave_from_roce_gid); -int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, u8 *gid) +int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, + u8 *gid) { struct mlx4_priv *priv = mlx4_priv(dev); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index fbe6cda00ba7..e4853650679d 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1184,8 +1184,11 @@ int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, int void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid); __be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave); -int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, int *slave_id); -int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, u8 *gid); + +int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, + int *slave_id); +int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id, + u8 *gid); int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn, u32 max_range_qpn); From b6ffaeffaea4d92f05f5ba1ef54df407cb7c8517 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 12 Mar 2014 12:00:39 +0200 Subject: [PATCH 3/7] mlx4: In RoCE allow guests to have multiple GIDS The GIDs are statically distributed, as follows: PF: gets 16 GIDs VFs: Remaining GIDS are divided evenly between VFs activated by the driver. If the division is not even, lower-numbered VFs get an extra GID. For an IB interface, the number of gids per guest remains as before: one gid per guest. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/mad.c | 34 ++++- drivers/net/ethernet/mellanox/mlx4/fw.c | 5 +- drivers/net/ethernet/mellanox/mlx4/main.c | 6 +- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 3 + drivers/net/ethernet/mellanox/mlx4/port.c | 117 +++++++++++++++--- .../ethernet/mellanox/mlx4/resource_tracker.c | 62 ++++++++-- include/linux/mlx4/device.h | 1 + 7 files changed, 191 insertions(+), 37 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index c2e9879a5a34..c5bca0f0da4a 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -511,9 +511,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, memset(&attr, 0, sizeof attr); attr.port_num = port; if (is_eth) { - ret = mlx4_get_roce_gid_from_slave(dev->dev, port, slave, attr.grh.dgid.raw); - if (ret) - return ret; + memcpy(&attr.grh.dgid.raw[0], &grh->dgid.raw[0], 16); attr.ah_flags = IB_AH_GRH; } ah = ib_create_ah(tun_ctx->pd, &attr); @@ -1216,6 +1214,34 @@ out: return ret; } +static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port) +{ + int gids; + int vfs; + + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + return slave; + + gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; + vfs = dev->dev->num_vfs; + + if (slave == 0) + return 0; + if (slave <= gids % vfs) + return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave - 1); + + return MLX4_ROCE_PF_GIDS + (gids % vfs) + ((gids / vfs) * (slave - 1)); +} + +static void fill_in_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port, + struct ib_ah_attr *ah_attr) +{ + if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) + ah_attr->grh.sgid_index = slave; + else + ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port); +} + static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc) { struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); @@ -1303,7 +1329,7 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc ah.ibah.device = ctx->ib_dev; mlx4_ib_query_ah(&ah.ibah, &ah_attr); if (ah_attr.ah_flags & IB_AH_GRH) - ah_attr.grh.sgid_index = slave; + fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr); mlx4_ib_send_to_wire(dev, slave, ctx->port, is_proxy_qp0(dev, wc->src_qp, slave) ? diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 9cdf452140da..6e1ee2170a39 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -934,7 +934,10 @@ int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, MLX4_PUT(outbox->buf, port_type, QUERY_PORT_SUPPORTED_TYPE_OFFSET); - short_field = 1; /* slave max gids */ + if (dev->caps.port_type[vhcr->in_modifier] == MLX4_PORT_TYPE_ETH) + short_field = mlx4_get_slave_num_gids(dev, slave); + else + short_field = 1; /* slave max gids */ MLX4_PUT(outbox->buf, short_field, QUERY_PORT_CUR_MAX_GID_OFFSET); diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 979ea4364efb..4c441aa83016 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -1462,7 +1462,11 @@ static void mlx4_parav_master_pf_caps(struct mlx4_dev *dev) int i; for (i = 1; i <= dev->caps.num_ports; i++) { - dev->caps.gid_table_len[i] = 1; + if (dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH) + dev->caps.gid_table_len[i] = + mlx4_get_slave_num_gids(dev, 0); + else + dev->caps.gid_table_len[i] = 1; dev->caps.pkey_table_len[i] = dev->phys_caps.pkey_phys_table_len[i] - 1; } diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index da829f4ef938..6ba38c98c492 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -1287,4 +1287,7 @@ void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work); void mlx4_init_quotas(struct mlx4_dev *dev); +int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave); +int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave); + #endif /* MLX4_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c index 591740b06043..ece328166e94 100644 --- a/drivers/net/ethernet/mellanox/mlx4/port.c +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -507,6 +507,31 @@ int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) } static struct mlx4_roce_gid_entry zgid_entry; +int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave) +{ + if (slave == 0) + return MLX4_ROCE_PF_GIDS; + if (slave <= ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) % dev->num_vfs)) + return ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / dev->num_vfs) + 1; + return (MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / dev->num_vfs; +} + +int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave) +{ + int gids; + int vfs; + + gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; + vfs = dev->num_vfs; + + if (slave == 0) + return 0; + if (slave <= gids % vfs) + return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave - 1); + + return MLX4_ROCE_PF_GIDS + (gids % vfs) + ((gids / vfs) * (slave - 1)); +} + static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, u8 op_mod, struct mlx4_cmd_mailbox *inbox) { @@ -516,15 +541,18 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, struct mlx4_slave_state *slave_st = &master->slave_state[slave]; struct mlx4_set_port_rqp_calc_context *qpn_context; struct mlx4_set_port_general_context *gen_context; - struct mlx4_roce_gid_entry *gid_entry; + struct mlx4_roce_gid_entry *gid_entry_tbl, *gid_entry_mbox, *gid_entry_mb1; int reset_qkey_viols; int port; int is_eth; + int num_gids; + int base; u32 in_modifier; u32 promisc; u16 mtu, prev_mtu; int err; - int i; + int i, j; + int offset; __be32 agg_cap_mask; __be32 slave_cap_mask; __be32 new_cap_mask; @@ -585,26 +613,65 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, gen_context->mtu = cpu_to_be16(master->max_mtu[port]); break; case MLX4_SET_PORT_GID_TABLE: - gid_entry = (struct mlx4_roce_gid_entry *)(inbox->buf); - /* check that do not have duplicates */ - if (memcmp(gid_entry->raw, zgid_entry.raw, 16)) { - for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { - if (slave != i && - !memcmp(gid_entry->raw, priv->roce_gids[port - 1][i].raw, 16)) { - mlx4_warn(dev, "requested gid entry for slave:%d " - "is a duplicate of slave %d\n", - slave, i); - return -EEXIST; + /* change to MULTIPLE entries: number of guest's gids + * need a FOR-loop here over number of gids the guest has. + * 1. Check no duplicates in gids passed by slave + */ + num_gids = mlx4_get_slave_num_gids(dev, slave); + base = mlx4_get_base_gid_ix(dev, slave); + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); + for (i = 0; i < num_gids; gid_entry_mbox++, i++) { + if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw, + sizeof(zgid_entry))) + continue; + gid_entry_mb1 = gid_entry_mbox + 1; + for (j = i + 1; j < num_gids; gid_entry_mb1++, j++) { + if (!memcmp(gid_entry_mb1->raw, + zgid_entry.raw, sizeof(zgid_entry))) + continue; + if (!memcmp(gid_entry_mb1->raw, gid_entry_mbox->raw, + sizeof(gid_entry_mbox->raw))) { + /* found duplicate */ + return -EINVAL; } } } - /* insert slave GID at proper index */ - memcpy(priv->roce_gids[port - 1][slave].raw, gid_entry->raw, 16); - /* rewrite roce port gids table to FW */ + + /* 2. Check that do not have duplicates in OTHER + * entries in the port GID table + */ for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { - memcpy(gid_entry->raw, priv->roce_gids[port - 1][i].raw, 16); - gid_entry++; + if (i >= base && i < base + num_gids) + continue; /* don't compare to slave's current gids */ + gid_entry_tbl = &priv->roce_gids[port - 1][i]; + if (!memcmp(gid_entry_tbl->raw, zgid_entry.raw, sizeof(zgid_entry))) + continue; + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); + for (j = 0; j < num_gids; gid_entry_mbox++, j++) { + if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw, + sizeof(zgid_entry))) + continue; + if (!memcmp(gid_entry_mbox->raw, gid_entry_tbl->raw, + sizeof(gid_entry_tbl->raw))) { + /* found duplicate */ + mlx4_warn(dev, "requested gid entry for slave:%d " + "is a duplicate of gid at index %d\n", + slave, i); + return -EINVAL; + } + } } + + /* insert slave GIDs with memcpy, starting at slave's base index */ + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); + for (i = 0, offset = base; i < num_gids; gid_entry_mbox++, offset++, i++) + memcpy(priv->roce_gids[port - 1][offset].raw, gid_entry_mbox->raw, 16); + + /* Now, copy roce port gids table to current mailbox for passing to FW */ + gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf); + for (i = 0; i < MLX4_ROCE_MAX_GIDS; gid_entry_mbox++, i++) + memcpy(gid_entry_mbox->raw, priv->roce_gids[port - 1][i].raw, 16); + break; } return mlx4_cmd(dev, inbox->dma, in_mod, op_mod, @@ -958,6 +1025,7 @@ int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, { struct mlx4_priv *priv = mlx4_priv(dev); int i, found_ix = -1; + int vf_gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; if (!mlx4_is_mfunc(dev)) return -EINVAL; @@ -969,8 +1037,19 @@ int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, } } - if (found_ix >= 0) - *slave_id = found_ix; + if (found_ix >= 0) { + if (found_ix < MLX4_ROCE_PF_GIDS) + *slave_id = 0; + else if (found_ix < MLX4_ROCE_PF_GIDS + (vf_gids % dev->num_vfs) * + (vf_gids / dev->num_vfs + 1)) + *slave_id = ((found_ix - MLX4_ROCE_PF_GIDS) / + (vf_gids / dev->num_vfs + 1)) + 1; + else + *slave_id = + ((found_ix - MLX4_ROCE_PF_GIDS - + ((vf_gids % dev->num_vfs) * ((vf_gids / dev->num_vfs + 1)))) / + (vf_gids / dev->num_vfs)) + vf_gids % dev->num_vfs + 1; + } return (found_ix >= 0) ? 0 : -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 57428a0cb9dd..1c3634eab5e1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -219,6 +219,11 @@ struct res_fs_rule { int qpn; }; +static int mlx4_is_eth(struct mlx4_dev *dev, int port) +{ + return dev->caps.port_mask[port] == MLX4_PORT_TYPE_IB ? 0 : 1; +} + static void *res_tracker_lookup(struct rb_root *root, u64 res_id) { struct rb_node *node = root->rb_node; @@ -600,15 +605,34 @@ static void update_gid(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *inbox, struct mlx4_qp_context *qp_ctx = inbox->buf + 8; enum mlx4_qp_optpar optpar = be32_to_cpu(*(__be32 *) inbox->buf); u32 ts = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff; + int port; - if (MLX4_QP_ST_UD == ts) - qp_ctx->pri_path.mgid_index = 0x80 | slave; + if (MLX4_QP_ST_UD == ts) { + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port)) + qp_ctx->pri_path.mgid_index = mlx4_get_base_gid_ix(dev, slave) | 0x80; + else + qp_ctx->pri_path.mgid_index = slave | 0x80; - if (MLX4_QP_ST_RC == ts || MLX4_QP_ST_UC == ts) { - if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) - qp_ctx->pri_path.mgid_index = slave & 0x7F; - if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) - qp_ctx->alt_path.mgid_index = slave & 0x7F; + } else if (MLX4_QP_ST_RC == ts || MLX4_QP_ST_XRC == ts || MLX4_QP_ST_UC == ts) { + if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) { + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port)) { + qp_ctx->pri_path.mgid_index += mlx4_get_base_gid_ix(dev, slave); + qp_ctx->pri_path.mgid_index &= 0x7f; + } else { + qp_ctx->pri_path.mgid_index = slave & 0x7F; + } + } + if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) { + port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port)) { + qp_ctx->alt_path.mgid_index += mlx4_get_base_gid_ix(dev, slave); + qp_ctx->alt_path.mgid_index &= 0x7f; + } else { + qp_ctx->alt_path.mgid_index = slave & 0x7F; + } + } } } @@ -2734,6 +2758,8 @@ static int verify_qp_parameters(struct mlx4_dev *dev, u32 qp_type; struct mlx4_qp_context *qp_ctx; enum mlx4_qp_optpar optpar; + int port; + int num_gids; qp_ctx = inbox->buf + 8; qp_type = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff; @@ -2741,6 +2767,7 @@ static int verify_qp_parameters(struct mlx4_dev *dev, switch (qp_type) { case MLX4_QP_ST_RC: + case MLX4_QP_ST_XRC: case MLX4_QP_ST_UC: switch (transition) { case QP_TRANS_INIT2RTR: @@ -2749,13 +2776,24 @@ static int verify_qp_parameters(struct mlx4_dev *dev, case QP_TRANS_SQD2SQD: case QP_TRANS_SQD2RTS: if (slave != mlx4_master_func_num(dev)) - /* slaves have only gid index 0 */ - if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) - if (qp_ctx->pri_path.mgid_index) + if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) { + port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1; + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) + num_gids = mlx4_get_slave_num_gids(dev, slave); + else + num_gids = 1; + if (qp_ctx->pri_path.mgid_index >= num_gids) return -EINVAL; - if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) - if (qp_ctx->alt_path.mgid_index) + } + if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) { + port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1; + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) + num_gids = mlx4_get_slave_num_gids(dev, slave); + else + num_gids = 1; + if (qp_ctx->alt_path.mgid_index >= num_gids) return -EINVAL; + } break; default: break; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index e4853650679d..86e02e5c2c77 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -49,6 +49,7 @@ #define MIN_MSIX_P_PORT 5 #define MLX4_ROCE_MAX_GIDS 128 +#define MLX4_ROCE_PF_GIDS 16 enum { MLX4_FLAG_MSI_X = 1 << 0, From 2f5bb473681b88819a9de28ac3a47e7737815a92 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 12 Mar 2014 12:00:40 +0200 Subject: [PATCH 4/7] mlx4: Add ref counting to port MAC table for RoCE The IB side of RoCE requires the MAC table index of the MAC address used by its QPs. To obtain the real MAC index, the IB side registers the MAC (increasing its ref count, and also returning the real MAC index) during the modify-qp sequence. This protects against the ETH side deleting or modifying that MAC table entry while the QP is active. Note that until the modify-qp command returns success, the MAC and VLAN information only has "candidate" status. If the modify-qp succeeds, the "candidate" info is promoted to the operational MAC/VLAN info for the qp. If the modify fails, the candidate MAC/VLAN is unregistered, and the old qp info is preserved. The patch is a bit complex, because there are multiple qp transitions where the primary-path information may be modified: INIT-to-RTR, and SQD-to-SQD. Similarly for the alternate path information. Therefore the code must handle cases where path information has already been entered into the QP context by previous qp transitions. For the MAC address, the success logic is as follows: 1. If there was no previous MAC, simply move the candidate MAC information to the operational information, and reset the candidate MAC info. 2. If there was a previous MAC, unregister it. Then move the MAC information from candidate to operational, and reset the candidate info (as in 1. above). The MAC address failure logic is the same for all cases: - Unregister the candidate MAC, and reset the candidate MAC info. For Vlan registration, the logic is similar. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/mlx4_ib.h | 19 +- drivers/infiniband/hw/mlx4/qp.c | 285 +++++++++++++++--- .../ethernet/mellanox/mlx4/resource_tracker.c | 75 ++++- 3 files changed, 329 insertions(+), 50 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index a230683af940..febc8f9bc59a 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -241,6 +241,22 @@ struct mlx4_ib_proxy_sqp_hdr { struct mlx4_rcv_tunnel_hdr tun; } __packed; +struct mlx4_roce_smac_vlan_info { + u64 smac; + int smac_index; + int smac_port; + u64 candidate_smac; + int candidate_smac_index; + int candidate_smac_port; + u16 vid; + int vlan_index; + int vlan_port; + u16 candidate_vid; + int candidate_vlan_index; + int candidate_vlan_port; + int update_vid; +}; + struct mlx4_ib_qp { struct ib_qp ibqp; struct mlx4_qp mqp; @@ -273,8 +289,9 @@ struct mlx4_ib_qp { struct list_head gid_list; struct list_head steering_rules; struct mlx4_ib_buf *sqp_proxy_rcv; + struct mlx4_roce_smac_vlan_info pri; + struct mlx4_roce_smac_vlan_info alt; u64 reg_id; - }; struct mlx4_ib_srq { diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index c6ef2e7e3045..11332f074023 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -662,10 +662,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (!sqp) return -ENOMEM; qp = &sqp->qp; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; } else { qp = kzalloc(sizeof (struct mlx4_ib_qp), GFP_KERNEL); if (!qp) return -ENOMEM; + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; } } else qp = *caller_qp; @@ -940,11 +944,32 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, { struct mlx4_ib_cq *send_cq, *recv_cq; - if (qp->state != IB_QPS_RESET) + if (qp->state != IB_QPS_RESET) { if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) pr_warn("modify QP %06x to RESET failed.\n", qp->mqp.qpn); + if (qp->pri.smac) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + } get_cqs(qp, &send_cq, &recv_cq); @@ -1057,6 +1082,8 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, qp = kzalloc(sizeof *qp, GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; /* fall through */ case IB_QPT_UD: { @@ -1188,12 +1215,13 @@ static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port) static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, u64 smac, u16 vlan_tag, struct mlx4_qp_path *path, - u8 port) + struct mlx4_roce_smac_vlan_info *smac_info, u8 port) { int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_ETHERNET; int vidx; int smac_index; + int err; path->grh_mylmc = ah->src_path_bits & 0x7f; @@ -1223,61 +1251,103 @@ static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, } if (is_eth) { - path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | - ((port - 1) << 6) | ((ah->sl & 7) << 3); - if (!(ah->ah_flags & IB_AH_GRH)) return -1; - memcpy(path->dmac, ah->dmac, ETH_ALEN); - path->ackto = MLX4_IB_LINK_TYPE_ETH; - /* find the index into MAC table for IBoE */ - if (!is_zero_ether_addr((const u8 *)&smac)) { - if (mlx4_find_cached_mac(dev->dev, port, smac, - &smac_index)) - return -ENOENT; - } else { - smac_index = 0; - } - - path->grh_mylmc &= 0x80 | smac_index; + path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | + ((port - 1) << 6) | ((ah->sl & 7) << 3); path->feup |= MLX4_FEUP_FORCE_ETH_UP; if (vlan_tag < 0x1000) { - if (mlx4_find_cached_vlan(dev->dev, port, vlan_tag, &vidx)) - return -ENOENT; - - path->vlan_index = vidx; - path->fl = 1 << 6; + if (smac_info->vid < 0x1000) { + /* both valid vlan ids */ + if (smac_info->vid != vlan_tag) { + /* different VIDs. unreg old and reg new */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } else { + path->vlan_index = smac_info->vlan_index; + } + } else { + /* no current vlan tag in qp */ + err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx); + if (err) + return err; + smac_info->candidate_vid = vlan_tag; + smac_info->candidate_vlan_index = vidx; + smac_info->candidate_vlan_port = port; + smac_info->update_vid = 1; + path->vlan_index = vidx; + } path->feup |= MLX4_FVL_FORCE_ETH_VLAN; + path->fl = 1 << 6; + } else { + /* have current vlan tag. unregister it at modify-qp success */ + if (smac_info->vid < 0x1000) { + smac_info->candidate_vid = 0xFFFF; + smac_info->update_vid = 1; + } } - } else + + /* get smac_index for RoCE use. + * If no smac was yet assigned, register one. + * If one was already assigned, but the new mac differs, + * unregister the old one and register the new one. + */ + if (!smac_info->smac || smac_info->smac != smac) { + /* register candidate now, unreg if needed, after success */ + smac_index = mlx4_register_mac(dev->dev, port, smac); + if (smac_index >= 0) { + smac_info->candidate_smac_index = smac_index; + smac_info->candidate_smac = smac; + smac_info->candidate_smac_port = port; + } else { + return -EINVAL; + } + } else { + smac_index = smac_info->smac_index; + } + + memcpy(path->dmac, ah->dmac, 6); + path->ackto = MLX4_IB_LINK_TYPE_ETH; + /* put MAC table smac index for IBoE */ + path->grh_mylmc = (u8) (smac_index) | 0x80; + } else { path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((port - 1) << 6) | ((ah->sl & 0xf) << 2); + } return 0; } static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp, enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, struct mlx4_qp_path *path, u8 port) { return _mlx4_set_path(dev, &qp->ah_attr, mlx4_mac_to_u64((u8 *)qp->smac), (qp_attr_mask & IB_QP_VID) ? qp->vlan_id : 0xffff, - path, port); + path, &mqp->pri, port); } static int mlx4_set_alt_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp, enum ib_qp_attr_mask qp_attr_mask, + struct mlx4_ib_qp *mqp, struct mlx4_qp_path *path, u8 port) { return _mlx4_set_path(dev, &qp->alt_ah_attr, mlx4_mac_to_u64((u8 *)qp->alt_smac), (qp_attr_mask & IB_QP_ALT_VID) ? qp->alt_vlan_id : 0xffff, - path, port); + path, &mqp->alt, port); } static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) @@ -1292,6 +1362,37 @@ static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) } } +static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, u8 *smac, + struct mlx4_qp_context *context) +{ + struct net_device *ndev; + u64 u64_mac; + int smac_index; + + + ndev = dev->iboe.netdevs[qp->port - 1]; + if (ndev) { + smac = ndev->dev_addr; + u64_mac = mlx4_mac_to_u64(smac); + } else { + u64_mac = dev->dev->caps.def_mac[qp->port]; + } + + context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); + if (!qp->pri.smac) { + smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac); + if (smac_index >= 0) { + qp->pri.candidate_smac_index = smac_index; + qp->pri.candidate_smac = u64_mac; + qp->pri.candidate_smac_port = qp->port; + context->pri_path.grh_mylmc = 0x80 | (u8) smac_index; + } else { + return -ENOENT; + } + } + return 0; +} + static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) @@ -1403,7 +1504,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } if (attr_mask & IB_QP_AV) { - if (mlx4_set_path(dev, attr, attr_mask, &context->pri_path, + if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path, attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) goto out; @@ -1426,7 +1527,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, dev->dev->caps.pkey_table_len[attr->alt_port_num]) goto out; - if (mlx4_set_alt_path(dev, attr, attr_mask, &context->alt_path, + if (mlx4_set_alt_path(dev, attr, attr_mask, qp, + &context->alt_path, attr->alt_port_num)) goto out; @@ -1532,6 +1634,20 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, context->pri_path.fl = 0x80; context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE; } + if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET) { + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) + context->pri_path.feup = 1 << 7; /* don't fsm */ + /* handle smac_index */ + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) { + err = handle_eth_ud_smac_index(dev, qp, (u8 *)attr->smac, context); + if (err) + return -EINVAL; + } + } } if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) @@ -1619,28 +1735,113 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ - if (new_state == IB_QPS_RESET && !ibqp->uobject) { - mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, - ibqp->srq ? to_msrq(ibqp->srq): NULL); - if (send_cq != recv_cq) - mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + if (new_state == IB_QPS_RESET) { + if (!ibqp->uobject) { + mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); - qp->rq.head = 0; - qp->rq.tail = 0; - qp->sq.head = 0; - qp->sq.tail = 0; - qp->sq_next_wqe = 0; - if (qp->rq.wqe_cnt) - *qp->db.db = 0; + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq_next_wqe = 0; + if (qp->rq.wqe_cnt) + *qp->db.db = 0; - if (qp->flags & MLX4_IB_QP_NETIF) - mlx4_ib_steer_qp_reg(dev, qp, 0); + if (qp->flags & MLX4_IB_QP_NETIF) + mlx4_ib_steer_qp_reg(dev, qp, 0); + } + if (qp->pri.smac) { + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = 0; + } + if (qp->alt.smac) { + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = 0; + } + if (qp->pri.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid); + qp->pri.vid = 0xFFFF; + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.vid < 0x1000) { + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid); + qp->alt.vid = 0xFFFF; + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } } - out: if (err && steer_qp) mlx4_ib_steer_qp_reg(dev, qp, 0); kfree(context); + if (qp->pri.candidate_smac) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac); + } else { + if (qp->pri.smac) + mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); + qp->pri.smac = qp->pri.candidate_smac; + qp->pri.smac_index = qp->pri.candidate_smac_index; + qp->pri.smac_port = qp->pri.candidate_smac_port; + } + qp->pri.candidate_smac = 0; + qp->pri.candidate_smac_index = 0; + qp->pri.candidate_smac_port = 0; + } + if (qp->alt.candidate_smac) { + if (err) { + mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac); + } else { + if (qp->alt.smac) + mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); + qp->alt.smac = qp->alt.candidate_smac; + qp->alt.smac_index = qp->alt.candidate_smac_index; + qp->alt.smac_port = qp->alt.candidate_smac_port; + } + qp->alt.candidate_smac = 0; + qp->alt.candidate_smac_index = 0; + qp->alt.candidate_smac_port = 0; + } + + if (qp->pri.update_vid) { + if (err) { + if (qp->pri.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port, + qp->pri.candidate_vid); + } else { + if (qp->pri.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, + qp->pri.vid); + qp->pri.vid = qp->pri.candidate_vid; + qp->pri.vlan_port = qp->pri.candidate_vlan_port; + qp->pri.vlan_index = qp->pri.candidate_vlan_index; + } + qp->pri.candidate_vid = 0xFFFF; + qp->pri.update_vid = 0; + } + + if (qp->alt.update_vid) { + if (err) { + if (qp->alt.candidate_vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port, + qp->alt.candidate_vid); + } else { + if (qp->alt.vid < 0x1000) + mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, + qp->alt.vid); + qp->alt.vid = qp->alt.candidate_vid; + qp->alt.vlan_port = qp->alt.candidate_vlan_port; + qp->alt.vlan_index = qp->alt.candidate_vlan_index; + } + qp->alt.candidate_vid = 0xFFFF; + qp->alt.update_vid = 0; + } + return err; } diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 1c3634eab5e1..706a6d2b538c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -52,6 +52,8 @@ struct mac_res { struct list_head list; u64 mac; + int ref_count; + u8 smac_index; u8 port; }; @@ -1683,11 +1685,39 @@ static int srq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, return err; } -static int mac_add_to_slave(struct mlx4_dev *dev, int slave, u64 mac, int port) +static int mac_find_smac_ix_in_slave(struct mlx4_dev *dev, int slave, int port, + u8 smac_index, u64 *mac) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; - struct mac_res *res; + struct list_head *mac_list = + &tracker->slave_list[slave].res_list[RES_MAC]; + struct mac_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, mac_list, list) { + if (res->smac_index == smac_index && res->port == (u8) port) { + *mac = res->mac; + return 0; + } + } + return -ENOENT; +} + +static int mac_add_to_slave(struct mlx4_dev *dev, int slave, u64 mac, int port, u8 smac_index) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mac_list = + &tracker->slave_list[slave].res_list[RES_MAC]; + struct mac_res *res, *tmp; + + list_for_each_entry_safe(res, tmp, mac_list, list) { + if (res->mac == mac && res->port == (u8) port) { + /* mac found. update ref count */ + ++res->ref_count; + return 0; + } + } if (mlx4_grant_resource(dev, slave, RES_MAC, 1, port)) return -EINVAL; @@ -1698,6 +1728,8 @@ static int mac_add_to_slave(struct mlx4_dev *dev, int slave, u64 mac, int port) } res->mac = mac; res->port = (u8) port; + res->smac_index = smac_index; + res->ref_count = 1; list_add_tail(&res->list, &tracker->slave_list[slave].res_list[RES_MAC]); return 0; @@ -1714,9 +1746,11 @@ static void mac_del_from_slave(struct mlx4_dev *dev, int slave, u64 mac, list_for_each_entry_safe(res, tmp, mac_list, list) { if (res->mac == mac && res->port == (u8) port) { - list_del(&res->list); - mlx4_release_resource(dev, slave, RES_MAC, 1, port); - kfree(res); + if (!--res->ref_count) { + list_del(&res->list); + mlx4_release_resource(dev, slave, RES_MAC, 1, port); + kfree(res); + } break; } } @@ -1729,10 +1763,13 @@ static void rem_slave_macs(struct mlx4_dev *dev, int slave) struct list_head *mac_list = &tracker->slave_list[slave].res_list[RES_MAC]; struct mac_res *res, *tmp; + int i; list_for_each_entry_safe(res, tmp, mac_list, list) { list_del(&res->list); - __mlx4_unregister_mac(dev, res->port, res->mac); + /* dereference the mac the num times the slave referenced it */ + for (i = 0; i < res->ref_count; i++) + __mlx4_unregister_mac(dev, res->port, res->mac); mlx4_release_resource(dev, slave, RES_MAC, 1, res->port); kfree(res); } @@ -1744,6 +1781,7 @@ static int mac_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, int err = -EINVAL; int port; u64 mac; + u8 smac_index; if (op != RES_OP_RESERVE_AND_MAP) return err; @@ -1753,12 +1791,13 @@ static int mac_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, err = __mlx4_register_mac(dev, port, mac); if (err >= 0) { + smac_index = err; set_param_l(out_param, err); err = 0; } if (!err) { - err = mac_add_to_slave(dev, slave, mac, port); + err = mac_add_to_slave(dev, slave, mac, port, smac_index); if (err) __mlx4_unregister_mac(dev, port, mac); } @@ -3306,6 +3345,25 @@ int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd); } +static int roce_verify_mac(struct mlx4_dev *dev, int slave, + struct mlx4_qp_context *qpc, + struct mlx4_cmd_mailbox *inbox) +{ + u64 mac; + int port; + u32 ts = (be32_to_cpu(qpc->flags) >> 16) & 0xff; + u8 sched = *(u8 *)(inbox->buf + 64); + u8 smac_ix; + + port = (sched >> 6 & 1) + 1; + if (mlx4_is_eth(dev, port) && (ts != MLX4_QP_ST_MLX)) { + smac_ix = qpc->pri_path.grh_mylmc & 0x7f; + if (mac_find_smac_ix_in_slave(dev, slave, port, smac_ix, &mac)) + return -ENOENT; + } + return 0; +} + int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -3328,6 +3386,9 @@ int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave, if (err) return err; + if (roce_verify_mac(dev, slave, qpc, inbox)) + return -EINVAL; + update_pkey_index(dev, slave, inbox); update_gid(dev, inbox, (u8)slave); adjust_proxy_tun_qkey(dev, vhcr, qpc); From 5ea8bbfc49291b7e23161fe4de0bf3e4a4e34b18 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 12 Mar 2014 12:00:41 +0200 Subject: [PATCH 5/7] mlx4: Implement IP based gids support for RoCE/SRIOV Since there is no connection between the MAC/VLAN and the GID when using IP-based addressing, the proxy QP1 (running on the slave) must pass the source-mac, destination-mac, and vlan_id information separately from the GID. Additionally, the Host must pass the remote source-mac and vlan_id back to the slave, This is achieved as follows: Outgoing MADs: 1. Source MAC: obtained from the CQ completion structure (struct ib_wc, smac field). 2. Destination MAC: obtained from the tunnel header 3. vlan_id: obtained from the tunnel header. Incoming MADs 1. The source (i.e., remote) MAC and vlan_id are passed in the tunnel header to the proxy QP1. VST mode support: For outgoing MADs, the vlan_id obtained from the header is discarded, and the vlan_id specified by the Hypervisor is used instead. For incoming MADs, the incoming vlan_id (in the wc) is discarded, and the "invalid" vlan (0xffff) is substituted when forwarding to the slave. Signed-off-by: Moni Shoua Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/cq.c | 42 ++++++++++++++-------- drivers/infiniband/hw/mlx4/mad.c | 45 +++++++++++++++++++++--- drivers/infiniband/hw/mlx4/mcg.c | 5 +-- drivers/infiniband/hw/mlx4/mlx4_ib.h | 5 ++- drivers/infiniband/hw/mlx4/qp.c | 11 ++++-- drivers/net/ethernet/mellanox/mlx4/cmd.c | 24 +++++++++++++ include/linux/mlx4/cmd.h | 7 ++++ include/linux/mlx4/device.h | 3 +- 8 files changed, 117 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index cc40f08ca8f1..5f640814cc81 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -564,7 +564,7 @@ static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum) } static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc, - unsigned tail, struct mlx4_cqe *cqe) + unsigned tail, struct mlx4_cqe *cqe, int is_eth) { struct mlx4_ib_proxy_sqp_hdr *hdr; @@ -574,12 +574,20 @@ static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct DMA_FROM_DEVICE); hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr); wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index); - wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); - wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF; wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0; wc->dlid_path_bits = 0; + if (is_eth) { + wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid); + memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4); + memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + } else { + wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32); + wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12); + } + return 0; } @@ -594,6 +602,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, struct mlx4_srq *msrq = NULL; int is_send; int is_error; + int is_eth; u32 g_mlpath_rqpn; u16 wqe_ctr; unsigned tail = 0; @@ -778,11 +787,15 @@ repoll: break; } + is_eth = (rdma_port_get_link_layer(wc->qp->device, + (*cur_qp)->port) == + IB_LINK_LAYER_ETHERNET); if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) { if ((*cur_qp)->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) - return use_tunnel_data(*cur_qp, cq, wc, tail, cqe); + return use_tunnel_data(*cur_qp, cq, wc, tail, + cqe, is_eth); } wc->slid = be16_to_cpu(cqe->rlid); @@ -793,20 +806,21 @@ repoll: wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f; wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status, cqe->checksum) ? IB_WC_IP_CSUM_OK : 0; - if (rdma_port_get_link_layer(wc->qp->device, - (*cur_qp)->port) == IB_LINK_LAYER_ETHERNET) + if (is_eth) { wc->sl = be16_to_cpu(cqe->sl_vid) >> 13; - else - wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; - if (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_VLAN_PRESENT_MASK) { - wc->vlan_id = be16_to_cpu(cqe->sl_vid) & - MLX4_CQE_VID_MASK; + if (be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_VLAN_PRESENT_MASK) { + wc->vlan_id = be16_to_cpu(cqe->sl_vid) & + MLX4_CQE_VID_MASK; + } else { + wc->vlan_id = 0xffff; + } + memcpy(wc->smac, cqe->smac, ETH_ALEN); + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); } else { + wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; wc->vlan_id = 0xffff; } - wc->wc_flags |= IB_WC_WITH_VLAN; - memcpy(wc->smac, cqe->smac, ETH_ALEN); - wc->wc_flags |= IB_WC_WITH_SMAC; } return 0; diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index c5bca0f0da4a..2c572aed3f6f 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -545,11 +545,36 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, /* adjust tunnel data */ tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix); - tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); - tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF); tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0; + if (is_eth) { + u16 vlan = 0; + if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan, + NULL)) { + /* VST mode */ + if (vlan != wc->vlan_id) + /* Packet vlan is not the VST-assigned vlan. + * Drop the packet. + */ + goto out; + else + /* Remove the vlan tag before forwarding + * the packet to the VF. + */ + vlan = 0xffff; + } else { + vlan = wc->vlan_id; + } + + tun_mad->hdr.sl_vid = cpu_to_be16(vlan); + memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4); + memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); + } else { + tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); + tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); + } + ib_dma_sync_single_for_device(&dev->ib_dev, tun_qp->tx_ring[tun_tx_ix].buf.map, sizeof (struct mlx4_rcv_tunnel_mad), @@ -1116,8 +1141,9 @@ static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave) int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, - enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, - u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad) + enum ib_qp_type dest_qpt, u16 pkey_index, + u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr, + u8 *s_mac, struct ib_mad *mad) { struct ib_sge list; struct ib_send_wr wr, *bad_wr; @@ -1206,6 +1232,9 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, wr.num_sge = 1; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; + if (s_mac) + memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6); + ret = ib_post_send(send_qp, &wr, &bad_wr); out: @@ -1331,13 +1360,19 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc if (ah_attr.ah_flags & IB_AH_GRH) fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr); + memcpy(ah_attr.dmac, tunnel->hdr.mac, 6); + ah_attr.vlan_id = be16_to_cpu(tunnel->hdr.vlan); + /* if slave have default vlan use it */ + mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave, + &ah_attr.vlan_id, &ah_attr.sl); + mlx4_ib_send_to_wire(dev, slave, ctx->port, is_proxy_qp0(dev, wc->src_qp, slave) ? IB_QPT_SMI : IB_QPT_GSI, be16_to_cpu(tunnel->hdr.pkey_index), be32_to_cpu(tunnel->hdr.remote_qpn), be32_to_cpu(tunnel->hdr.qkey), - &ah_attr, &tunnel->mad); + &ah_attr, wc->smac, &tunnel->mad); } static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c index 25b2cdff00f8..ed327e6c8fdc 100644 --- a/drivers/infiniband/hw/mlx4/mcg.c +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -215,8 +215,9 @@ static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad) } mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); spin_unlock(&dev->sm_lock); - return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), ctx->port, - IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, mad); + return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), + ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY, + &ah_attr, NULL, mad); } static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx, diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index febc8f9bc59a..f589522fddfd 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -737,9 +737,12 @@ void mlx4_ib_tunnels_update_work(struct work_struct *work); int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type qpt, struct ib_wc *wc, struct ib_grh *grh, struct ib_mad *mad); + int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, - u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad); + u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, + struct ib_mad *mad); + __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx); int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 11332f074023..aadf7f82e1f3 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -2152,7 +2152,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, } if (is_eth) { - u8 smac[6]; + u8 *smac; struct in6_addr in6; u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; @@ -2164,7 +2164,12 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); memcpy(&in6, sgid.raw, sizeof(in6)); - rdma_get_ll_mac(&in6, smac); + + if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) + smac = to_mdev(sqp->qp.ibqp.device)-> + iboe.netdevs[sqp->qp.port - 1]->dev_addr; + else /* use the src mac of the tunnel */ + smac = ah->av.eth.s_mac; memcpy(sqp->ud_header.eth.smac_h, smac, 6); if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); @@ -2396,6 +2401,8 @@ static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_ hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index); hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + memcpy(hdr.mac, ah->av.eth.mac, 6); + hdr.vlan = ah->av.eth.vlan; spc = MLX4_INLINE_ALIGN - ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 0d02fba94536..2b0b45ece14b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -2289,6 +2289,30 @@ int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos) } EXPORT_SYMBOL_GPL(mlx4_set_vf_vlan); + /* mlx4_get_slave_default_vlan - + * return true if VST ( default vlan) + * if VST, will return vlan & qos (if not NULL) + */ +bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, + u16 *vlan, u8 *qos) +{ + struct mlx4_vport_oper_state *vp_oper; + struct mlx4_priv *priv; + + priv = mlx4_priv(dev); + vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; + + if (MLX4_VGT != vp_oper->state.default_vlan) { + if (vlan) + *vlan = vp_oper->state.default_vlan; + if (qos) + *qos = vp_oper->state.default_qos; + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(mlx4_get_slave_default_vlan); + int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting) { struct mlx4_priv *priv = mlx4_priv(dev); diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index 79a347238168..009985628257 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -240,6 +240,13 @@ int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos); int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting); int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_info *ivf); int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state); +/* + * mlx4_get_slave_default_vlan - + * return true if VST ( default vlan) + * if VST, will return vlan & qos (if not NULL) + */ +bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, + u16 *vlan, u8 *qos); #define MLX4_COMM_GET_IF_REV(cmd_chan_ver) (u8)((cmd_chan_ver) >> 8) diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 86e02e5c2c77..f211b51dc726 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -632,7 +632,8 @@ struct mlx4_eth_av { u8 hop_limit; __be32 sl_tclass_flowlabel; u8 dgid[16]; - u32 reserved4[2]; + u8 s_mac[6]; + u8 reserved4[2]; __be16 vlan; u8 mac[ETH_ALEN]; }; From ceb5433b3a54979216d794e45147d25c24c94999 Mon Sep 17 00:00:00 2001 From: Shani Michaelli Date: Wed, 12 Mar 2014 12:00:42 +0200 Subject: [PATCH 6/7] mlx4_ib: Fix SIDR support of for UD QPs under SRIOV/RoCE * Handle CM_SIDR_REQ_ATTR_ID and CM_SIDR_REP_ATTR_ID in multiplex_cm_handler and demux_cm_handler. * Handle Service ID Resolution messages and REQ messages separately, for their formats are different. Signed-off-by: Shani Michaeli Signed-off-by: Matan Barak Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/cm.c | 72 ++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c index b8d911543783..56a593e0ae5d 100644 --- a/drivers/infiniband/hw/mlx4/cm.c +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -61,6 +61,11 @@ struct cm_generic_msg { __be32 remote_comm_id; }; +struct cm_sidr_generic_msg { + struct ib_mad_hdr hdr; + __be32 request_id; +}; + struct cm_req_msg { unsigned char unused[0x60]; union ib_gid primary_path_sgid; @@ -69,28 +74,62 @@ struct cm_req_msg { static void set_local_comm_id(struct ib_mad *mad, u32 cm_id) { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - msg->local_comm_id = cpu_to_be32(cm_id); + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->local_comm_id = cpu_to_be32(cm_id); + } } static u32 get_local_comm_id(struct ib_mad *mad) { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - - return be32_to_cpu(msg->local_comm_id); + if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + pr_err("trying to set local_comm_id in SIDR_REP\n"); + return -1; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->local_comm_id); + } } static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id) { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - msg->remote_comm_id = cpu_to_be32(cm_id); + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + msg->request_id = cpu_to_be32(cm_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->remote_comm_id = cpu_to_be32(cm_id); + } } static u32 get_remote_comm_id(struct ib_mad *mad) { - struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; - - return be32_to_cpu(msg->remote_comm_id); + if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { + struct cm_sidr_generic_msg *msg = + (struct cm_sidr_generic_msg *)mad; + return be32_to_cpu(msg->request_id); + } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + pr_err("trying to set remote_comm_id in SIDR_REQ\n"); + return -1; + } else { + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + return be32_to_cpu(msg->remote_comm_id); + } } static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad) @@ -282,19 +321,21 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id u32 sl_cm_id; int pv_cm_id = -1; - sl_cm_id = get_local_comm_id(mad); - if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || - mad->mad_hdr.attr_id == CM_REP_ATTR_ID) { + mad->mad_hdr.attr_id == CM_REP_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { + sl_cm_id = get_local_comm_id(mad); id = id_map_alloc(ibdev, slave_id, sl_cm_id); if (IS_ERR(id)) { mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n", __func__, slave_id, sl_cm_id); return PTR_ERR(id); } - } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID) { + } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) { return 0; } else { + sl_cm_id = get_local_comm_id(mad); id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); } @@ -320,7 +361,8 @@ int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, u32 pv_cm_id; struct id_map_entry *id; - if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID) { + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) { union ib_gid gid; if (!slave) From aa9a2d51a3e70b15a898bec7dde3ce5726fec641 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 12 Mar 2014 12:00:43 +0200 Subject: [PATCH 7/7] mlx4: Activate RoCE/SRIOV To activate RoCE/SRIOV, need to remove the following: 1. In mlx4_ib_add, need to remove the error return preventing initialization of a RoCE port under SRIOV. 2. In update_vport_qp_params (in resource_tracker.c) need to remove the error return when a RoCE RC or UD qp is detected. This error return causes the INIT-to-RTR qp transition to fail in the wrapper function under RoCE/SRIOV. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/main.c | 8 -------- drivers/net/ethernet/mellanox/mlx4/resource_tracker.c | 7 ------- 2 files changed, 15 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index f9c12e92fdd6..1d1750ef000a 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1888,14 +1888,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) pr_info_once("%s", mlx4_ib_version); - mlx4_foreach_non_ib_transport_port(i, dev) - num_ports++; - - if (mlx4_is_mfunc(dev) && num_ports) { - dev_err(&dev->pdev->dev, "RoCE is not supported over SRIOV as yet\n"); - return NULL; - } - num_ports = 0; mlx4_foreach_ib_transport_port(i, dev) num_ports++; diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 706a6d2b538c..74e490d70184 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -645,7 +645,6 @@ static int update_vport_qp_param(struct mlx4_dev *dev, struct mlx4_qp_context *qpc = inbox->buf + 8; struct mlx4_vport_oper_state *vp_oper; struct mlx4_priv *priv; - u32 qp_type; int port; port = (qpc->pri_path.sched_queue & 0x40) ? 2 : 1; @@ -653,12 +652,6 @@ static int update_vport_qp_param(struct mlx4_dev *dev, vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port]; if (MLX4_VGT != vp_oper->state.default_vlan) { - qp_type = (be32_to_cpu(qpc->flags) >> 16) & 0xff; - if (MLX4_QP_ST_RC == qp_type || - (MLX4_QP_ST_UD == qp_type && - !mlx4_is_qp_reserved(dev, qpn))) - return -EINVAL; - /* the reserved QPs (special, proxy, tunnel) * do not operate over vlans */