M7350v1_en_gpl

This commit is contained in:
T
2024-09-09 08:52:07 +00:00
commit f9cc65cfda
65988 changed files with 26357421 additions and 0 deletions
@@ -0,0 +1,12 @@
config INFINIBAND_ISER
tristate "iSCSI Extensions for RDMA (iSER)"
depends on SCSI && INET && INFINIBAND_ADDR_TRANS
select SCSI_ISCSI_ATTRS
---help---
Support for the iSCSI Extensions for RDMA (iSER) Protocol
over InfiniBand. This allows you to access storage devices
that speak iSCSI over iSER over InfiniBand.
The iSER protocol is defined by IETF.
See <http://www.ietf.org/rfc/rfc5046.txt>
and <http://members.infinibandta.org/kwspub/spec/Annex_iSER.PDF>
@@ -0,0 +1,4 @@
obj-$(CONFIG_INFINIBAND_ISER) += ib_iser.o
ib_iser-y := iser_verbs.o iser_initiator.o iser_memory.o \
iscsi_iser.o
@@ -0,0 +1,787 @@
/*
* iSCSI Initiator over iSER Data-Path
*
* Copyright (C) 2004 Dmitry Yusupov
* Copyright (C) 2004 Alex Aizman
* Copyright (C) 2005 Mike Christie
* Copyright (c) 2005, 2006 Voltaire, Inc. All rights reserved.
* maintained by openib-general@openib.org
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Credits:
* Christoph Hellwig
* FUJITA Tomonori
* Arne Redlich
* Zhenyu Wang
* Modified by:
* Erez Zilber
*/
#include <linux/types.h>
#include <linux/list.h>
#include <linux/hardirq.h>
#include <linux/kfifo.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/ioctl.h>
#include <linux/cdev.h>
#include <linux/in.h>
#include <linux/net.h>
#include <linux/scatterlist.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <net/sock.h>
#include <asm/uaccess.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_eh.h>
#include <scsi/scsi_tcq.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi.h>
#include <scsi/scsi_transport_iscsi.h>
#include "iscsi_iser.h"
static struct scsi_host_template iscsi_iser_sht;
static struct iscsi_transport iscsi_iser_transport;
static struct scsi_transport_template *iscsi_iser_scsi_transport;
static unsigned int iscsi_max_lun = 512;
module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO);
int iser_debug_level = 0;
MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover "
"v" DRV_VER " (" DRV_DATE ")");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz");
module_param_named(debug_level, iser_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)");
struct iser_global ig;
void
iscsi_iser_recv(struct iscsi_conn *conn,
struct iscsi_hdr *hdr, char *rx_data, int rx_data_len)
{
int rc = 0;
int datalen;
int ahslen;
/* verify PDU length */
datalen = ntoh24(hdr->dlength);
if (datalen > rx_data_len || (datalen + 4) < rx_data_len) {
iser_err("wrong datalen %d (hdr), %d (IB)\n",
datalen, rx_data_len);
rc = ISCSI_ERR_DATALEN;
goto error;
}
if (datalen != rx_data_len)
iser_dbg("aligned datalen (%d) hdr, %d (IB)\n",
datalen, rx_data_len);
/* read AHS */
ahslen = hdr->hlength * 4;
rc = iscsi_complete_pdu(conn, hdr, rx_data, rx_data_len);
if (rc && rc != ISCSI_ERR_NO_SCSI_CMD)
goto error;
return;
error:
iscsi_conn_failure(conn, rc);
}
static int iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)
{
struct iscsi_iser_task *iser_task = task->dd_data;
task->hdr = (struct iscsi_hdr *)&iser_task->desc.iscsi_header;
task->hdr_max = sizeof(iser_task->desc.iscsi_header);
return 0;
}
int iser_initialize_task_headers(struct iscsi_task *task,
struct iser_tx_desc *tx_desc)
{
struct iscsi_iser_conn *iser_conn = task->conn->dd_data;
struct iser_device *device = iser_conn->ib_conn->device;
struct iscsi_iser_task *iser_task = task->dd_data;
u64 dma_addr;
dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc,
ISER_HEADERS_LEN, DMA_TO_DEVICE);
if (ib_dma_mapping_error(device->ib_device, dma_addr))
return -ENOMEM;
tx_desc->dma_addr = dma_addr;
tx_desc->tx_sg[0].addr = tx_desc->dma_addr;
tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
tx_desc->tx_sg[0].lkey = device->mr->lkey;
iser_task->iser_conn = iser_conn;
return 0;
}
/**
* iscsi_iser_task_init - Initialize task
* @task: iscsi task
*
* Initialize the task for the scsi command or mgmt command.
*/
static int
iscsi_iser_task_init(struct iscsi_task *task)
{
struct iscsi_iser_task *iser_task = task->dd_data;
if (iser_initialize_task_headers(task, &iser_task->desc))
return -ENOMEM;
/* mgmt task */
if (!task->sc)
return 0;
iser_task->command_sent = 0;
iser_task_rdma_init(iser_task);
return 0;
}
/**
* iscsi_iser_mtask_xmit - xmit management(immediate) task
* @conn: iscsi connection
* @task: task management task
*
* Notes:
* The function can return -EAGAIN in which case caller must
* call it again later, or recover. '0' return code means successful
* xmit.
*
**/
static int
iscsi_iser_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task)
{
int error = 0;
iser_dbg("mtask xmit [cid %d itt 0x%x]\n", conn->id, task->itt);
error = iser_send_control(conn, task);
/* since iser xmits control with zero copy, tasks can not be recycled
* right after sending them.
* The recycling scheme is based on whether a response is expected
* - if yes, the task is recycled at iscsi_complete_pdu
* - if no, the task is recycled at iser_snd_completion
*/
return error;
}
static int
iscsi_iser_task_xmit_unsol_data(struct iscsi_conn *conn,
struct iscsi_task *task)
{
struct iscsi_r2t_info *r2t = &task->unsol_r2t;
struct iscsi_data hdr;
int error = 0;
/* Send data-out PDUs while there's still unsolicited data to send */
while (iscsi_task_has_unsol_data(task)) {
iscsi_prep_data_out_pdu(task, r2t, &hdr);
iser_dbg("Sending data-out: itt 0x%x, data count %d\n",
hdr.itt, r2t->data_count);
/* the buffer description has been passed with the command */
/* Send the command */
error = iser_send_data_out(conn, task, &hdr);
if (error) {
r2t->datasn--;
goto iscsi_iser_task_xmit_unsol_data_exit;
}
r2t->sent += r2t->data_count;
iser_dbg("Need to send %d more as data-out PDUs\n",
r2t->data_length - r2t->sent);
}
iscsi_iser_task_xmit_unsol_data_exit:
return error;
}
static int
iscsi_iser_task_xmit(struct iscsi_task *task)
{
struct iscsi_conn *conn = task->conn;
struct iscsi_iser_task *iser_task = task->dd_data;
int error = 0;
if (!task->sc)
return iscsi_iser_mtask_xmit(conn, task);
if (task->sc->sc_data_direction == DMA_TO_DEVICE) {
BUG_ON(scsi_bufflen(task->sc) == 0);
iser_dbg("cmd [itt %x total %d imm %d unsol_data %d\n",
task->itt, scsi_bufflen(task->sc),
task->imm_count, task->unsol_r2t.data_length);
}
iser_dbg("ctask xmit [cid %d itt 0x%x]\n",
conn->id, task->itt);
/* Send the cmd PDU */
if (!iser_task->command_sent) {
error = iser_send_command(conn, task);
if (error)
goto iscsi_iser_task_xmit_exit;
iser_task->command_sent = 1;
}
/* Send unsolicited data-out PDU(s) if necessary */
if (iscsi_task_has_unsol_data(task))
error = iscsi_iser_task_xmit_unsol_data(conn, task);
iscsi_iser_task_xmit_exit:
return error;
}
static void iscsi_iser_cleanup_task(struct iscsi_task *task)
{
struct iscsi_iser_task *iser_task = task->dd_data;
struct iser_tx_desc *tx_desc = &iser_task->desc;
struct iscsi_iser_conn *iser_conn = task->conn->dd_data;
struct iser_device *device = iser_conn->ib_conn->device;
ib_dma_unmap_single(device->ib_device,
tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
/* mgmt tasks do not need special cleanup */
if (!task->sc)
return;
if (iser_task->status == ISER_TASK_STATUS_STARTED) {
iser_task->status = ISER_TASK_STATUS_COMPLETED;
iser_task_rdma_finalize(iser_task);
}
}
static struct iscsi_cls_conn *
iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx)
{
struct iscsi_conn *conn;
struct iscsi_cls_conn *cls_conn;
struct iscsi_iser_conn *iser_conn;
cls_conn = iscsi_conn_setup(cls_session, sizeof(*iser_conn), conn_idx);
if (!cls_conn)
return NULL;
conn = cls_conn->dd_data;
/*
* due to issues with the login code re iser sematics
* this not set in iscsi_conn_setup - FIXME
*/
conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN;
iser_conn = conn->dd_data;
conn->dd_data = iser_conn;
iser_conn->iscsi_conn = conn;
return cls_conn;
}
static void
iscsi_iser_conn_destroy(struct iscsi_cls_conn *cls_conn)
{
struct iscsi_conn *conn = cls_conn->dd_data;
struct iscsi_iser_conn *iser_conn = conn->dd_data;
struct iser_conn *ib_conn = iser_conn->ib_conn;
iscsi_conn_teardown(cls_conn);
/*
* Userspace will normally call the stop callback and
* already have freed the ib_conn, but if it goofed up then
* we free it here.
*/
if (ib_conn) {
ib_conn->iser_conn = NULL;
iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */
}
}
static int
iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
struct iscsi_cls_conn *cls_conn, uint64_t transport_eph,
int is_leading)
{
struct iscsi_conn *conn = cls_conn->dd_data;
struct iscsi_iser_conn *iser_conn;
struct iser_conn *ib_conn;
struct iscsi_endpoint *ep;
int error;
error = iscsi_conn_bind(cls_session, cls_conn, is_leading);
if (error)
return error;
/* the transport ep handle comes from user space so it must be
* verified against the global ib connections list */
ep = iscsi_lookup_endpoint(transport_eph);
if (!ep) {
iser_err("can't bind eph %llx\n",
(unsigned long long)transport_eph);
return -EINVAL;
}
ib_conn = ep->dd_data;
if (iser_alloc_rx_descriptors(ib_conn))
return -ENOMEM;
/* binds the iSER connection retrieved from the previously
* connected ep_handle to the iSCSI layer connection. exchanges
* connection pointers */
iser_err("binding iscsi/iser conn %p %p to ib_conn %p\n",
conn, conn->dd_data, ib_conn);
iser_conn = conn->dd_data;
ib_conn->iser_conn = iser_conn;
iser_conn->ib_conn = ib_conn;
iser_conn_get(ib_conn); /* ref iscsi/ib conn binding */
return 0;
}
static void
iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
{
struct iscsi_conn *conn = cls_conn->dd_data;
struct iscsi_iser_conn *iser_conn = conn->dd_data;
struct iser_conn *ib_conn = iser_conn->ib_conn;
/*
* Userspace may have goofed up and not bound the connection or
* might have only partially setup the connection.
*/
if (ib_conn) {
iscsi_conn_stop(cls_conn, flag);
/*
* There is no unbind event so the stop callback
* must release the ref from the bind.
*/
iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */
}
iser_conn->ib_conn = NULL;
}
static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session)
{
struct Scsi_Host *shost = iscsi_session_to_shost(cls_session);
iscsi_session_teardown(cls_session);
iscsi_host_remove(shost);
iscsi_host_free(shost);
}
static struct iscsi_cls_session *
iscsi_iser_session_create(struct iscsi_endpoint *ep,
uint16_t cmds_max, uint16_t qdepth,
uint32_t initial_cmdsn)
{
struct iscsi_cls_session *cls_session;
struct iscsi_session *session;
struct Scsi_Host *shost;
struct iser_conn *ib_conn;
shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0);
if (!shost)
return NULL;
shost->transportt = iscsi_iser_scsi_transport;
shost->max_lun = iscsi_max_lun;
shost->max_id = 0;
shost->max_channel = 0;
shost->max_cmd_len = 16;
/*
* older userspace tools (before 2.0-870) did not pass us
* the leading conn's ep so this will be NULL;
*/
if (ep)
ib_conn = ep->dd_data;
if (iscsi_host_add(shost,
ep ? ib_conn->device->ib_device->dma_device : NULL))
goto free_host;
/*
* we do not support setting can_queue cmd_per_lun from userspace yet
* because we preallocate so many resources
*/
cls_session = iscsi_session_setup(&iscsi_iser_transport, shost,
ISCSI_DEF_XMIT_CMDS_MAX, 0,
sizeof(struct iscsi_iser_task),
initial_cmdsn, 0);
if (!cls_session)
goto remove_host;
session = cls_session->dd_data;
shost->can_queue = session->scsi_cmds_max;
return cls_session;
remove_host:
iscsi_host_remove(shost);
free_host:
iscsi_host_free(shost);
return NULL;
}
static int
iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn,
enum iscsi_param param, char *buf, int buflen)
{
int value;
switch (param) {
case ISCSI_PARAM_MAX_RECV_DLENGTH:
/* TBD */
break;
case ISCSI_PARAM_HDRDGST_EN:
sscanf(buf, "%d", &value);
if (value) {
printk(KERN_ERR "DataDigest wasn't negotiated to None");
return -EPROTO;
}
break;
case ISCSI_PARAM_DATADGST_EN:
sscanf(buf, "%d", &value);
if (value) {
printk(KERN_ERR "DataDigest wasn't negotiated to None");
return -EPROTO;
}
break;
case ISCSI_PARAM_IFMARKER_EN:
sscanf(buf, "%d", &value);
if (value) {
printk(KERN_ERR "IFMarker wasn't negotiated to No");
return -EPROTO;
}
break;
case ISCSI_PARAM_OFMARKER_EN:
sscanf(buf, "%d", &value);
if (value) {
printk(KERN_ERR "OFMarker wasn't negotiated to No");
return -EPROTO;
}
break;
default:
return iscsi_set_param(cls_conn, param, buf, buflen);
}
return 0;
}
static void
iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *stats)
{
struct iscsi_conn *conn = cls_conn->dd_data;
stats->txdata_octets = conn->txdata_octets;
stats->rxdata_octets = conn->rxdata_octets;
stats->scsicmd_pdus = conn->scsicmd_pdus_cnt;
stats->dataout_pdus = conn->dataout_pdus_cnt;
stats->scsirsp_pdus = conn->scsirsp_pdus_cnt;
stats->datain_pdus = conn->datain_pdus_cnt; /* always 0 */
stats->r2t_pdus = conn->r2t_pdus_cnt; /* always 0 */
stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt;
stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt;
stats->custom_length = 4;
strcpy(stats->custom[0].desc, "qp_tx_queue_full");
stats->custom[0].value = 0; /* TB iser_conn->qp_tx_queue_full; */
strcpy(stats->custom[1].desc, "fmr_map_not_avail");
stats->custom[1].value = 0; /* TB iser_conn->fmr_map_not_avail */;
strcpy(stats->custom[2].desc, "eh_abort_cnt");
stats->custom[2].value = conn->eh_abort_cnt;
strcpy(stats->custom[3].desc, "fmr_unalign_cnt");
stats->custom[3].value = conn->fmr_unalign_cnt;
}
static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep,
enum iscsi_param param, char *buf)
{
struct iser_conn *ib_conn = ep->dd_data;
int len;
switch (param) {
case ISCSI_PARAM_CONN_PORT:
case ISCSI_PARAM_CONN_ADDRESS:
if (!ib_conn || !ib_conn->cma_id)
return -ENOTCONN;
return iscsi_conn_get_addr_param((struct sockaddr_storage *)
&ib_conn->cma_id->route.addr.dst_addr,
param, buf);
break;
default:
return -ENOSYS;
}
return len;
}
static struct iscsi_endpoint *
iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr,
int non_blocking)
{
int err;
struct iser_conn *ib_conn;
struct iscsi_endpoint *ep;
ep = iscsi_create_endpoint(sizeof(*ib_conn));
if (!ep)
return ERR_PTR(-ENOMEM);
ib_conn = ep->dd_data;
ib_conn->ep = ep;
iser_conn_init(ib_conn);
err = iser_connect(ib_conn, NULL, (struct sockaddr_in *)dst_addr,
non_blocking);
if (err) {
iscsi_destroy_endpoint(ep);
return ERR_PTR(err);
}
return ep;
}
static int
iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
{
struct iser_conn *ib_conn;
int rc;
ib_conn = ep->dd_data;
rc = wait_event_interruptible_timeout(ib_conn->wait,
ib_conn->state == ISER_CONN_UP,
msecs_to_jiffies(timeout_ms));
/* if conn establishment failed, return error code to iscsi */
if (!rc &&
(ib_conn->state == ISER_CONN_TERMINATING ||
ib_conn->state == ISER_CONN_DOWN))
rc = -1;
iser_err("ib conn %p rc = %d\n", ib_conn, rc);
if (rc > 0)
return 1; /* success, this is the equivalent of POLLOUT */
else if (!rc)
return 0; /* timeout */
else
return rc; /* signal */
}
static void
iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep)
{
struct iser_conn *ib_conn;
ib_conn = ep->dd_data;
if (ib_conn->iser_conn)
/*
* Must suspend xmit path if the ep is bound to the
* iscsi_conn, so we know we are not accessing the ib_conn
* when we free it.
*
* This may not be bound if the ep poll failed.
*/
iscsi_suspend_tx(ib_conn->iser_conn->iscsi_conn);
iser_err("ib conn %p state %d\n",ib_conn, ib_conn->state);
iser_conn_terminate(ib_conn);
}
static umode_t iser_attr_is_visible(int param_type, int param)
{
switch (param_type) {
case ISCSI_HOST_PARAM:
switch (param) {
case ISCSI_HOST_PARAM_NETDEV_NAME:
case ISCSI_HOST_PARAM_HWADDRESS:
case ISCSI_HOST_PARAM_INITIATOR_NAME:
return S_IRUGO;
default:
return 0;
}
case ISCSI_PARAM:
switch (param) {
case ISCSI_PARAM_MAX_RECV_DLENGTH:
case ISCSI_PARAM_MAX_XMIT_DLENGTH:
case ISCSI_PARAM_HDRDGST_EN:
case ISCSI_PARAM_DATADGST_EN:
case ISCSI_PARAM_CONN_ADDRESS:
case ISCSI_PARAM_CONN_PORT:
case ISCSI_PARAM_EXP_STATSN:
case ISCSI_PARAM_PERSISTENT_ADDRESS:
case ISCSI_PARAM_PERSISTENT_PORT:
case ISCSI_PARAM_PING_TMO:
case ISCSI_PARAM_RECV_TMO:
case ISCSI_PARAM_INITIAL_R2T_EN:
case ISCSI_PARAM_MAX_R2T:
case ISCSI_PARAM_IMM_DATA_EN:
case ISCSI_PARAM_FIRST_BURST:
case ISCSI_PARAM_MAX_BURST:
case ISCSI_PARAM_PDU_INORDER_EN:
case ISCSI_PARAM_DATASEQ_INORDER_EN:
case ISCSI_PARAM_TARGET_NAME:
case ISCSI_PARAM_TPGT:
case ISCSI_PARAM_USERNAME:
case ISCSI_PARAM_PASSWORD:
case ISCSI_PARAM_USERNAME_IN:
case ISCSI_PARAM_PASSWORD_IN:
case ISCSI_PARAM_FAST_ABORT:
case ISCSI_PARAM_ABORT_TMO:
case ISCSI_PARAM_LU_RESET_TMO:
case ISCSI_PARAM_TGT_RESET_TMO:
case ISCSI_PARAM_IFACE_NAME:
case ISCSI_PARAM_INITIATOR_NAME:
return S_IRUGO;
default:
return 0;
}
}
return 0;
}
static struct scsi_host_template iscsi_iser_sht = {
.module = THIS_MODULE,
.name = "iSCSI Initiator over iSER, v." DRV_VER,
.queuecommand = iscsi_queuecommand,
.change_queue_depth = iscsi_change_queue_depth,
.sg_tablesize = ISCSI_ISER_SG_TABLESIZE,
.max_sectors = 1024,
.cmd_per_lun = ISER_DEF_CMD_PER_LUN,
.eh_abort_handler = iscsi_eh_abort,
.eh_device_reset_handler= iscsi_eh_device_reset,
.eh_target_reset_handler = iscsi_eh_recover_target,
.target_alloc = iscsi_target_alloc,
.use_clustering = DISABLE_CLUSTERING,
.proc_name = "iscsi_iser",
.this_id = -1,
};
static struct iscsi_transport iscsi_iser_transport = {
.owner = THIS_MODULE,
.name = "iser",
.caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T,
/* session management */
.create_session = iscsi_iser_session_create,
.destroy_session = iscsi_iser_session_destroy,
/* connection management */
.create_conn = iscsi_iser_conn_create,
.bind_conn = iscsi_iser_conn_bind,
.destroy_conn = iscsi_iser_conn_destroy,
.attr_is_visible = iser_attr_is_visible,
.set_param = iscsi_iser_set_param,
.get_conn_param = iscsi_conn_get_param,
.get_ep_param = iscsi_iser_get_ep_param,
.get_session_param = iscsi_session_get_param,
.start_conn = iscsi_conn_start,
.stop_conn = iscsi_iser_conn_stop,
/* iscsi host params */
.get_host_param = iscsi_host_get_param,
.set_host_param = iscsi_host_set_param,
/* IO */
.send_pdu = iscsi_conn_send_pdu,
.get_stats = iscsi_iser_conn_get_stats,
.init_task = iscsi_iser_task_init,
.xmit_task = iscsi_iser_task_xmit,
.cleanup_task = iscsi_iser_cleanup_task,
.alloc_pdu = iscsi_iser_pdu_alloc,
/* recovery */
.session_recovery_timedout = iscsi_session_recovery_timedout,
.ep_connect = iscsi_iser_ep_connect,
.ep_poll = iscsi_iser_ep_poll,
.ep_disconnect = iscsi_iser_ep_disconnect
};
static int __init iser_init(void)
{
int err;
iser_dbg("Starting iSER datamover...\n");
if (iscsi_max_lun < 1) {
printk(KERN_ERR "Invalid max_lun value of %u\n", iscsi_max_lun);
return -EINVAL;
}
memset(&ig, 0, sizeof(struct iser_global));
ig.desc_cache = kmem_cache_create("iser_descriptors",
sizeof(struct iser_tx_desc),
0, SLAB_HWCACHE_ALIGN,
NULL);
if (ig.desc_cache == NULL)
return -ENOMEM;
/* device init is called only after the first addr resolution */
mutex_init(&ig.device_list_mutex);
INIT_LIST_HEAD(&ig.device_list);
mutex_init(&ig.connlist_mutex);
INIT_LIST_HEAD(&ig.connlist);
iscsi_iser_scsi_transport = iscsi_register_transport(
&iscsi_iser_transport);
if (!iscsi_iser_scsi_transport) {
iser_err("iscsi_register_transport failed\n");
err = -EINVAL;
goto register_transport_failure;
}
return 0;
register_transport_failure:
kmem_cache_destroy(ig.desc_cache);
return err;
}
static void __exit iser_exit(void)
{
iser_dbg("Removing iSER datamover...\n");
iscsi_unregister_transport(&iscsi_iser_transport);
kmem_cache_destroy(ig.desc_cache);
}
module_init(iser_init);
module_exit(iser_exit);
@@ -0,0 +1,370 @@
/*
* iSER transport for the Open iSCSI Initiator & iSER transport internals
*
* Copyright (C) 2004 Dmitry Yusupov
* Copyright (C) 2004 Alex Aizman
* Copyright (C) 2005 Mike Christie
* based on code maintained by open-iscsi@googlegroups.com
*
* Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
* Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef __ISCSI_ISER_H__
#define __ISCSI_ISER_H__
#include <linux/types.h>
#include <linux/net.h>
#include <scsi/libiscsi.h>
#include <scsi/scsi_transport_iscsi.h>
#include <linux/interrupt.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/dma-mapping.h>
#include <linux/mutex.h>
#include <linux/mempool.h>
#include <linux/uio.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_fmr_pool.h>
#include <rdma/rdma_cm.h>
#define DRV_NAME "iser"
#define PFX DRV_NAME ": "
#define DRV_VER "0.1"
#define DRV_DATE "May 7th, 2006"
#define iser_dbg(fmt, arg...) \
do { \
if (iser_debug_level > 1) \
printk(KERN_DEBUG PFX "%s:" fmt,\
__func__ , ## arg); \
} while (0)
#define iser_warn(fmt, arg...) \
do { \
if (iser_debug_level > 0) \
printk(KERN_DEBUG PFX "%s:" fmt,\
__func__ , ## arg); \
} while (0)
#define iser_err(fmt, arg...) \
do { \
printk(KERN_ERR PFX "%s:" fmt, \
__func__ , ## arg); \
} while (0)
#define SHIFT_4K 12
#define SIZE_4K (1ULL << SHIFT_4K)
#define MASK_4K (~(SIZE_4K-1))
/* support up to 512KB in one RDMA */
#define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K)
#define ISER_DEF_CMD_PER_LUN 128
/* QP settings */
/* Maximal bounds on received asynchronous PDUs */
#define ISER_MAX_RX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */
#define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), *
* SCSI_TMFUNC(2), LOGOUT(1) */
#define ISER_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX)
#define ISER_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2)
/* the max TX (send) WR supported by the iSER QP is defined by *
* max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect *
* to have at max for SCSI command. The tx posting & completion handling code *
* supports -EAGAIN scheme where tx is suspended till the QP has room for more *
* send WR. D=8 comes from 64K/8K */
#define ISER_INFLIGHT_DATAOUTS 8
#define ISER_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \
(1 + ISER_INFLIGHT_DATAOUTS) + \
ISER_MAX_TX_MISC_PDUS + \
ISER_MAX_RX_MISC_PDUS)
#define ISER_VER 0x10
#define ISER_WSV 0x08
#define ISER_RSV 0x04
struct iser_hdr {
u8 flags;
u8 rsvd[3];
__be32 write_stag; /* write rkey */
__be64 write_va;
__be32 read_stag; /* read rkey */
__be64 read_va;
} __attribute__((packed));
/* Constant PDU lengths calculations */
#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr))
#define ISER_RECV_DATA_SEG_LEN 128
#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
/* Length of an object name string */
#define ISER_OBJECT_NAME_SIZE 64
enum iser_ib_conn_state {
ISER_CONN_INIT, /* descriptor allocd, no conn */
ISER_CONN_PENDING, /* in the process of being established */
ISER_CONN_UP, /* up and running */
ISER_CONN_TERMINATING, /* in the process of being terminated */
ISER_CONN_DOWN, /* shut down */
ISER_CONN_STATES_NUM
};
enum iser_task_status {
ISER_TASK_STATUS_INIT = 0,
ISER_TASK_STATUS_STARTED,
ISER_TASK_STATUS_COMPLETED
};
enum iser_data_dir {
ISER_DIR_IN = 0, /* to initiator */
ISER_DIR_OUT, /* from initiator */
ISER_DIRS_NUM
};
struct iser_data_buf {
void *buf; /* pointer to the sg list */
unsigned int size; /* num entries of this sg */
unsigned long data_len; /* total data len */
unsigned int dma_nents; /* returned by dma_map_sg */
char *copy_buf; /* allocated copy buf for SGs unaligned *
* for rdma which are copied */
struct scatterlist sg_single; /* SG-ified clone of a non SG SC or *
* unaligned SG */
};
/* fwd declarations */
struct iser_device;
struct iscsi_iser_conn;
struct iscsi_iser_task;
struct iscsi_endpoint;
struct iser_mem_reg {
u32 lkey;
u32 rkey;
u64 va;
u64 len;
void *mem_h;
int is_fmr;
};
struct iser_regd_buf {
struct iser_mem_reg reg; /* memory registration info */
void *virt_addr;
struct iser_device *device; /* device->device for dma_unmap */
enum dma_data_direction direction; /* direction for dma_unmap */
unsigned int data_size;
};
enum iser_desc_type {
ISCSI_TX_CONTROL ,
ISCSI_TX_SCSI_COMMAND,
ISCSI_TX_DATAOUT
};
struct iser_tx_desc {
struct iser_hdr iser_header;
struct iscsi_hdr iscsi_header;
enum iser_desc_type type;
u64 dma_addr;
/* sg[0] points to iser/iscsi headers, sg[1] optionally points to either
of immediate data, unsolicited data-out or control (login,text) */
struct ib_sge tx_sg[2];
int num_sge;
};
#define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \
sizeof(u64) + sizeof(struct ib_sge)))
struct iser_rx_desc {
struct iser_hdr iser_header;
struct iscsi_hdr iscsi_header;
char data[ISER_RECV_DATA_SEG_LEN];
u64 dma_addr;
struct ib_sge rx_sg;
char pad[ISER_RX_PAD_SIZE];
} __attribute__((packed));
struct iser_device {
struct ib_device *ib_device;
struct ib_pd *pd;
struct ib_cq *rx_cq;
struct ib_cq *tx_cq;
struct ib_mr *mr;
struct tasklet_struct cq_tasklet;
struct ib_event_handler event_handler;
struct list_head ig_list; /* entry in ig devices list */
int refcount;
};
struct iser_conn {
struct iscsi_iser_conn *iser_conn; /* iser conn for upcalls */
struct iscsi_endpoint *ep;
enum iser_ib_conn_state state; /* rdma connection state */
atomic_t refcount;
spinlock_t lock; /* used for state changes */
struct iser_device *device; /* device context */
struct rdma_cm_id *cma_id; /* CMA ID */
struct ib_qp *qp; /* QP */
struct ib_fmr_pool *fmr_pool; /* pool of IB FMRs */
wait_queue_head_t wait; /* waitq for conn/disconn */
int post_recv_buf_count; /* posted rx count */
atomic_t post_send_buf_count; /* posted tx count */
char name[ISER_OBJECT_NAME_SIZE];
struct iser_page_vec *page_vec; /* represents SG to fmr maps*
* maps serialized as tx is*/
struct list_head conn_list; /* entry in ig conn list */
char *login_buf;
char *login_req_buf, *login_resp_buf;
u64 login_req_dma, login_resp_dma;
unsigned int rx_desc_head;
struct iser_rx_desc *rx_descs;
struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX];
};
struct iscsi_iser_conn {
struct iscsi_conn *iscsi_conn;/* ptr to iscsi conn */
struct iser_conn *ib_conn; /* iSER IB conn */
};
struct iscsi_iser_task {
struct iser_tx_desc desc;
struct iscsi_iser_conn *iser_conn;
enum iser_task_status status;
int command_sent; /* set if command sent */
int dir[ISER_DIRS_NUM]; /* set if dir use*/
struct iser_regd_buf rdma_regd[ISER_DIRS_NUM];/* regd rdma buf */
struct iser_data_buf data[ISER_DIRS_NUM]; /* orig. data des*/
struct iser_data_buf data_copy[ISER_DIRS_NUM];/* contig. copy */
};
struct iser_page_vec {
u64 *pages;
int length;
int offset;
int data_size;
};
struct iser_global {
struct mutex device_list_mutex;/* */
struct list_head device_list; /* all iSER devices */
struct mutex connlist_mutex;
struct list_head connlist; /* all iSER IB connections */
struct kmem_cache *desc_cache;
};
extern struct iser_global ig;
extern int iser_debug_level;
/* allocate connection resources needed for rdma functionality */
int iser_conn_set_full_featured_mode(struct iscsi_conn *conn);
int iser_send_control(struct iscsi_conn *conn,
struct iscsi_task *task);
int iser_send_command(struct iscsi_conn *conn,
struct iscsi_task *task);
int iser_send_data_out(struct iscsi_conn *conn,
struct iscsi_task *task,
struct iscsi_data *hdr);
void iscsi_iser_recv(struct iscsi_conn *conn,
struct iscsi_hdr *hdr,
char *rx_data,
int rx_data_len);
void iser_conn_init(struct iser_conn *ib_conn);
void iser_conn_get(struct iser_conn *ib_conn);
int iser_conn_put(struct iser_conn *ib_conn, int destroy_cma_id_allowed);
void iser_conn_terminate(struct iser_conn *ib_conn);
void iser_rcv_completion(struct iser_rx_desc *desc,
unsigned long dto_xfer_len,
struct iser_conn *ib_conn);
void iser_snd_completion(struct iser_tx_desc *desc, struct iser_conn *ib_conn);
void iser_task_rdma_init(struct iscsi_iser_task *task);
void iser_task_rdma_finalize(struct iscsi_iser_task *task);
void iser_free_rx_descriptors(struct iser_conn *ib_conn);
void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *task,
enum iser_data_dir cmd_dir);
int iser_reg_rdma_mem(struct iscsi_iser_task *task,
enum iser_data_dir cmd_dir);
int iser_connect(struct iser_conn *ib_conn,
struct sockaddr_in *src_addr,
struct sockaddr_in *dst_addr,
int non_blocking);
int iser_reg_page_vec(struct iser_conn *ib_conn,
struct iser_page_vec *page_vec,
struct iser_mem_reg *mem_reg);
void iser_unreg_mem(struct iser_mem_reg *mem_reg);
int iser_post_recvl(struct iser_conn *ib_conn);
int iser_post_recvm(struct iser_conn *ib_conn, int count);
int iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc);
int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
struct iser_data_buf *data,
enum iser_data_dir iser_dir,
enum dma_data_direction dma_dir);
void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task);
int iser_initialize_task_headers(struct iscsi_task *task,
struct iser_tx_desc *tx_desc);
int iser_alloc_rx_descriptors(struct iser_conn *ib_conn);
#endif
@@ -0,0 +1,570 @@
/*
* Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/scatterlist.h>
#include <linux/kfifo.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_host.h>
#include "iscsi_iser.h"
/* Register user buffer memory and initialize passive rdma
* dto descriptor. Total data size is stored in
* iser_task->data[ISER_DIR_IN].data_len
*/
static int iser_prepare_read_cmd(struct iscsi_task *task,
unsigned int edtl)
{
struct iscsi_iser_task *iser_task = task->dd_data;
struct iser_regd_buf *regd_buf;
int err;
struct iser_hdr *hdr = &iser_task->desc.iser_header;
struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN];
err = iser_dma_map_task_data(iser_task,
buf_in,
ISER_DIR_IN,
DMA_FROM_DEVICE);
if (err)
return err;
if (edtl > iser_task->data[ISER_DIR_IN].data_len) {
iser_err("Total data length: %ld, less than EDTL: "
"%d, in READ cmd BHS itt: %d, conn: 0x%p\n",
iser_task->data[ISER_DIR_IN].data_len, edtl,
task->itt, iser_task->iser_conn);
return -EINVAL;
}
err = iser_reg_rdma_mem(iser_task,ISER_DIR_IN);
if (err) {
iser_err("Failed to set up Data-IN RDMA\n");
return err;
}
regd_buf = &iser_task->rdma_regd[ISER_DIR_IN];
hdr->flags |= ISER_RSV;
hdr->read_stag = cpu_to_be32(regd_buf->reg.rkey);
hdr->read_va = cpu_to_be64(regd_buf->reg.va);
iser_dbg("Cmd itt:%d READ tags RKEY:%#.4X VA:%#llX\n",
task->itt, regd_buf->reg.rkey,
(unsigned long long)regd_buf->reg.va);
return 0;
}
/* Register user buffer memory and initialize passive rdma
* dto descriptor. Total data size is stored in
* task->data[ISER_DIR_OUT].data_len
*/
static int
iser_prepare_write_cmd(struct iscsi_task *task,
unsigned int imm_sz,
unsigned int unsol_sz,
unsigned int edtl)
{
struct iscsi_iser_task *iser_task = task->dd_data;
struct iser_regd_buf *regd_buf;
int err;
struct iser_hdr *hdr = &iser_task->desc.iser_header;
struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT];
struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1];
err = iser_dma_map_task_data(iser_task,
buf_out,
ISER_DIR_OUT,
DMA_TO_DEVICE);
if (err)
return err;
if (edtl > iser_task->data[ISER_DIR_OUT].data_len) {
iser_err("Total data length: %ld, less than EDTL: %d, "
"in WRITE cmd BHS itt: %d, conn: 0x%p\n",
iser_task->data[ISER_DIR_OUT].data_len,
edtl, task->itt, task->conn);
return -EINVAL;
}
err = iser_reg_rdma_mem(iser_task,ISER_DIR_OUT);
if (err != 0) {
iser_err("Failed to register write cmd RDMA mem\n");
return err;
}
regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT];
if (unsol_sz < edtl) {
hdr->flags |= ISER_WSV;
hdr->write_stag = cpu_to_be32(regd_buf->reg.rkey);
hdr->write_va = cpu_to_be64(regd_buf->reg.va + unsol_sz);
iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X "
"VA:%#llX + unsol:%d\n",
task->itt, regd_buf->reg.rkey,
(unsigned long long)regd_buf->reg.va, unsol_sz);
}
if (imm_sz > 0) {
iser_dbg("Cmd itt:%d, WRITE, adding imm.data sz: %d\n",
task->itt, imm_sz);
tx_dsg->addr = regd_buf->reg.va;
tx_dsg->length = imm_sz;
tx_dsg->lkey = regd_buf->reg.lkey;
iser_task->desc.num_sge = 2;
}
return 0;
}
/* creates a new tx descriptor and adds header regd buffer */
static void iser_create_send_desc(struct iser_conn *ib_conn,
struct iser_tx_desc *tx_desc)
{
struct iser_device *device = ib_conn->device;
ib_dma_sync_single_for_cpu(device->ib_device,
tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
tx_desc->iser_header.flags = ISER_VER;
tx_desc->num_sge = 1;
if (tx_desc->tx_sg[0].lkey != device->mr->lkey) {
tx_desc->tx_sg[0].lkey = device->mr->lkey;
iser_dbg("sdesc %p lkey mismatch, fixing\n", tx_desc);
}
}
int iser_alloc_rx_descriptors(struct iser_conn *ib_conn)
{
int i, j;
u64 dma_addr;
struct iser_rx_desc *rx_desc;
struct ib_sge *rx_sg;
struct iser_device *device = ib_conn->device;
ib_conn->rx_descs = kmalloc(ISER_QP_MAX_RECV_DTOS *
sizeof(struct iser_rx_desc), GFP_KERNEL);
if (!ib_conn->rx_descs)
goto rx_desc_alloc_fail;
rx_desc = ib_conn->rx_descs;
for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++) {
dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc,
ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
if (ib_dma_mapping_error(device->ib_device, dma_addr))
goto rx_desc_dma_map_failed;
rx_desc->dma_addr = dma_addr;
rx_sg = &rx_desc->rx_sg;
rx_sg->addr = rx_desc->dma_addr;
rx_sg->length = ISER_RX_PAYLOAD_SIZE;
rx_sg->lkey = device->mr->lkey;
}
ib_conn->rx_desc_head = 0;
return 0;
rx_desc_dma_map_failed:
rx_desc = ib_conn->rx_descs;
for (j = 0; j < i; j++, rx_desc++)
ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr,
ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
kfree(ib_conn->rx_descs);
ib_conn->rx_descs = NULL;
rx_desc_alloc_fail:
iser_err("failed allocating rx descriptors / data buffers\n");
return -ENOMEM;
}
void iser_free_rx_descriptors(struct iser_conn *ib_conn)
{
int i;
struct iser_rx_desc *rx_desc;
struct iser_device *device = ib_conn->device;
if (!ib_conn->rx_descs)
return;
rx_desc = ib_conn->rx_descs;
for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++)
ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr,
ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
kfree(ib_conn->rx_descs);
}
static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req)
{
struct iscsi_iser_conn *iser_conn = conn->dd_data;
iser_dbg("req op %x flags %x\n", req->opcode, req->flags);
/* check if this is the last login - going to full feature phase */
if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE)
return 0;
/*
* Check that there is one posted recv buffer (for the last login
* response) and no posted send buffers left - they must have been
* consumed during previous login phases.
*/
WARN_ON(iser_conn->ib_conn->post_recv_buf_count != 1);
WARN_ON(atomic_read(&iser_conn->ib_conn->post_send_buf_count) != 0);
iser_dbg("Initially post: %d\n", ISER_MIN_POSTED_RX);
/* Initial post receive buffers */
if (iser_post_recvm(iser_conn->ib_conn, ISER_MIN_POSTED_RX))
return -ENOMEM;
return 0;
}
/**
* iser_send_command - send command PDU
*/
int iser_send_command(struct iscsi_conn *conn,
struct iscsi_task *task)
{
struct iscsi_iser_conn *iser_conn = conn->dd_data;
struct iscsi_iser_task *iser_task = task->dd_data;
unsigned long edtl;
int err;
struct iser_data_buf *data_buf;
struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr;
struct scsi_cmnd *sc = task->sc;
struct iser_tx_desc *tx_desc = &iser_task->desc;
edtl = ntohl(hdr->data_length);
/* build the tx desc regd header and add it to the tx desc dto */
tx_desc->type = ISCSI_TX_SCSI_COMMAND;
iser_create_send_desc(iser_conn->ib_conn, tx_desc);
if (hdr->flags & ISCSI_FLAG_CMD_READ)
data_buf = &iser_task->data[ISER_DIR_IN];
else
data_buf = &iser_task->data[ISER_DIR_OUT];
if (scsi_sg_count(sc)) { /* using a scatter list */
data_buf->buf = scsi_sglist(sc);
data_buf->size = scsi_sg_count(sc);
}
data_buf->data_len = scsi_bufflen(sc);
if (hdr->flags & ISCSI_FLAG_CMD_READ) {
err = iser_prepare_read_cmd(task, edtl);
if (err)
goto send_command_error;
}
if (hdr->flags & ISCSI_FLAG_CMD_WRITE) {
err = iser_prepare_write_cmd(task,
task->imm_count,
task->imm_count +
task->unsol_r2t.data_length,
edtl);
if (err)
goto send_command_error;
}
iser_task->status = ISER_TASK_STATUS_STARTED;
err = iser_post_send(iser_conn->ib_conn, tx_desc);
if (!err)
return 0;
send_command_error:
iser_err("conn %p failed task->itt %d err %d\n",conn, task->itt, err);
return err;
}
/**
* iser_send_data_out - send data out PDU
*/
int iser_send_data_out(struct iscsi_conn *conn,
struct iscsi_task *task,
struct iscsi_data *hdr)
{
struct iscsi_iser_conn *iser_conn = conn->dd_data;
struct iscsi_iser_task *iser_task = task->dd_data;
struct iser_tx_desc *tx_desc = NULL;
struct iser_regd_buf *regd_buf;
unsigned long buf_offset;
unsigned long data_seg_len;
uint32_t itt;
int err = 0;
struct ib_sge *tx_dsg;
itt = (__force uint32_t)hdr->itt;
data_seg_len = ntoh24(hdr->dlength);
buf_offset = ntohl(hdr->offset);
iser_dbg("%s itt %d dseg_len %d offset %d\n",
__func__,(int)itt,(int)data_seg_len,(int)buf_offset);
tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC);
if (tx_desc == NULL) {
iser_err("Failed to alloc desc for post dataout\n");
return -ENOMEM;
}
tx_desc->type = ISCSI_TX_DATAOUT;
tx_desc->iser_header.flags = ISER_VER;
memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr));
/* build the tx desc */
iser_initialize_task_headers(task, tx_desc);
regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT];
tx_dsg = &tx_desc->tx_sg[1];
tx_dsg->addr = regd_buf->reg.va + buf_offset;
tx_dsg->length = data_seg_len;
tx_dsg->lkey = regd_buf->reg.lkey;
tx_desc->num_sge = 2;
if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) {
iser_err("Offset:%ld & DSL:%ld in Data-Out "
"inconsistent with total len:%ld, itt:%d\n",
buf_offset, data_seg_len,
iser_task->data[ISER_DIR_OUT].data_len, itt);
err = -EINVAL;
goto send_data_out_error;
}
iser_dbg("data-out itt: %d, offset: %ld, sz: %ld\n",
itt, buf_offset, data_seg_len);
err = iser_post_send(iser_conn->ib_conn, tx_desc);
if (!err)
return 0;
send_data_out_error:
kmem_cache_free(ig.desc_cache, tx_desc);
iser_err("conn %p failed err %d\n",conn, err);
return err;
}
int iser_send_control(struct iscsi_conn *conn,
struct iscsi_task *task)
{
struct iscsi_iser_conn *iser_conn = conn->dd_data;
struct iscsi_iser_task *iser_task = task->dd_data;
struct iser_tx_desc *mdesc = &iser_task->desc;
unsigned long data_seg_len;
int err = 0;
struct iser_device *device;
struct iser_conn *ib_conn = iser_conn->ib_conn;
/* build the tx desc regd header and add it to the tx desc dto */
mdesc->type = ISCSI_TX_CONTROL;
iser_create_send_desc(iser_conn->ib_conn, mdesc);
device = iser_conn->ib_conn->device;
data_seg_len = ntoh24(task->hdr->dlength);
if (data_seg_len > 0) {
struct ib_sge *tx_dsg = &mdesc->tx_sg[1];
if (task != conn->login_task) {
iser_err("data present on non login task!!!\n");
goto send_control_error;
}
ib_dma_sync_single_for_cpu(device->ib_device,
ib_conn->login_req_dma, task->data_count,
DMA_TO_DEVICE);
memcpy(iser_conn->ib_conn->login_req_buf, task->data,
task->data_count);
ib_dma_sync_single_for_device(device->ib_device,
ib_conn->login_req_dma, task->data_count,
DMA_TO_DEVICE);
tx_dsg->addr = iser_conn->ib_conn->login_req_dma;
tx_dsg->length = task->data_count;
tx_dsg->lkey = device->mr->lkey;
mdesc->num_sge = 2;
}
if (task == conn->login_task) {
err = iser_post_recvl(iser_conn->ib_conn);
if (err)
goto send_control_error;
err = iser_post_rx_bufs(conn, task->hdr);
if (err)
goto send_control_error;
}
err = iser_post_send(iser_conn->ib_conn, mdesc);
if (!err)
return 0;
send_control_error:
iser_err("conn %p failed err %d\n",conn, err);
return err;
}
/**
* iser_rcv_dto_completion - recv DTO completion
*/
void iser_rcv_completion(struct iser_rx_desc *rx_desc,
unsigned long rx_xfer_len,
struct iser_conn *ib_conn)
{
struct iscsi_iser_conn *conn = ib_conn->iser_conn;
struct iscsi_hdr *hdr;
u64 rx_dma;
int rx_buflen, outstanding, count, err;
/* differentiate between login to all other PDUs */
if ((char *)rx_desc == ib_conn->login_resp_buf) {
rx_dma = ib_conn->login_resp_dma;
rx_buflen = ISER_RX_LOGIN_SIZE;
} else {
rx_dma = rx_desc->dma_addr;
rx_buflen = ISER_RX_PAYLOAD_SIZE;
}
ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma,
rx_buflen, DMA_FROM_DEVICE);
hdr = &rx_desc->iscsi_header;
iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,
hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN));
iscsi_iser_recv(conn->iscsi_conn, hdr,
rx_desc->data, rx_xfer_len - ISER_HEADERS_LEN);
ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma,
rx_buflen, DMA_FROM_DEVICE);
/* decrementing conn->post_recv_buf_count only --after-- freeing the *
* task eliminates the need to worry on tasks which are completed in *
* parallel to the execution of iser_conn_term. So the code that waits *
* for the posted rx bufs refcount to become zero handles everything */
conn->ib_conn->post_recv_buf_count--;
if (rx_dma == ib_conn->login_resp_dma)
return;
outstanding = ib_conn->post_recv_buf_count;
if (outstanding + ISER_MIN_POSTED_RX <= ISER_QP_MAX_RECV_DTOS) {
count = min(ISER_QP_MAX_RECV_DTOS - outstanding,
ISER_MIN_POSTED_RX);
err = iser_post_recvm(ib_conn, count);
if (err)
iser_err("posting %d rx bufs err %d\n", count, err);
}
}
void iser_snd_completion(struct iser_tx_desc *tx_desc,
struct iser_conn *ib_conn)
{
struct iscsi_task *task;
struct iser_device *device = ib_conn->device;
if (tx_desc->type == ISCSI_TX_DATAOUT) {
ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
ISER_HEADERS_LEN, DMA_TO_DEVICE);
kmem_cache_free(ig.desc_cache, tx_desc);
}
atomic_dec(&ib_conn->post_send_buf_count);
if (tx_desc->type == ISCSI_TX_CONTROL) {
/* this arithmetic is legal by libiscsi dd_data allocation */
task = (void *) ((long)(void *)tx_desc -
sizeof(struct iscsi_task));
if (task->hdr->itt == RESERVED_ITT)
iscsi_put_task(task);
}
}
void iser_task_rdma_init(struct iscsi_iser_task *iser_task)
{
iser_task->status = ISER_TASK_STATUS_INIT;
iser_task->dir[ISER_DIR_IN] = 0;
iser_task->dir[ISER_DIR_OUT] = 0;
iser_task->data[ISER_DIR_IN].data_len = 0;
iser_task->data[ISER_DIR_OUT].data_len = 0;
memset(&iser_task->rdma_regd[ISER_DIR_IN], 0,
sizeof(struct iser_regd_buf));
memset(&iser_task->rdma_regd[ISER_DIR_OUT], 0,
sizeof(struct iser_regd_buf));
}
void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
{
int is_rdma_aligned = 1;
struct iser_regd_buf *regd;
/* if we were reading, copy back to unaligned sglist,
* anyway dma_unmap and free the copy
*/
if (iser_task->data_copy[ISER_DIR_IN].copy_buf != NULL) {
is_rdma_aligned = 0;
iser_finalize_rdma_unaligned_sg(iser_task, ISER_DIR_IN);
}
if (iser_task->data_copy[ISER_DIR_OUT].copy_buf != NULL) {
is_rdma_aligned = 0;
iser_finalize_rdma_unaligned_sg(iser_task, ISER_DIR_OUT);
}
if (iser_task->dir[ISER_DIR_IN]) {
regd = &iser_task->rdma_regd[ISER_DIR_IN];
if (regd->reg.is_fmr)
iser_unreg_mem(&regd->reg);
}
if (iser_task->dir[ISER_DIR_OUT]) {
regd = &iser_task->rdma_regd[ISER_DIR_OUT];
if (regd->reg.is_fmr)
iser_unreg_mem(&regd->reg);
}
/* if the data was unaligned, it was already unmapped and then copied */
if (is_rdma_aligned)
iser_dma_unmap_task_data(iser_task);
}
@@ -0,0 +1,422 @@
/*
* Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/scatterlist.h>
#include "iscsi_iser.h"
#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */
/**
* iser_start_rdma_unaligned_sg
*/
static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
enum iser_data_dir cmd_dir)
{
int dma_nents;
struct ib_device *dev;
char *mem = NULL;
struct iser_data_buf *data = &iser_task->data[cmd_dir];
unsigned long cmd_data_len = data->data_len;
if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
mem = (void *)__get_free_pages(GFP_ATOMIC,
ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
else
mem = kmalloc(cmd_data_len, GFP_ATOMIC);
if (mem == NULL) {
iser_err("Failed to allocate mem size %d %d for copying sglist\n",
data->size,(int)cmd_data_len);
return -ENOMEM;
}
if (cmd_dir == ISER_DIR_OUT) {
/* copy the unaligned sg the buffer which is used for RDMA */
struct scatterlist *sgl = (struct scatterlist *)data->buf;
struct scatterlist *sg;
int i;
char *p, *from;
p = mem;
for_each_sg(sgl, sg, data->size, i) {
from = kmap_atomic(sg_page(sg));
memcpy(p,
from + sg->offset,
sg->length);
kunmap_atomic(from);
p += sg->length;
}
}
sg_init_one(&iser_task->data_copy[cmd_dir].sg_single, mem, cmd_data_len);
iser_task->data_copy[cmd_dir].buf =
&iser_task->data_copy[cmd_dir].sg_single;
iser_task->data_copy[cmd_dir].size = 1;
iser_task->data_copy[cmd_dir].copy_buf = mem;
dev = iser_task->iser_conn->ib_conn->device->ib_device;
dma_nents = ib_dma_map_sg(dev,
&iser_task->data_copy[cmd_dir].sg_single,
1,
(cmd_dir == ISER_DIR_OUT) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE);
BUG_ON(dma_nents == 0);
iser_task->data_copy[cmd_dir].dma_nents = dma_nents;
return 0;
}
/**
* iser_finalize_rdma_unaligned_sg
*/
void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
enum iser_data_dir cmd_dir)
{
struct ib_device *dev;
struct iser_data_buf *mem_copy;
unsigned long cmd_data_len;
dev = iser_task->iser_conn->ib_conn->device->ib_device;
mem_copy = &iser_task->data_copy[cmd_dir];
ib_dma_unmap_sg(dev, &mem_copy->sg_single, 1,
(cmd_dir == ISER_DIR_OUT) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE);
if (cmd_dir == ISER_DIR_IN) {
char *mem;
struct scatterlist *sgl, *sg;
unsigned char *p, *to;
unsigned int sg_size;
int i;
/* copy back read RDMA to unaligned sg */
mem = mem_copy->copy_buf;
sgl = (struct scatterlist *)iser_task->data[ISER_DIR_IN].buf;
sg_size = iser_task->data[ISER_DIR_IN].size;
p = mem;
for_each_sg(sgl, sg, sg_size, i) {
to = kmap_atomic(sg_page(sg));
memcpy(to + sg->offset,
p,
sg->length);
kunmap_atomic(to);
p += sg->length;
}
}
cmd_data_len = iser_task->data[cmd_dir].data_len;
if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
free_pages((unsigned long)mem_copy->copy_buf,
ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
else
kfree(mem_copy->copy_buf);
mem_copy->copy_buf = NULL;
}
#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0)
/**
* iser_sg_to_page_vec - Translates scatterlist entries to physical addresses
* and returns the length of resulting physical address array (may be less than
* the original due to possible compaction).
*
* we build a "page vec" under the assumption that the SG meets the RDMA
* alignment requirements. Other then the first and last SG elements, all
* the "internal" elements can be compacted into a list whose elements are
* dma addresses of physical pages. The code supports also the weird case
* where --few fragments of the same page-- are present in the SG as
* consecutive elements. Also, it handles one entry SG.
*/
static int iser_sg_to_page_vec(struct iser_data_buf *data,
struct iser_page_vec *page_vec,
struct ib_device *ibdev)
{
struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf;
u64 start_addr, end_addr, page, chunk_start = 0;
unsigned long total_sz = 0;
unsigned int dma_len;
int i, new_chunk, cur_page, last_ent = data->dma_nents - 1;
/* compute the offset of first element */
page_vec->offset = (u64) sgl[0].offset & ~MASK_4K;
new_chunk = 1;
cur_page = 0;
for_each_sg(sgl, sg, data->dma_nents, i) {
start_addr = ib_sg_dma_address(ibdev, sg);
if (new_chunk)
chunk_start = start_addr;
dma_len = ib_sg_dma_len(ibdev, sg);
end_addr = start_addr + dma_len;
total_sz += dma_len;
/* collect page fragments until aligned or end of SG list */
if (!IS_4K_ALIGNED(end_addr) && i < last_ent) {
new_chunk = 0;
continue;
}
new_chunk = 1;
/* address of the first page in the contiguous chunk;
masking relevant for the very first SG entry,
which might be unaligned */
page = chunk_start & MASK_4K;
do {
page_vec->pages[cur_page++] = page;
page += SIZE_4K;
} while (page < end_addr);
}
page_vec->data_size = total_sz;
iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size,cur_page);
return cur_page;
}
/**
* iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned
* for RDMA sub-list of a scatter-gather list of memory buffers, and returns
* the number of entries which are aligned correctly. Supports the case where
* consecutive SG elements are actually fragments of the same physcial page.
*/
static int iser_data_buf_aligned_len(struct iser_data_buf *data,
struct ib_device *ibdev)
{
struct scatterlist *sgl, *sg, *next_sg = NULL;
u64 start_addr, end_addr;
int i, ret_len, start_check = 0;
if (data->dma_nents == 1)
return 1;
sgl = (struct scatterlist *)data->buf;
start_addr = ib_sg_dma_address(ibdev, sgl);
for_each_sg(sgl, sg, data->dma_nents, i) {
if (start_check && !IS_4K_ALIGNED(start_addr))
break;
next_sg = sg_next(sg);
if (!next_sg)
break;
end_addr = start_addr + ib_sg_dma_len(ibdev, sg);
start_addr = ib_sg_dma_address(ibdev, next_sg);
if (end_addr == start_addr) {
start_check = 0;
continue;
} else
start_check = 1;
if (!IS_4K_ALIGNED(end_addr))
break;
}
ret_len = (next_sg) ? i : i+1;
iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n",
ret_len, data->dma_nents, data);
return ret_len;
}
static void iser_data_buf_dump(struct iser_data_buf *data,
struct ib_device *ibdev)
{
struct scatterlist *sgl = (struct scatterlist *)data->buf;
struct scatterlist *sg;
int i;
if (iser_debug_level == 0)
return;
for_each_sg(sgl, sg, data->dma_nents, i)
iser_warn("sg[%d] dma_addr:0x%lX page:0x%p "
"off:0x%x sz:0x%x dma_len:0x%x\n",
i, (unsigned long)ib_sg_dma_address(ibdev, sg),
sg_page(sg), sg->offset,
sg->length, ib_sg_dma_len(ibdev, sg));
}
static void iser_dump_page_vec(struct iser_page_vec *page_vec)
{
int i;
iser_err("page vec length %d data size %d\n",
page_vec->length, page_vec->data_size);
for (i = 0; i < page_vec->length; i++)
iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]);
}
static void iser_page_vec_build(struct iser_data_buf *data,
struct iser_page_vec *page_vec,
struct ib_device *ibdev)
{
int page_vec_len = 0;
page_vec->length = 0;
page_vec->offset = 0;
iser_dbg("Translating sg sz: %d\n", data->dma_nents);
page_vec_len = iser_sg_to_page_vec(data, page_vec, ibdev);
iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents,page_vec_len);
page_vec->length = page_vec_len;
if (page_vec_len * SIZE_4K < page_vec->data_size) {
iser_err("page_vec too short to hold this SG\n");
iser_data_buf_dump(data, ibdev);
iser_dump_page_vec(page_vec);
BUG();
}
}
int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
struct iser_data_buf *data,
enum iser_data_dir iser_dir,
enum dma_data_direction dma_dir)
{
struct ib_device *dev;
iser_task->dir[iser_dir] = 1;
dev = iser_task->iser_conn->ib_conn->device->ib_device;
data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir);
if (data->dma_nents == 0) {
iser_err("dma_map_sg failed!!!\n");
return -EINVAL;
}
return 0;
}
void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task)
{
struct ib_device *dev;
struct iser_data_buf *data;
dev = iser_task->iser_conn->ib_conn->device->ib_device;
if (iser_task->dir[ISER_DIR_IN]) {
data = &iser_task->data[ISER_DIR_IN];
ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE);
}
if (iser_task->dir[ISER_DIR_OUT]) {
data = &iser_task->data[ISER_DIR_OUT];
ib_dma_unmap_sg(dev, data->buf, data->size, DMA_TO_DEVICE);
}
}
/**
* iser_reg_rdma_mem - Registers memory intended for RDMA,
* obtaining rkey and va
*
* returns 0 on success, errno code on failure
*/
int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task,
enum iser_data_dir cmd_dir)
{
struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn;
struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn;
struct iser_device *device = ib_conn->device;
struct ib_device *ibdev = device->ib_device;
struct iser_data_buf *mem = &iser_task->data[cmd_dir];
struct iser_regd_buf *regd_buf;
int aligned_len;
int err;
int i;
struct scatterlist *sg;
regd_buf = &iser_task->rdma_regd[cmd_dir];
aligned_len = iser_data_buf_aligned_len(mem, ibdev);
if (aligned_len != mem->dma_nents) {
iscsi_conn->fmr_unalign_cnt++;
iser_warn("rdma alignment violation %d/%d aligned\n",
aligned_len, mem->size);
iser_data_buf_dump(mem, ibdev);
/* unmap the command data before accessing it */
iser_dma_unmap_task_data(iser_task);
/* allocate copy buf, if we are writing, copy the */
/* unaligned scatterlist, dma map the copy */
if (iser_start_rdma_unaligned_sg(iser_task, cmd_dir) != 0)
return -ENOMEM;
mem = &iser_task->data_copy[cmd_dir];
}
/* if there a single dma entry, FMR is not needed */
if (mem->dma_nents == 1) {
sg = (struct scatterlist *)mem->buf;
regd_buf->reg.lkey = device->mr->lkey;
regd_buf->reg.rkey = device->mr->rkey;
regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]);
regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]);
regd_buf->reg.is_fmr = 0;
iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X "
"va: 0x%08lX sz: %ld]\n",
(unsigned int)regd_buf->reg.lkey,
(unsigned int)regd_buf->reg.rkey,
(unsigned long)regd_buf->reg.va,
(unsigned long)regd_buf->reg.len);
} else { /* use FMR for multiple dma entries */
iser_page_vec_build(mem, ib_conn->page_vec, ibdev);
err = iser_reg_page_vec(ib_conn, ib_conn->page_vec, &regd_buf->reg);
if (err) {
iser_data_buf_dump(mem, ibdev);
iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
mem->dma_nents,
ntoh24(iser_task->desc.iscsi_header.dlength));
iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
ib_conn->page_vec->data_size, ib_conn->page_vec->length,
ib_conn->page_vec->offset);
for (i=0 ; i<ib_conn->page_vec->length ; i++)
iser_err("page_vec[%d] = 0x%llx\n", i,
(unsigned long long) ib_conn->page_vec->pages[i]);
return err;
}
}
return 0;
}
@@ -0,0 +1,868 @@
/*
* Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
* Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include "iscsi_iser.h"
#define ISCSI_ISER_MAX_CONN 8
#define ISER_MAX_RX_CQ_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
#define ISER_MAX_TX_CQ_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN)
static void iser_cq_tasklet_fn(unsigned long data);
static void iser_cq_callback(struct ib_cq *cq, void *cq_context);
static void iser_cq_event_callback(struct ib_event *cause, void *context)
{
iser_err("got cq event %d \n", cause->event);
}
static void iser_qp_event_callback(struct ib_event *cause, void *context)
{
iser_err("got qp event %d\n",cause->event);
}
static void iser_event_handler(struct ib_event_handler *handler,
struct ib_event *event)
{
iser_err("async event %d on device %s port %d\n", event->event,
event->device->name, event->element.port_num);
}
/**
* iser_create_device_ib_res - creates Protection Domain (PD), Completion
* Queue (CQ), DMA Memory Region (DMA MR) with the device associated with
* the adapator.
*
* returns 0 on success, -1 on failure
*/
static int iser_create_device_ib_res(struct iser_device *device)
{
device->pd = ib_alloc_pd(device->ib_device);
if (IS_ERR(device->pd))
goto pd_err;
device->rx_cq = ib_create_cq(device->ib_device,
iser_cq_callback,
iser_cq_event_callback,
(void *)device,
ISER_MAX_RX_CQ_LEN, 0);
if (IS_ERR(device->rx_cq))
goto rx_cq_err;
device->tx_cq = ib_create_cq(device->ib_device,
NULL, iser_cq_event_callback,
(void *)device,
ISER_MAX_TX_CQ_LEN, 0);
if (IS_ERR(device->tx_cq))
goto tx_cq_err;
if (ib_req_notify_cq(device->rx_cq, IB_CQ_NEXT_COMP))
goto cq_arm_err;
tasklet_init(&device->cq_tasklet,
iser_cq_tasklet_fn,
(unsigned long)device);
device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_REMOTE_READ);
if (IS_ERR(device->mr))
goto dma_mr_err;
INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
iser_event_handler);
if (ib_register_event_handler(&device->event_handler))
goto handler_err;
return 0;
handler_err:
ib_dereg_mr(device->mr);
dma_mr_err:
tasklet_kill(&device->cq_tasklet);
cq_arm_err:
ib_destroy_cq(device->tx_cq);
tx_cq_err:
ib_destroy_cq(device->rx_cq);
rx_cq_err:
ib_dealloc_pd(device->pd);
pd_err:
iser_err("failed to allocate an IB resource\n");
return -1;
}
/**
* iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR,
* CQ and PD created with the device associated with the adapator.
*/
static void iser_free_device_ib_res(struct iser_device *device)
{
BUG_ON(device->mr == NULL);
tasklet_kill(&device->cq_tasklet);
(void)ib_unregister_event_handler(&device->event_handler);
(void)ib_dereg_mr(device->mr);
(void)ib_destroy_cq(device->tx_cq);
(void)ib_destroy_cq(device->rx_cq);
(void)ib_dealloc_pd(device->pd);
device->mr = NULL;
device->tx_cq = NULL;
device->rx_cq = NULL;
device->pd = NULL;
}
/**
* iser_create_ib_conn_res - Creates FMR pool and Queue-Pair (QP)
*
* returns 0 on success, -1 on failure
*/
static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
{
struct iser_device *device;
struct ib_qp_init_attr init_attr;
int req_err, resp_err, ret = -ENOMEM;
struct ib_fmr_pool_param params;
BUG_ON(ib_conn->device == NULL);
device = ib_conn->device;
ib_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN +
ISER_RX_LOGIN_SIZE, GFP_KERNEL);
if (!ib_conn->login_buf)
goto out_err;
ib_conn->login_req_buf = ib_conn->login_buf;
ib_conn->login_resp_buf = ib_conn->login_buf + ISCSI_DEF_MAX_RECV_SEG_LEN;
ib_conn->login_req_dma = ib_dma_map_single(ib_conn->device->ib_device,
(void *)ib_conn->login_req_buf,
ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
ib_conn->login_resp_dma = ib_dma_map_single(ib_conn->device->ib_device,
(void *)ib_conn->login_resp_buf,
ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
req_err = ib_dma_mapping_error(device->ib_device, ib_conn->login_req_dma);
resp_err = ib_dma_mapping_error(device->ib_device, ib_conn->login_resp_dma);
if (req_err || resp_err) {
if (req_err)
ib_conn->login_req_dma = 0;
if (resp_err)
ib_conn->login_resp_dma = 0;
goto out_err;
}
ib_conn->page_vec = kmalloc(sizeof(struct iser_page_vec) +
(sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE +1)),
GFP_KERNEL);
if (!ib_conn->page_vec)
goto out_err;
ib_conn->page_vec->pages = (u64 *) (ib_conn->page_vec + 1);
params.page_shift = SHIFT_4K;
/* when the first/last SG element are not start/end *
* page aligned, the map whould be of N+1 pages */
params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1;
/* make the pool size twice the max number of SCSI commands *
* the ML is expected to queue, watermark for unmap at 50% */
params.pool_size = ISCSI_DEF_XMIT_CMDS_MAX * 2;
params.dirty_watermark = ISCSI_DEF_XMIT_CMDS_MAX;
params.cache = 0;
params.flush_function = NULL;
params.access = (IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_REMOTE_READ);
ib_conn->fmr_pool = ib_create_fmr_pool(device->pd, &params);
if (IS_ERR(ib_conn->fmr_pool)) {
ret = PTR_ERR(ib_conn->fmr_pool);
ib_conn->fmr_pool = NULL;
goto out_err;
}
memset(&init_attr, 0, sizeof init_attr);
init_attr.event_handler = iser_qp_event_callback;
init_attr.qp_context = (void *)ib_conn;
init_attr.send_cq = device->tx_cq;
init_attr.recv_cq = device->rx_cq;
init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS;
init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS;
init_attr.cap.max_send_sge = 2;
init_attr.cap.max_recv_sge = 1;
init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
init_attr.qp_type = IB_QPT_RC;
ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
if (ret)
goto out_err;
ib_conn->qp = ib_conn->cma_id->qp;
iser_err("setting conn %p cma_id %p: fmr_pool %p qp %p\n",
ib_conn, ib_conn->cma_id,
ib_conn->fmr_pool, ib_conn->cma_id->qp);
return ret;
out_err:
iser_err("unable to alloc mem or create resource, err %d\n", ret);
return ret;
}
/**
* releases the FMR pool, QP and CMA ID objects, returns 0 on success,
* -1 on failure
*/
static int iser_free_ib_conn_res(struct iser_conn *ib_conn, int can_destroy_id)
{
BUG_ON(ib_conn == NULL);
iser_err("freeing conn %p cma_id %p fmr pool %p qp %p\n",
ib_conn, ib_conn->cma_id,
ib_conn->fmr_pool, ib_conn->qp);
/* qp is created only once both addr & route are resolved */
if (ib_conn->fmr_pool != NULL)
ib_destroy_fmr_pool(ib_conn->fmr_pool);
if (ib_conn->qp != NULL)
rdma_destroy_qp(ib_conn->cma_id);
/* if cma handler context, the caller acts s.t the cma destroy the id */
if (ib_conn->cma_id != NULL && can_destroy_id)
rdma_destroy_id(ib_conn->cma_id);
ib_conn->fmr_pool = NULL;
ib_conn->qp = NULL;
ib_conn->cma_id = NULL;
kfree(ib_conn->page_vec);
if (ib_conn->login_buf) {
if (ib_conn->login_req_dma)
ib_dma_unmap_single(ib_conn->device->ib_device,
ib_conn->login_req_dma,
ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
if (ib_conn->login_resp_dma)
ib_dma_unmap_single(ib_conn->device->ib_device,
ib_conn->login_resp_dma,
ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
kfree(ib_conn->login_buf);
}
return 0;
}
/**
* based on the resolved device node GUID see if there already allocated
* device for this device. If there's no such, create one.
*/
static
struct iser_device *iser_device_find_by_ib_device(struct rdma_cm_id *cma_id)
{
struct iser_device *device;
mutex_lock(&ig.device_list_mutex);
list_for_each_entry(device, &ig.device_list, ig_list)
/* find if there's a match using the node GUID */
if (device->ib_device->node_guid == cma_id->device->node_guid)
goto inc_refcnt;
device = kzalloc(sizeof *device, GFP_KERNEL);
if (device == NULL)
goto out;
/* assign this device to the device */
device->ib_device = cma_id->device;
/* init the device and link it into ig device list */
if (iser_create_device_ib_res(device)) {
kfree(device);
device = NULL;
goto out;
}
list_add(&device->ig_list, &ig.device_list);
inc_refcnt:
device->refcount++;
out:
mutex_unlock(&ig.device_list_mutex);
return device;
}
/* if there's no demand for this device, release it */
static void iser_device_try_release(struct iser_device *device)
{
mutex_lock(&ig.device_list_mutex);
device->refcount--;
iser_err("device %p refcount %d\n",device,device->refcount);
if (!device->refcount) {
iser_free_device_ib_res(device);
list_del(&device->ig_list);
kfree(device);
}
mutex_unlock(&ig.device_list_mutex);
}
static int iser_conn_state_comp_exch(struct iser_conn *ib_conn,
enum iser_ib_conn_state comp,
enum iser_ib_conn_state exch)
{
int ret;
spin_lock_bh(&ib_conn->lock);
if ((ret = (ib_conn->state == comp)))
ib_conn->state = exch;
spin_unlock_bh(&ib_conn->lock);
return ret;
}
/**
* Frees all conn objects and deallocs conn descriptor
*/
static void iser_conn_release(struct iser_conn *ib_conn, int can_destroy_id)
{
struct iser_device *device = ib_conn->device;
BUG_ON(ib_conn->state != ISER_CONN_DOWN);
mutex_lock(&ig.connlist_mutex);
list_del(&ib_conn->conn_list);
mutex_unlock(&ig.connlist_mutex);
iser_free_rx_descriptors(ib_conn);
iser_free_ib_conn_res(ib_conn, can_destroy_id);
ib_conn->device = NULL;
/* on EVENT_ADDR_ERROR there's no device yet for this conn */
if (device != NULL)
iser_device_try_release(device);
iscsi_destroy_endpoint(ib_conn->ep);
}
void iser_conn_get(struct iser_conn *ib_conn)
{
atomic_inc(&ib_conn->refcount);
}
int iser_conn_put(struct iser_conn *ib_conn, int can_destroy_id)
{
if (atomic_dec_and_test(&ib_conn->refcount)) {
iser_conn_release(ib_conn, can_destroy_id);
return 1;
}
return 0;
}
/**
* triggers start of the disconnect procedures and wait for them to be done
*/
void iser_conn_terminate(struct iser_conn *ib_conn)
{
int err = 0;
/* change the ib conn state only if the conn is UP, however always call
* rdma_disconnect since this is the only way to cause the CMA to change
* the QP state to ERROR
*/
iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, ISER_CONN_TERMINATING);
err = rdma_disconnect(ib_conn->cma_id);
if (err)
iser_err("Failed to disconnect, conn: 0x%p err %d\n",
ib_conn,err);
wait_event_interruptible(ib_conn->wait,
ib_conn->state == ISER_CONN_DOWN);
iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */
}
static int iser_connect_error(struct rdma_cm_id *cma_id)
{
struct iser_conn *ib_conn;
ib_conn = (struct iser_conn *)cma_id->context;
ib_conn->state = ISER_CONN_DOWN;
wake_up_interruptible(&ib_conn->wait);
return iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */
}
static int iser_addr_handler(struct rdma_cm_id *cma_id)
{
struct iser_device *device;
struct iser_conn *ib_conn;
int ret;
device = iser_device_find_by_ib_device(cma_id);
if (!device) {
iser_err("device lookup/creation failed\n");
return iser_connect_error(cma_id);
}
ib_conn = (struct iser_conn *)cma_id->context;
ib_conn->device = device;
ret = rdma_resolve_route(cma_id, 1000);
if (ret) {
iser_err("resolve route failed: %d\n", ret);
return iser_connect_error(cma_id);
}
return 0;
}
static int iser_route_handler(struct rdma_cm_id *cma_id)
{
struct rdma_conn_param conn_param;
int ret;
ret = iser_create_ib_conn_res((struct iser_conn *)cma_id->context);
if (ret)
goto failure;
memset(&conn_param, 0, sizeof conn_param);
conn_param.responder_resources = 4;
conn_param.initiator_depth = 1;
conn_param.retry_count = 7;
conn_param.rnr_retry_count = 6;
ret = rdma_connect(cma_id, &conn_param);
if (ret) {
iser_err("failure connecting: %d\n", ret);
goto failure;
}
return 0;
failure:
return iser_connect_error(cma_id);
}
static void iser_connected_handler(struct rdma_cm_id *cma_id)
{
struct iser_conn *ib_conn;
ib_conn = (struct iser_conn *)cma_id->context;
ib_conn->state = ISER_CONN_UP;
wake_up_interruptible(&ib_conn->wait);
}
static int iser_disconnected_handler(struct rdma_cm_id *cma_id)
{
struct iser_conn *ib_conn;
int ret;
ib_conn = (struct iser_conn *)cma_id->context;
/* getting here when the state is UP means that the conn is being *
* terminated asynchronously from the iSCSI layer's perspective. */
if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP,
ISER_CONN_TERMINATING))
iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn,
ISCSI_ERR_CONN_FAILED);
/* Complete the termination process if no posts are pending */
if (ib_conn->post_recv_buf_count == 0 &&
(atomic_read(&ib_conn->post_send_buf_count) == 0)) {
ib_conn->state = ISER_CONN_DOWN;
wake_up_interruptible(&ib_conn->wait);
}
ret = iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */
return ret;
}
static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
{
int ret = 0;
iser_err("event %d status %d conn %p id %p\n",
event->event, event->status, cma_id->context, cma_id);
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
ret = iser_addr_handler(cma_id);
break;
case RDMA_CM_EVENT_ROUTE_RESOLVED:
ret = iser_route_handler(cma_id);
break;
case RDMA_CM_EVENT_ESTABLISHED:
iser_connected_handler(cma_id);
break;
case RDMA_CM_EVENT_ADDR_ERROR:
case RDMA_CM_EVENT_ROUTE_ERROR:
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
case RDMA_CM_EVENT_REJECTED:
ret = iser_connect_error(cma_id);
break;
case RDMA_CM_EVENT_DISCONNECTED:
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_ADDR_CHANGE:
ret = iser_disconnected_handler(cma_id);
break;
default:
iser_err("Unexpected RDMA CM event (%d)\n", event->event);
break;
}
return ret;
}
void iser_conn_init(struct iser_conn *ib_conn)
{
ib_conn->state = ISER_CONN_INIT;
init_waitqueue_head(&ib_conn->wait);
ib_conn->post_recv_buf_count = 0;
atomic_set(&ib_conn->post_send_buf_count, 0);
atomic_set(&ib_conn->refcount, 1); /* ref ib conn allocation */
INIT_LIST_HEAD(&ib_conn->conn_list);
spin_lock_init(&ib_conn->lock);
}
/**
* starts the process of connecting to the target
* sleeps until the connection is established or rejected
*/
int iser_connect(struct iser_conn *ib_conn,
struct sockaddr_in *src_addr,
struct sockaddr_in *dst_addr,
int non_blocking)
{
struct sockaddr *src, *dst;
int err = 0;
sprintf(ib_conn->name, "%pI4:%d",
&dst_addr->sin_addr.s_addr, dst_addr->sin_port);
/* the device is known only --after-- address resolution */
ib_conn->device = NULL;
iser_err("connecting to: %pI4, port 0x%x\n",
&dst_addr->sin_addr, dst_addr->sin_port);
ib_conn->state = ISER_CONN_PENDING;
iser_conn_get(ib_conn); /* ref ib conn's cma id */
ib_conn->cma_id = rdma_create_id(iser_cma_handler,
(void *)ib_conn,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(ib_conn->cma_id)) {
err = PTR_ERR(ib_conn->cma_id);
iser_err("rdma_create_id failed: %d\n", err);
goto id_failure;
}
src = (struct sockaddr *)src_addr;
dst = (struct sockaddr *)dst_addr;
err = rdma_resolve_addr(ib_conn->cma_id, src, dst, 1000);
if (err) {
iser_err("rdma_resolve_addr failed: %d\n", err);
goto addr_failure;
}
if (!non_blocking) {
wait_event_interruptible(ib_conn->wait,
(ib_conn->state != ISER_CONN_PENDING));
if (ib_conn->state != ISER_CONN_UP) {
err = -EIO;
goto connect_failure;
}
}
mutex_lock(&ig.connlist_mutex);
list_add(&ib_conn->conn_list, &ig.connlist);
mutex_unlock(&ig.connlist_mutex);
return 0;
id_failure:
ib_conn->cma_id = NULL;
addr_failure:
ib_conn->state = ISER_CONN_DOWN;
connect_failure:
iser_conn_release(ib_conn, 1);
return err;
}
/**
* iser_reg_page_vec - Register physical memory
*
* returns: 0 on success, errno code on failure
*/
int iser_reg_page_vec(struct iser_conn *ib_conn,
struct iser_page_vec *page_vec,
struct iser_mem_reg *mem_reg)
{
struct ib_pool_fmr *mem;
u64 io_addr;
u64 *page_list;
int status;
page_list = page_vec->pages;
io_addr = page_list[0];
mem = ib_fmr_pool_map_phys(ib_conn->fmr_pool,
page_list,
page_vec->length,
io_addr);
if (IS_ERR(mem)) {
status = (int)PTR_ERR(mem);
iser_err("ib_fmr_pool_map_phys failed: %d\n", status);
return status;
}
mem_reg->lkey = mem->fmr->lkey;
mem_reg->rkey = mem->fmr->rkey;
mem_reg->len = page_vec->length * SIZE_4K;
mem_reg->va = io_addr;
mem_reg->is_fmr = 1;
mem_reg->mem_h = (void *)mem;
mem_reg->va += page_vec->offset;
mem_reg->len = page_vec->data_size;
iser_dbg("PHYSICAL Mem.register, [PHYS p_array: 0x%p, sz: %d, "
"entry[0]: (0x%08lx,%ld)] -> "
"[lkey: 0x%08X mem_h: 0x%p va: 0x%08lX sz: %ld]\n",
page_vec, page_vec->length,
(unsigned long)page_vec->pages[0],
(unsigned long)page_vec->data_size,
(unsigned int)mem_reg->lkey, mem_reg->mem_h,
(unsigned long)mem_reg->va, (unsigned long)mem_reg->len);
return 0;
}
/**
* Unregister (previosuly registered) memory.
*/
void iser_unreg_mem(struct iser_mem_reg *reg)
{
int ret;
iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h);
ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h);
if (ret)
iser_err("ib_fmr_pool_unmap failed %d\n", ret);
reg->mem_h = NULL;
}
int iser_post_recvl(struct iser_conn *ib_conn)
{
struct ib_recv_wr rx_wr, *rx_wr_failed;
struct ib_sge sge;
int ib_ret;
sge.addr = ib_conn->login_resp_dma;
sge.length = ISER_RX_LOGIN_SIZE;
sge.lkey = ib_conn->device->mr->lkey;
rx_wr.wr_id = (unsigned long)ib_conn->login_resp_buf;
rx_wr.sg_list = &sge;
rx_wr.num_sge = 1;
rx_wr.next = NULL;
ib_conn->post_recv_buf_count++;
ib_ret = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
if (ib_ret) {
iser_err("ib_post_recv failed ret=%d\n", ib_ret);
ib_conn->post_recv_buf_count--;
}
return ib_ret;
}
int iser_post_recvm(struct iser_conn *ib_conn, int count)
{
struct ib_recv_wr *rx_wr, *rx_wr_failed;
int i, ib_ret;
unsigned int my_rx_head = ib_conn->rx_desc_head;
struct iser_rx_desc *rx_desc;
for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
rx_desc = &ib_conn->rx_descs[my_rx_head];
rx_wr->wr_id = (unsigned long)rx_desc;
rx_wr->sg_list = &rx_desc->rx_sg;
rx_wr->num_sge = 1;
rx_wr->next = rx_wr + 1;
my_rx_head = (my_rx_head + 1) & (ISER_QP_MAX_RECV_DTOS - 1);
}
rx_wr--;
rx_wr->next = NULL; /* mark end of work requests list */
ib_conn->post_recv_buf_count += count;
ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
if (ib_ret) {
iser_err("ib_post_recv failed ret=%d\n", ib_ret);
ib_conn->post_recv_buf_count -= count;
} else
ib_conn->rx_desc_head = my_rx_head;
return ib_ret;
}
/**
* iser_start_send - Initiate a Send DTO operation
*
* returns 0 on success, -1 on failure
*/
int iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc)
{
int ib_ret;
struct ib_send_wr send_wr, *send_wr_failed;
ib_dma_sync_single_for_device(ib_conn->device->ib_device,
tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
send_wr.next = NULL;
send_wr.wr_id = (unsigned long)tx_desc;
send_wr.sg_list = tx_desc->tx_sg;
send_wr.num_sge = tx_desc->num_sge;
send_wr.opcode = IB_WR_SEND;
send_wr.send_flags = IB_SEND_SIGNALED;
atomic_inc(&ib_conn->post_send_buf_count);
ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
if (ib_ret) {
iser_err("ib_post_send failed, ret:%d\n", ib_ret);
atomic_dec(&ib_conn->post_send_buf_count);
}
return ib_ret;
}
static void iser_handle_comp_error(struct iser_tx_desc *desc,
struct iser_conn *ib_conn)
{
if (desc && desc->type == ISCSI_TX_DATAOUT)
kmem_cache_free(ig.desc_cache, desc);
if (ib_conn->post_recv_buf_count == 0 &&
atomic_read(&ib_conn->post_send_buf_count) == 0) {
/* getting here when the state is UP means that the conn is *
* being terminated asynchronously from the iSCSI layer's *
* perspective. */
if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP,
ISER_CONN_TERMINATING))
iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn,
ISCSI_ERR_CONN_FAILED);
/* no more non completed posts to the QP, complete the
* termination process w.o worrying on disconnect event */
ib_conn->state = ISER_CONN_DOWN;
wake_up_interruptible(&ib_conn->wait);
}
}
static int iser_drain_tx_cq(struct iser_device *device)
{
struct ib_cq *cq = device->tx_cq;
struct ib_wc wc;
struct iser_tx_desc *tx_desc;
struct iser_conn *ib_conn;
int completed_tx = 0;
while (ib_poll_cq(cq, 1, &wc) == 1) {
tx_desc = (struct iser_tx_desc *) (unsigned long) wc.wr_id;
ib_conn = wc.qp->qp_context;
if (wc.status == IB_WC_SUCCESS) {
if (wc.opcode == IB_WC_SEND)
iser_snd_completion(tx_desc, ib_conn);
else
iser_err("expected opcode %d got %d\n",
IB_WC_SEND, wc.opcode);
} else {
iser_err("tx id %llx status %d vend_err %x\n",
wc.wr_id, wc.status, wc.vendor_err);
atomic_dec(&ib_conn->post_send_buf_count);
iser_handle_comp_error(tx_desc, ib_conn);
}
completed_tx++;
}
return completed_tx;
}
static void iser_cq_tasklet_fn(unsigned long data)
{
struct iser_device *device = (struct iser_device *)data;
struct ib_cq *cq = device->rx_cq;
struct ib_wc wc;
struct iser_rx_desc *desc;
unsigned long xfer_len;
struct iser_conn *ib_conn;
int completed_tx, completed_rx;
completed_tx = completed_rx = 0;
while (ib_poll_cq(cq, 1, &wc) == 1) {
desc = (struct iser_rx_desc *) (unsigned long) wc.wr_id;
BUG_ON(desc == NULL);
ib_conn = wc.qp->qp_context;
if (wc.status == IB_WC_SUCCESS) {
if (wc.opcode == IB_WC_RECV) {
xfer_len = (unsigned long)wc.byte_len;
iser_rcv_completion(desc, xfer_len, ib_conn);
} else
iser_err("expected opcode %d got %d\n",
IB_WC_RECV, wc.opcode);
} else {
if (wc.status != IB_WC_WR_FLUSH_ERR)
iser_err("rx id %llx status %d vend_err %x\n",
wc.wr_id, wc.status, wc.vendor_err);
ib_conn->post_recv_buf_count--;
iser_handle_comp_error(NULL, ib_conn);
}
completed_rx++;
if (!(completed_rx & 63))
completed_tx += iser_drain_tx_cq(device);
}
/* #warning "it is assumed here that arming CQ only once its empty" *
* " would not cause interrupts to be missed" */
ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
completed_tx += iser_drain_tx_cq(device);
iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx);
}
static void iser_cq_callback(struct ib_cq *cq, void *cq_context)
{
struct iser_device *device = (struct iser_device *)cq_context;
tasklet_schedule(&device->cq_tasklet);
}