From 68ce3363aa8500e666cd5a748deb5cac9100ff24 Mon Sep 17 00:00:00 2001 From: wanghonghao Date: Tue, 3 Dec 2019 15:32:41 +0800 Subject: [PATCH] iser: dynamic memory region allocator Implement an allocator for allocating memory regions of different lengths. The allocator registers 4MB memory chunks as memory regions, and selects a free segment from one of them each time. 4KB is the minimum allocation unit, and free segments in the same chunk can be merged into a larger free segment by the rule of buddy allocation. As a result, the size of an allocated segment is always a power of 2; this may waste some space but produces fewer fragments. In each chunk, a complete binary tree (which is actually an array) is used to maintain free segments. Each node records the order of the largest segment that can be allocated from its subtree. Here's a miniature example. A chunk with all segments free: level 4 4(0x1) level 3 3(0x2) 3(0x3) level 2 2(0x4) 2(0x5) 2(0x6) 2(0x7) level 1 1(0x8) 1(0x9) 1(0xa) 1(0xb) 1(0xc) 1(0xd) 1(0xe) 1(0xf) After allocating a 16KB (order=3) memory region: level 4 3(0x1) level 3 0(0x2) 3(0x3) level 2 2(0x4) 2(0x5) 2(0x6) 2(0x7) level 1 1(0x8) 1(0x9) 1(0xa) 1(0xb) 1(0xc) 1(0xd) 1(0xe) 1(0xf) It takes 1 comparison to determine whether a chunk can satisfy a request and at most 11 loops to find the leftmost free segment that meets the requirements. The value of each node is not more than 11, and an 8-bit integer is enough to store it, so only 2048 bytes are required for each tree. And since the entire tree is in a contiguous piece of memory and no rotations are needed, it's far more efficient than self-balancing trees of the same size. Different 4MB chunks are linked as a list, and the selection order is from head to tail each time. If no existing chunk can satisfy the allocation, the allocator will register another 4MB chunk and add it to the tail. 
+---------+ +---------+ +---------+ |4MB chunk| ---> |4MB chunk| ---> |4MB chunk| +---------+ +---------+ +---------+ In most cases, smaller IOs can get memory regions from the first or second chunk without traversing much of the list, and if we really send a lot of large IOs, the cost of the traversal is rarely critical. Finally, a chunk can only provide memory regions of at most 4MB; if a larger memory region is needed, the allocator registers/deregisters a dedicated memory region directly, bypassing the chunk buffers. Signed-off-by: wanghonghao --- include/iser-private.h | 19 ++- lib/iser.c | 355 +++++++++++++++++++++++++++++------------ 2 files changed, 263 insertions(+), 111 deletions(-) diff --git a/include/iser-private.h b/include/iser-private.h index b943d34..07c119e 100644 --- a/include/iser-private.h +++ b/include/iser-private.h @@ -44,8 +44,12 @@ #define ISER_WSV 0x08 #define ISER_RSV 0x04 -#define NUM_MRS 0x100 -#define DATA_BUFFER_SIZE 0x40000 +#define DATA_BUFFER_UNIT_SHIFT SHIFT_4K +#define DATA_BUFFER_UNIT_SIZE SIZE_4K +#define DATA_BUFFER_CHUNK_SHIFT 22 +#define DATA_BUFFER_CHUNK_SIZE (1ULL << DATA_BUFFER_CHUNK_SHIFT) +#define DATA_BUFFER_CHUNK_UNITS_SHIFT (DATA_BUFFER_CHUNK_SHIFT - DATA_BUFFER_UNIT_SHIFT) +#define DATA_BUFFER_CHUNK_UNITS (1 << DATA_BUFFER_CHUNK_UNITS_SHIFT) #define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + ISCSI_RAW_HEADER_SIZE) @@ -165,6 +169,13 @@ struct iser_pdu { struct iser_tx_desc *desc; }; +struct iser_buf_chunk { + unsigned char *buf; + struct ibv_mr *mr; + struct iser_buf_chunk *next; + int8_t tree[DATA_BUFFER_CHUNK_UNITS << 1]; +}; + struct iser_conn { struct rdma_cm_id *cma_id; struct rdma_event_channel *cma_channel; @@ -180,9 +191,6 @@ struct iser_conn { sem_t sem_connect; struct ibv_mr *login_resp_mr; - struct ibv_mr *login_req_mr; - unsigned char *login_buf; - unsigned char *login_req_buf; unsigned char *login_resp_buf; pthread_t cmthread; @@ -199,6 +207,7 @@ struct iser_conn { enum conn_state conn_state; 
struct iser_tx_desc *tx_desc; + struct iser_buf_chunk *buf_chunk; }; void iscsi_init_iser_transport(struct iscsi_context *iscsi); diff --git a/lib/iser.c b/lib/iser.c index f3baf4c..46116f1 100644 --- a/lib/iser.c +++ b/lib/iser.c @@ -123,6 +123,210 @@ iscsi_iser_service(struct iscsi_context *iscsi, int revents) return iscsi_iser_revive_queued_pdus(iscsi); } +static inline int +fls(int x) /* 1-based position of the highest set bit of x; 0 when x is 0 */ +{ + if (!x) + return 0; + + return sizeof(int) * 8 - __builtin_clz(x); +} + +static inline void +iser_buf_chunk_tree_propagate(int8_t *tree, int pos, int level) { /* after tree[pos] (a node at `level`) changed, recompute its ancestors towards the root */ + int value; + + for (pos >>= 1, level++; pos; pos >>= 1, level++) { + if (tree[pos << 1] == level - 1) { /* left child is entirely free */ + if (tree[(pos << 1) | 1] == level - 1) { + value = level; /* both children entirely free: they merge into one free segment of this level */ + } else { + value = level - 1; + } + } else { + value = (tree[pos << 1] > tree[(pos << 1) | 1] ? + tree[pos << 1] : tree[(pos << 1) | 1]); /* otherwise the larger of the two children */ + } + + if (value == tree[pos]) /* unchanged here, so no ancestor can change either */ + break; + + tree[pos] = value; + } +} + +static inline void +iser_buf_chunk_free(struct iser_buf_chunk *chunk, void *ptr) { /* give the segment that starts at ptr back to chunk's tree */ + int8_t *tree = chunk->tree; + int unit = ((unsigned char *)ptr - chunk->buf) >> DATA_BUFFER_UNIT_SHIFT; + int pos = unit + DATA_BUFFER_CHUNK_UNITS; /* leaf node of the segment's first unit */ + int level = 1; + + for (; tree[pos]; pos >>= 1, level++) { /* climb to the first node holding 0, the value iser_buf_chunk_alloc() stores in the node it hands out */ + } + + tree[pos] = level; /* a node whose value equals its level is entirely free */ + iser_buf_chunk_tree_propagate(tree, pos, level); +} + +static inline void * +iser_buf_chunk_alloc(struct iser_buf_chunk *chunk, int want) { /* take a free segment of order `want` from chunk; NULL when the chunk has none */ + int8_t *tree = chunk->tree; + int pos, level, unit; + void *result; + + /* can this chunk satisfy the request? the root holds the largest free order */ + if (tree[1] < want) { + return NULL; + } + + /* descend from the root to level `want`, preferring the left child */ + for (pos = 1, level = DATA_BUFFER_CHUNK_UNITS_SHIFT + 1; want < level; level--) { + pos = pos << 1; + pos = tree[pos] >= want ? 
pos : (pos | 1); + } + + /* mark the chosen node as allocated */ + tree[pos] = 0; + unit = (pos << (level - 1)) - DATA_BUFFER_CHUNK_UNITS; /* index of the leftmost unit below pos */ + result = chunk->buf + (unit << DATA_BUFFER_UNIT_SHIFT); + + /* update the ancestors' largest-free-order values */ + iser_buf_chunk_tree_propagate(tree, pos, level); + + return result; +} + +static inline int +iser_buf_chunk_contains(struct iser_buf_chunk *chunk, void *ptr) { /* non-zero when ptr lies inside chunk's buffer; NOTE(review): no caller appears in the visible hunks */ + return ((unsigned char *)ptr >= chunk->buf && + (unsigned char *)ptr < chunk->buf + DATA_BUFFER_CHUNK_SIZE); +} + +static void +iser_tx_desc_free(struct iscsi_context *iscsi, struct iser_tx_desc *tx_desc) /* release tx_desc's data buffer, then hand the descriptor to ISCSI_LIST_ADD on the connection's tx_desc list */ +{ + struct iser_conn *iser_conn = iscsi->opaque; + struct iser_buf_chunk *chunk = iser_conn->buf_chunk; + + if (tx_desc->data_mr != NULL) { + for (; chunk != NULL; chunk = chunk->next) { + if (chunk->mr == tx_desc->data_mr) { /* the buffer was carved out of this chunk */ + iser_buf_chunk_free(chunk, tx_desc->data_buff); + break; + } + } + if (chunk == NULL) { /* no chunk owns the MR: it was registered individually, so deregister and free it */ + ibv_dereg_mr(tx_desc->data_mr); + iscsi_free(iscsi, tx_desc->data_buff); + } + } + + ISCSI_LIST_ADD(&iser_conn->tx_desc, tx_desc); +} + +static struct iser_tx_desc * +iser_tx_desc_alloc(struct iscsi_context *iscsi, size_t data_size) { /* get a tx descriptor plus a registered data buffer for data_size bytes (no buffer when data_size is 0); NULL on failure */ + struct iser_conn *iser_conn = iscsi->opaque; + struct iser_tx_desc *tx_desc = iser_conn->tx_desc; + struct iser_buf_chunk **buf_chunk = &iser_conn->buf_chunk; + struct iser_buf_chunk *chunk; + int want, i; + void *buf; + + if (tx_desc != NULL) { /* reuse the descriptor at the head of the list */ + ISCSI_LIST_REMOVE(&iser_conn->tx_desc, tx_desc); + } else { + tx_desc = iscsi_malloc(iscsi, sizeof(*tx_desc)); /* list is empty: create a descriptor and register its header area */ + if (tx_desc == NULL) { + iscsi_set_error(iscsi, "Out-Of-Memory, failed to allocate data buffer"); + return NULL; + } + + tx_desc->hdr_mr = ibv_reg_mr(iser_conn->pd, tx_desc, ISER_HEADERS_LEN, IBV_ACCESS_LOCAL_WRITE); + if (tx_desc->hdr_mr == NULL) { + iscsi_free(iscsi, tx_desc); + iscsi_set_error(iscsi, "Failed to register data mr"); + return NULL; + } + } + + if (data_size == 0) { + tx_desc->data_buff = NULL; + tx_desc->data_mr = NULL; + return tx_desc; + } else if (data_size > DATA_BUFFER_CHUNK_SIZE) { /* too large for a chunk: allocate and register a dedicated buffer */ + 
tx_desc->data_buff = iscsi_malloc(iscsi, data_size); + if (tx_desc->data_buff == NULL) { + iscsi_set_error(iscsi, "Out-Of-Memory, failed to allocate data buffer"); + goto release; + } + + tx_desc->data_mr = ibv_reg_mr(iser_conn->pd, tx_desc->data_buff, data_size, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE); + if (tx_desc->data_mr == NULL) { + iscsi_free(iscsi, tx_desc->data_buff); + iscsi_set_error(iscsi, "Failed to register data mr"); + goto release; + } + return tx_desc; + } + + want = fls((data_size * 2 - 1) / DATA_BUFFER_UNIT_SIZE / 2) + 1; /* order of the smallest power-of-two run of units that holds data_size (1 = a single unit) */ + + for (; *buf_chunk != NULL; buf_chunk = &(*buf_chunk)->next) { /* first fit, scanning the chunk list from its head */ + buf = iser_buf_chunk_alloc(*buf_chunk, want); + if (buf != NULL) { + tx_desc->data_buff = buf; + tx_desc->data_mr = (*buf_chunk)->mr; + return tx_desc; + } + } + + chunk = iscsi_malloc(iscsi, sizeof(struct iser_buf_chunk)); /* no existing chunk had room: create a new one */ + if (chunk == NULL) { + iscsi_set_error(iscsi, "Out-Of-Memory, failed to allocate data buffer"); + goto release; + } + + chunk->buf = iscsi_malloc(iscsi, DATA_BUFFER_CHUNK_SIZE); + if (chunk->buf == NULL) { /* test the buffer just allocated, not the chunk header already checked above */ + iscsi_set_error(iscsi, "Out-Of-Memory, failed to allocate data buffer"); + goto free_chunk; + } + + chunk->mr = ibv_reg_mr(iser_conn->pd, chunk->buf, DATA_BUFFER_CHUNK_SIZE, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE); + + if (chunk->mr == NULL) { + iscsi_set_error(iscsi, "Failed to register data mr"); + goto free_chunk_buf; + } + + for (i = 0; i < DATA_BUFFER_CHUNK_UNITS_SHIFT + 1; i++) { /* depth i (root at i = 0) has 1 << i nodes, each starting entirely free */ + memset(chunk->tree + (1 << i), + DATA_BUFFER_CHUNK_UNITS_SHIFT - i + 1, (1 << i)); + } + chunk->next = NULL; + *buf_chunk = chunk; /* buf_chunk points at the tail's next link once the search loop has finished */ + + tx_desc->data_buff = iser_buf_chunk_alloc(chunk, want); + tx_desc->data_mr = chunk->mr; + + return tx_desc; + +free_chunk_buf: + iscsi_free(iscsi, chunk->buf); + +free_chunk: + iscsi_free(iscsi, chunk); + +release: + ISCSI_LIST_ADD(&iser_conn->tx_desc, tx_desc); /* keep the descriptor for reuse even though no buffer was obtained */ + + return NULL; +} + /* * iser_free_rx_descriptors() - freeing descriptors memory * 
@iser_conn: ib connection context @@ -145,34 +349,16 @@ iser_free_rx_descriptors(struct iser_conn *iser_conn) return; } -/* - * iser_free_login_buf() - freeing login buffer - * @iser_conn: ib connection context - */ -static void -iser_free_login_buf(struct iser_conn *iser_conn) -{ - struct iscsi_context *iscsi = iser_conn->cma_id->context; - - iscsi_free(iscsi, iser_conn->login_buf); - iser_conn->login_buf = NULL; - - return; -} - static void iser_free_reg_mr(struct iser_conn *iser_conn) { struct iser_tx_desc *tx_desc = iser_conn->tx_desc; struct iscsi_context *iscsi = iser_conn->cma_id->context; struct iser_tx_desc *temp_tx_desc; + struct iser_buf_chunk *chunk, *temp_chunk; while (tx_desc) { ibv_dereg_mr(tx_desc->hdr_mr); - ibv_dereg_mr(tx_desc->data_mr); - - if (tx_desc->data_buff) - iscsi_free(iscsi, tx_desc->data_buff); temp_tx_desc = tx_desc; tx_desc = tx_desc->next; @@ -180,6 +366,16 @@ iser_free_reg_mr(struct iser_conn *iser_conn) } iser_conn->tx_desc = NULL; + for (chunk = iser_conn->buf_chunk; chunk; ) { + ibv_dereg_mr(chunk->mr); + iscsi_free(iscsi, chunk->buf); + + temp_chunk = chunk; + chunk = temp_chunk->next; + iscsi_free(iscsi, temp_chunk); + } + iser_conn->buf_chunk = NULL; + return; } @@ -207,9 +403,6 @@ iser_free_iser_conn_res(struct iser_conn *iser_conn, bool destroy) iser_free_reg_mr(iser_conn); - if (iser_conn->login_buf) - iser_free_login_buf(iser_conn); - if (iser_conn->rx_descs) { iser_free_rx_descriptors(iser_conn); iser_conn->rx_descs = NULL; @@ -221,6 +414,11 @@ iser_free_iser_conn_res(struct iser_conn *iser_conn, bool destroy) iscsi_set_error(iscsi, "Failed to deregister login response mr"); } + if (iser_conn->login_resp_buf) { + iscsi_free(iscsi, iser_conn->login_resp_buf); + iser_conn->login_resp_buf = NULL; + } + if (iser_conn->cq) { ret = ibv_destroy_cq(iser_conn->cq); if (ret) @@ -330,6 +528,11 @@ iscsi_iser_free_pdu(struct iscsi_context *iscsi, struct iscsi_pdu *pdu) iser_pdu = container_of(pdu, struct iser_pdu, iscsi_pdu); + if 
(iser_pdu->desc != NULL) { + iser_tx_desc_free(iscsi, iser_pdu->desc); + iser_pdu->desc = NULL; + } + if (pdu->outdata.size <= iscsi->smalloc_size) { iscsi_sfree(iscsi, pdu->outdata.data); } else { @@ -437,6 +640,15 @@ iser_post_send(struct iser_conn *iser_conn, struct iser_tx_desc *tx_desc, bool s return 0; } +static inline int +get_data_size(struct iser_pdu *iser_pdu) /* size handed to iser_tx_desc_alloc(): the task's expxferlen, or for a PDU without a task the outdata bytes that follow the raw header */ +{ + if (!iser_pdu->iscsi_pdu.scsi_cbdata.task) + return iser_pdu->iscsi_pdu.outdata.size - ISCSI_RAW_HEADER_SIZE; + + return iser_pdu->iscsi_pdu.scsi_cbdata.task->expxferlen; +} + /* * iser_send_control() - sending iscsi pdu of type CONTROL * @@ -465,19 +677,12 @@ iser_send_control(struct iser_conn *iser_conn, struct iser_pdu *iser_pdu) { char* data = (char*)&iser_pdu->iscsi_pdu.outdata.data[ISCSI_RAW_HEADER_SIZE]; struct ibv_sge *tx_dsg = &tx_desc->tx_sg[1]; - iser_conn->login_req_mr = ibv_reg_mr(iser_conn->pd, iser_conn->login_req_buf, - datalen , IBV_ACCESS_LOCAL_WRITE); - if (iser_conn->login_req_mr == NULL) { - iscsi_set_error(iscsi, "Failed Reg iser_conn->login_req_mr"); - return -1; - } + memcpy(tx_desc->data_buff, data, datalen); /* stage the payload in the descriptor's own data buffer, whose MR supplies the lkey below */ - memcpy(iser_conn->login_req_buf, data, datalen); - - tx_dsg->addr = (uintptr_t)iser_conn->login_req_buf; + tx_dsg->addr = (uintptr_t)tx_desc->data_buff; tx_dsg->length = datalen; - tx_dsg->lkey = iser_conn->login_req_mr->lkey; - tx_desc->num_sge = 2; + tx_dsg->lkey = tx_desc->data_mr->lkey; + tx_desc->num_sge = 2; } if (iser_pdu->iscsi_pdu.response_opcode == ISCSI_PDU_LOGIN_RESPONSE || @@ -506,12 +711,14 @@ iser_send_control(struct iser_conn *iser_conn, struct iser_pdu *iser_pdu) { * @iser_conn: iser_connection context */ static int -iser_initialize_headers(struct iser_pdu *iser_pdu, struct iser_conn *iser_conn) +iser_initialize_headers(struct iser_pdu *iser_pdu, struct iscsi_context *iscsi) { struct iser_tx_desc *tx_desc; - tx_desc = iser_conn->tx_desc; - ISCSI_LIST_REMOVE(&iser_conn->tx_desc, tx_desc); + tx_desc = iser_tx_desc_alloc(iscsi, 
get_data_size(iser_pdu)); + if (tx_desc == NULL) { + return -1; + } iser_pdu->desc = tx_desc; @@ -640,19 +847,6 @@ is_control_opcode(uint8_t opcode) return is_control; } -static int -overflow_data_size(struct iser_pdu *iser_pdu) -{ - int data_size; - - if (!iser_pdu->iscsi_pdu.scsi_cbdata.task) { - return 0; - } - data_size = iser_pdu->iscsi_pdu.scsi_cbdata.task->expxferlen; - - return (data_size > DATA_BUFFER_SIZE); -} - /* * iser_send_command() - sending iscsi pdu of type COMMAND * @@ -673,11 +867,6 @@ iser_send_command(struct iser_conn *iser_conn, struct iser_pdu *iser_pdu) iser_create_send_desc(iser_pdu); - if (overflow_data_size(iser_pdu)) { - iscsi_set_error(iscsi, "Libiscsi-iSER supports messages smaller than 512k\n"); - return -1; - } - if (iser_pdu->desc->iscsi_header[1] & BHSSC_FLAGS_R) { err = iser_prepare_read_cmd(iser_conn, iser_pdu); if (err) { @@ -717,7 +906,7 @@ iscsi_iser_send_pdu(struct iscsi_context *iscsi, struct iscsi_pdu *pdu) { if (!iser_conn) return 0; - if (iser_initialize_headers(iser_pdu, iser_conn)) { + if (iser_initialize_headers(iser_pdu, iscsi)) { iscsi_set_error(iscsi, "initialize headers Failed\n"); return -1; } @@ -904,20 +1093,17 @@ static int iser_addr_handler(struct rdma_cm_id *cma_id) { goto cq_error; } - iser_conn->login_buf = iscsi_malloc(iscsi, ISCSI_DEF_MAX_RECV_SEG_LEN + ISER_RX_LOGIN_SIZE); - if (!iser_conn->login_buf) { - iscsi_set_error(iscsi, "Failed to allocate memory for login_buf\n"); - iscsi_free(iscsi, iser_conn->login_buf); + iser_conn->login_resp_buf = iscsi_malloc(iscsi, ISER_RX_LOGIN_SIZE); + if (!iser_conn->login_resp_buf) { + iscsi_set_error(iscsi, "Failed to allocate memory for login_resp_buf\n"); goto cq_error; } - iser_conn->login_req_buf = iser_conn->login_buf; - iser_conn->login_resp_buf = iser_conn->login_buf + ISCSI_DEF_MAX_RECV_SEG_LEN; iser_conn->login_resp_mr = ibv_reg_mr(iser_conn->pd, iser_conn->login_resp_buf, ISER_RX_LOGIN_SIZE, IBV_ACCESS_LOCAL_WRITE); if(!iser_conn->login_resp_mr) { 
iscsi_set_error(iscsi, "Failed to reg login_resp_mr\n"); - iscsi_free(iscsi, iser_conn->login_buf); + iscsi_free(iscsi, iser_conn->login_resp_buf); goto cq_error; } @@ -1072,46 +1258,6 @@ iser_post_recvm(struct iser_conn *iser_conn, int count) return ret; } -static int -iser_reg_mr(struct iser_conn *iser_conn) -{ - int i; - struct iscsi_context *iscsi = iser_conn->cma_id->context; - struct iser_tx_desc *tx_desc; - - for (i = 0 ; i < NUM_MRS ; i++) { - - tx_desc = iscsi_zmalloc(iscsi, sizeof(*tx_desc)); - if (tx_desc == NULL) { - iscsi_set_error(iscsi, "Out-Of-Memory, failed to allocate data buffer"); - return -1; - } - - tx_desc->hdr_mr = ibv_reg_mr(iser_conn->pd, tx_desc, ISER_HEADERS_LEN, IBV_ACCESS_LOCAL_WRITE); - if (tx_desc->hdr_mr == NULL) { - iscsi_set_error(iscsi, "Failed to register data mr"); - return -1; - } - - tx_desc->data_buff = iscsi_malloc(iscsi, DATA_BUFFER_SIZE); - if (tx_desc->data_buff == NULL) { - iscsi_set_error(iscsi, "Out-Of-Memory, failed to allocate data buffer"); - return -1; - } - - tx_desc->data_mr = ibv_reg_mr(iser_conn->pd, tx_desc->data_buff, DATA_BUFFER_SIZE, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE); - if (tx_desc->data_mr == NULL) { - iscsi_set_error(iscsi, "Failed to register data mr"); - return -1; - } - - ISCSI_LIST_ADD_END(&iser_conn->tx_desc, tx_desc); - } - - return 0; -} - /** * iser_rcv_completion() - handling and processing receive completion * @@ -1187,8 +1333,6 @@ iser_rcv_completion(struct iser_rx_desc *rx_desc, } } - ISCSI_LIST_ADD_END(&iser_conn->tx_desc, iser_pdu->desc); - nop_target: /* decrementing conn->post_recv_buf_count only --after-- freeing the * * task eliminates the need to worry on tasks which are completed in * @@ -1346,8 +1490,7 @@ static int iser_connected_handler(struct rdma_cm_id *cma_id) { iser_conn->post_recv_buf_count = 0; - return iser_reg_mr(iser_conn); - + return 0; } /*