libmooshika
mooshika.h
Go to the documentation of this file.
1 /*
2  *
3  * Copyright CEA/DAM/DIF (2012)
4  * contributor : Dominique Martinet dominique.martinet@cea.fr
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 3 of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  *
20  * ---------------------------------------
21  */
22 
32 #ifndef _MOOSHIKA_H
33 #define _MOOSHIKA_H
34 
35 #include <rdma/rdma_cma.h>
36 
37 #define MOOSHIKA_API_VERSION 5
38 
39 typedef struct msk_trans msk_trans_t;
41 
/* Data size and content to send, or that was just received. */
46 typedef struct msk_data {
47  uint32_t max_size; /* size of the data field */
48  uint32_t size; /* size of the data to actually send/read */
49  uint8_t *data; /* opaque data */
50  struct msk_data *next; /* for recv/sends with multiple elements, used as a linked list */
51  struct ibv_mr *mr; /* NOTE(review): presumably the memory region covering `data` -- confirm */
52  enum ibv_wc_status status; /* work completion status, set upon reception of work completion */
53 } msk_data_t;
54 
55 typedef union sockaddr_union {
56  struct sockaddr sa;
57  struct sockaddr_in sa_in;
58  struct sockaddr_in6 sa_int6;
59  struct sockaddr_storage sa_stor;
61 
/* Counters embedded in struct msk_trans as its `stats` member. */
/* NOTE(review): rx_ and tx_ presumably mean receive and transmit, counting bytes, packets and errors -- confirm against the implementation. */
62 struct msk_stats {
63  uint64_t rx_bytes;
64  uint64_t rx_pkt;
65  uint64_t rx_err;
66  uint64_t tx_bytes;
67  uint64_t tx_pkt;
68  uint64_t tx_err;
69  /* times only set if debug has MSK_DEBUG_SPEED */
70  uint64_t nsec_callback; /* NOTE(review): presumably nanoseconds spent in callbacks -- confirm */
71  uint64_t nsec_compevent; /* NOTE(review): presumably nanoseconds spent on completion events -- confirm */
72 };
73 
/* Protection-domain entry; struct msk_trans refers to it through its `pd` pointer. */
/* NOTE(review): this listing carries no field descriptions; the notes below are read off the member types only. */
74 struct msk_pd {
75  struct ibv_context *context; /* verbs device context */
76  struct ibv_pd *pd; /* verbs protection domain */
77  struct ibv_srq *srq; /* verbs shared receive queue */
78  struct msk_ctx *rctx; /* NOTE(review): presumably receive contexts used with srq -- confirm */
79  void *private; /* opaque pointer; owner not visible here */
80  uint32_t refcnt; /* NOTE(review): presumably a reference count -- confirm who increments/decrements it */
81  uint32_t used; /* NOTE(review): meaning not visible in this header -- confirm */
82 };
/* Sentinel pointer value (void*)-1. NOTE(review): presumably marks the end of a msk_pd list -- confirm. */
83 #define PD_GUARD ((void*)-1)
84 
/* Callback type receiving the transport it concerns; it is the type of the `disconnect_callback` member named in this listing's index. */
85 typedef void (*disconnect_callback_t) (msk_trans_t *trans);
86 
/* Special values of the `server` field: 0 for a client, -1 for a connection accepted by a server. */
/* On a listening server the field instead holds the connection backlog. */
87 #define MSK_CLIENT 0
88 #define MSK_SERVER_CHILD -1
89 
/* RDMA transport instance. */
/*
 * NOTE(review): this listing skips several numbered lines of the struct.
 * Per the member index further down they hold: the msk_state enumerators
 * (96-104), disconnect_callback (113), private_data (114), max_send_sge (117),
 * max_recv_sge (119), destroy_on_disconnect (124), stats_prefix (135) and
 * stats_sock (136).
 */
94 struct msk_trans {
95  enum msk_state {
105  } state; /* tracks the transport state machine for connection setup and tear down */
106  struct rdma_cm_id *cm_id; /* the RDMA CM ID */
107  struct rdma_event_channel *event_channel;
108  struct ibv_comp_channel *comp_channel;
109  struct msk_pd *pd; /* Protection Domain pointer list */
110  struct ibv_qp *qp; /* Queue Pair pointer */
111  struct ibv_srq *srq; /* Shared Receive Queue pointer */
112  struct ibv_cq *cq; /* Completion Queue pointer */
115  long timeout; /* number of msecs to wait for connection management events */
116  int sq_depth; /* the depth of the Send Queue */
118  int rq_depth; /* the depth of the Receive Queue */
120  char *node; /* the remote peer's hostname */
121  char *port; /* the service port (or name) */
122  int conn_type; /* RDMA port space, probably RDMA_PS_TCP */
123  int server; /* 0 if client, connection backlog on server, -1 (MSK_SERVER_CHILD) if server's accepted connection */
125  int privport; /* set to 1 if mooshika should use a reserved port for client side */
126  uint32_t debug; /* bit mask of MSK_DEBUG_* flags */
127  struct rdma_cm_id **conn_requests; /* temporary child cm_id, only used for server */
128  struct msk_ctx *wctx; /* pointer to actual context data */
129  struct msk_ctx *rctx; /* pointer to actual context data */
130  pthread_mutex_t cm_lock; /* lock for connection events */
131  pthread_cond_t cm_cond; /* cond for connection events */
132  struct ibv_recv_wr *bad_recv_wr;
133  struct ibv_send_wr *bad_send_wr;
134  struct msk_stats stats;
137 };
138 
141  int debug;
142  int server;
144  int privport;
145  long timeout;
146  int sq_depth;
148  int use_srq;
149  int rq_depth;
153  enum rdma_port_space conn_type;
154  char *node;
155  char *port;
156  struct msk_pd *pd;
158 };
159 
/* Bit flags for the `debug` field; verbose output goes to stderr if set. */
160 #define MSK_DEBUG_EVENT 0x0001
161 #define MSK_DEBUG_SETUP 0x0002
162 #define MSK_DEBUG_SEND 0x0004
163 #define MSK_DEBUG_RECV 0x0008
/* Shorthand selecting both the send and the recv flag. */
164 #define MSK_DEBUG_WORKERS (MSK_DEBUG_SEND | MSK_DEBUG_RECV)
165 #define MSK_DEBUG_CM_LOCKS 0x0010
166 #define MSK_DEBUG_CTX 0x0020
/* The nsec_* members of struct msk_stats are only set when this flag is in `debug`. */
167 #define MSK_DEBUG_SPEED 0x8000
168 
169 
/* Callback type taking the transport, a data buffer and a caller-supplied argument. */
/* It is the type of both the `callback` and `err_callback` parameters of the msk_post_* functions. */
170 typedef void (*ctx_callback_t)(msk_trans_t *trans, msk_data_t *data, void *arg);
171 
172 
/* Stores one remote address to write/read at. */
177 typedef struct msk_rloc {
178  uint64_t raddr; /* remote memory address */
179  uint32_t rkey; /* remote key */
180  uint32_t size; /* size of the region we can write/read */
181 } msk_rloc_t;
182 
183 
184 
/*
 * Multi-element data transfer entry points. `data` is a msk_data_t chain
 * (linked through `next`) and `num_sge` is passed alongside it; the
 * single-element inline wrappers in this header pass 1.
 * The post variants take a callback, an error callback and an argument for them;
 * the wait variants take none.
 */
/* Posts a receive buffer. */
185 int msk_post_n_recv(msk_trans_t *trans, msk_data_t *data, int num_sge, ctx_callback_t callback, ctx_callback_t err_callback, void *callback_arg);
/* Posts a send buffer. */
186 int msk_post_n_send(msk_trans_t *trans, msk_data_t *data, int num_sge, ctx_callback_t callback, ctx_callback_t err_callback, void *callback_arg);
/* Posts a receive buffer and waits for that one, and not any other, to be filled. */
187 int msk_wait_n_recv(msk_trans_t *trans, msk_data_t *data, int num_sge);
/* Posts a send buffer and waits for that one to be completely sent. */
188 int msk_wait_n_send(msk_trans_t *trans, msk_data_t *data, int num_sge);
/* NOTE(review): the four read/write variants below have no description in this listing; rloc presumably names the remote region to read from / write to -- confirm. */
189 int msk_post_n_read(msk_trans_t *trans, msk_data_t *data, int num_sge, msk_rloc_t *rloc, ctx_callback_t callback, ctx_callback_t err_callback, void* callback_arg);
190 int msk_post_n_write(msk_trans_t *trans, msk_data_t *data, int num_sge, msk_rloc_t *rloc, ctx_callback_t callback, ctx_callback_t err_callback, void* callback_arg);
191 int msk_wait_n_read(msk_trans_t *trans, msk_data_t *data, int num_sge, msk_rloc_t *rloc);
192 int msk_wait_n_write(msk_trans_t *trans, msk_data_t *data, int num_sge, msk_rloc_t *rloc);
193 
/* Single-element form of msk_post_n_recv: forwards every argument unchanged with num_sge fixed to 1 and returns its result. */
194 static inline int msk_post_recv(msk_trans_t *trans, msk_data_t *data, ctx_callback_t callback, ctx_callback_t err_callback, void *callback_arg) {
195  return msk_post_n_recv(trans, data, 1, callback, err_callback, callback_arg);
196 }
/* Single-element form of msk_post_n_send: forwards every argument unchanged with num_sge fixed to 1 and returns its result. */
197 static inline int msk_post_send(msk_trans_t *trans, msk_data_t *data, ctx_callback_t callback, ctx_callback_t err_callback, void *callback_arg) {
198  return msk_post_n_send(trans, data, 1, callback, err_callback, callback_arg);
199 }
200 
/* Single-element form of msk_wait_n_recv: calls it with num_sge fixed to 1 and returns its result. */
201 static inline int msk_wait_recv(msk_trans_t *trans, msk_data_t *data) {
202  return msk_wait_n_recv(trans, data, 1);
203 }
204 
/* Single-element form of msk_wait_n_send: calls it with num_sge fixed to 1 and returns its result. */
205 static inline int msk_wait_send(msk_trans_t *trans, msk_data_t *data) {
206  return msk_wait_n_send(trans, data, 1);
207 }
208 
/* Single-element form of msk_post_n_read: forwards every argument unchanged with num_sge fixed to 1 and returns its result. */
209 static inline int msk_post_read(msk_trans_t *trans, msk_data_t *data, msk_rloc_t *rloc, ctx_callback_t callback, ctx_callback_t err_callback, void* callback_arg) {
210  return msk_post_n_read(trans, data, 1, rloc, callback, err_callback, callback_arg);
211 }
212 
/* Single-element form of msk_post_n_write: forwards every argument unchanged with num_sge fixed to 1 and returns its result. */
213 static inline int msk_post_write(msk_trans_t *trans, msk_data_t *data, msk_rloc_t *rloc, ctx_callback_t callback, ctx_callback_t err_callback, void* callback_arg) {
214  return msk_post_n_write(trans, data, 1, rloc, callback, err_callback, callback_arg);
215 }
216 
/* Single-element form of msk_wait_n_read: calls it with num_sge fixed to 1 and returns its result. */
217 static inline int msk_wait_read(msk_trans_t *trans, msk_data_t *data, msk_rloc_t *rloc) {
218  return msk_wait_n_read(trans, data, 1, rloc);
219 }
220 
/* Single-element form of msk_wait_n_write: calls it with num_sge fixed to 1 and returns its result. */
221 static inline int msk_wait_write(msk_trans_t *trans, msk_data_t *data, msk_rloc_t *rloc) {
222  return msk_wait_n_write(trans, data, 1, rloc);
223 }
224 
225 
226 
/* msk_init: the part of the init that is the same for client and server. */
/* NOTE(review): presumably *ptrans receives the new transport built from attr -- confirm ownership and error return in trans_rdma.c. */
227 int msk_init(msk_trans_t **ptrans, msk_trans_attr_t *attr);
228 
229 // server specific:
230 int msk_bind_server(msk_trans_t *trans);
/* NOTE(review): no description in this listing; msleep is presumably a wait in milliseconds -- confirm. */
231 msk_trans_t *msk_accept_one_wait(msk_trans_t *trans, int msleep);
/* Given a listening trans, waits till one connection is requested and accepts it. */
/* NOTE(review): abstime is presumably an absolute deadline, as with pthread_cond_timedwait -- confirm. */
232 msk_trans_t *msk_accept_one_timedwait(msk_trans_t *trans, struct timespec *abstime);
/* Accepts one connection by calling msk_accept_one_timedwait with a NULL abstime and returning its result. */
/* NOTE(review): NULL presumably means "no deadline" -- confirm in trans_rdma.c. */
233 static inline msk_trans_t *msk_accept_one(msk_trans_t *trans) {
234  return msk_accept_one_timedwait(trans, NULL);
235 }
/* msk_finalize_accept: does the real connection acceptance and waits for the other side to be ready. */
236 int msk_finalize_accept(msk_trans_t *trans);
/* msk_destroy_trans: disconnects and frees trans data. */
237 void msk_destroy_trans(msk_trans_t **ptrans);
238 
/* msk_connect: connects a client to a server. */
/* NOTE(review): the member index of this listing also names msk_finalize_connect(msk_trans_t *trans), whose prototype line is not visible here. */
239 int msk_connect(msk_trans_t *trans);
241 
242 
243 /* utility functions */
244 
/* msk_reg_mr: registers memory for rdma use (almost the same as ibv_reg_mr). */
245 struct ibv_mr *msk_reg_mr(msk_trans_t *trans, void *memaddr, size_t size, int access);
/* msk_dereg_mr: deregisters memory for rdma use (exactly ibv_dereg_mr). */
246 int msk_dereg_mr(struct ibv_mr *mr);
247 
/* msk_make_rloc: makes a rkey to send it for remote host use. */
248 msk_rloc_t *msk_make_rloc(struct ibv_mr *mr, uint64_t addr, uint32_t size);
249 
250 void msk_print_devinfo(msk_trans_t *trans);
251 
/* Accessors for the addresses and ports of a transport, source or destination as named. */
252 struct sockaddr *msk_get_dst_addr(msk_trans_t *trans);
253 struct sockaddr *msk_get_src_addr(msk_trans_t *trans);
254 uint16_t msk_get_src_port(msk_trans_t *trans);
255 uint16_t msk_get_dst_port(msk_trans_t *trans);
256 
/* msk_getpd: helper function to get the right pd for a given trans. */
257 struct msk_pd *msk_getpd(msk_trans_t *trans);
258 
/* NOTE(review): presumably returns a printable name for a work completion status -- confirm. */
259 const char *msk_wc_status_str(enum ibv_wc_status status);
260 
261 #endif /* _MOOSHIKA_H */
uint64_t nsec_callback
Definition: mooshika.h:70
Definition: mooshika.h:98
int server
0 if client, connection backlog on server, -1 (MSK_SERVER_CHILD) if server's accepted connection ...
Definition: mooshika.h:123
pthread_mutex_t cm_lock
lock for connection events
Definition: mooshika.h:130
int msk_wait_n_read(msk_trans_t *trans, msk_data_t *data, int num_sge, msk_rloc_t *rloc)
Definition: trans_rdma.c:2467
int msk_post_n_write(msk_trans_t *trans, msk_data_t *data, int num_sge, msk_rloc_t *rloc, ctx_callback_t callback, ctx_callback_t err_callback, void *callback_arg)
Definition: trans_rdma.c:2463
struct sockaddr_in sa_in
Definition: mooshika.h:57
struct ibv_cq * cq
Completion Queue pointer.
Definition: mooshika.h:112
struct ibv_mr * mr
Definition: mooshika.h:51
data size and content to send/just received
Definition: mooshika.h:46
struct sockaddr_storage sa_stor
Definition: mooshika.h:59
int server
0 if client, connection backlog on server
Definition: mooshika.h:142
uint64_t tx_pkt
Definition: mooshika.h:67
int msk_wait_n_recv(msk_trans_t *trans, msk_data_t *data, int num_sge)
Posts a receive buffer and waits for that one, and not any other, to be filled.
Definition: trans_rdma.c:2413
Definition: mooshika.h:103
int debug
verbose output to stderr if set
Definition: mooshika.h:141
int max_send_sge
Maximum number of s/g elements per send.
Definition: mooshika.h:117
RDMA transport instance.
Definition: mooshika.h:94
int msk_dereg_mr(struct ibv_mr *mr)
msk_dereg_mr: deregisters memory for rdma use (exactly ibv_dereg_mr)
Definition: trans_rdma.c:322
uint32_t size
size of the region we can write/read
Definition: mooshika.h:180
enum msk_trans::msk_state state
tracks the transport state machine for connection setup and tear down
struct rdma_cm_id ** conn_requests
temporary child cm_id, only used for server
Definition: mooshika.h:127
struct sockaddr sa
Definition: mooshika.h:56
int rq_depth
The depth of the Receive Queue.
Definition: mooshika.h:118
void * private_data
Definition: mooshika.h:114
disconnect_callback_t disconnect_callback
Definition: mooshika.h:113
char * stats_prefix
Definition: mooshika.h:135
uint64_t raddr
remote memory address
Definition: mooshika.h:178
struct ibv_context * context
Definition: mooshika.h:75
struct ibv_comp_channel * comp_channel
Definition: mooshika.h:108
uint32_t rkey
remote key
Definition: mooshika.h:179
const char * msk_wc_status_str(enum ibv_wc_status status)
Definition: trans_rdma.c:2517
struct msk_pd * pd
Protection Domain pointer.
Definition: mooshika.h:156
void msk_print_devinfo(msk_trans_t *trans)
Definition: trans_rdma.c:350
Definition: mooshika.h:100
int conn_type
RDMA Port space, probably RDMA_PS_TCP.
Definition: mooshika.h:122
uint8_t * data
opaque data
Definition: mooshika.h:49
Definition: mooshika.h:97
msk_trans_t * msk_accept_one_timedwait(msk_trans_t *trans, struct timespec *abstime)
msk_accept_one: given a listening trans, waits till one connection is requested and accepts it ...
Definition: trans_rdma.c:1897
int msk_bind_server(msk_trans_t *trans)
msk_bind_server
Definition: trans_rdma.c:1689
struct ibv_srq * srq
Shared Receive Queue pointer.
Definition: mooshika.h:111
Definition: mooshika.h:96
int privport
set to 1 if mooshika should use a reserved port for client side
Definition: mooshika.h:144
int sq_depth
The depth of the Send Queue.
Definition: mooshika.h:116
int privport
set to 1 if mooshika should use a reserved port for client side
Definition: mooshika.h:125
int worker_count
Number of worker threads - works only for the first init.
Definition: mooshika.h:151
Definition: mooshika.h:139
int rq_depth
The depth of the Receive Queue.
Definition: mooshika.h:149
Definition: mooshika.h:101
int msk_finalize_accept(msk_trans_t *trans)
msk_finalize_accept: does the real connection acceptance and waits for the other side to be ready ...
Definition: trans_rdma.c:1846
struct msk_data msk_data_t
uint64_t nsec_compevent
Definition: mooshika.h:71
struct ibv_mr * msk_reg_mr(msk_trans_t *trans, void *memaddr, size_t size, int access)
msk_reg_mr: registers memory for rdma use (almost the same as ibv_reg_mr)
Definition: trans_rdma.c:304
char * node
The remote peer's hostname.
Definition: mooshika.h:120
int worker_queue_size
Size of the worker data queue - works only for the first init.
Definition: mooshika.h:152
int max_recv_sge
Maximum number of s/g elements per recv.
Definition: mooshika.h:119
int msk_connect(msk_trans_t *trans)
msk_connect: connects a client to a server
Definition: trans_rdma.c:2151
struct sockaddr * msk_get_src_addr(msk_trans_t *trans)
Definition: trans_rdma.c:2505
struct rdma_event_channel * event_channel
Definition: mooshika.h:107
int max_send_sge
Maximum number of s/g elements per send.
Definition: mooshika.h:147
uint64_t rx_err
Definition: mooshika.h:65
struct ibv_qp * qp
Queue Pair pointer.
Definition: mooshika.h:110
long timeout
Number of mSecs to wait for connection management events.
Definition: mooshika.h:115
union sockaddr_union sockaddr_union_t
msk_trans_t * msk_accept_one_wait(msk_trans_t *trans, int msleep)
Definition: trans_rdma.c:1957
msk_rloc_t * msk_make_rloc(struct ibv_mr *mr, uint64_t addr, uint32_t size)
msk_make_rloc: makes a rkey to send it for remote host use
Definition: trans_rdma.c:335
uint32_t size
size of the data to actually send/read
Definition: mooshika.h:48
struct sockaddr_in6 sa_int6
Definition: mooshika.h:58
msk_state
Definition: mooshika.h:95
struct msk_pd * msk_getpd(msk_trans_t *trans)
msk_getpd: helper function to get the right pd for a given trans
Definition: trans_rdma.c:222
uint64_t rx_bytes
Definition: mooshika.h:63
int msk_finalize_connect(msk_trans_t *trans)
msk_finalize_connect: tells the other side we're ready to receive stuff (does the actual rdma_connect...
Definition: trans_rdma.c:2098
long timeout
Number of mSecs to wait for connection management events.
Definition: mooshika.h:145
int msk_post_n_recv(msk_trans_t *trans, msk_data_t *data, int num_sge, ctx_callback_t callback, ctx_callback_t err_callback, void *callback_arg)
msk_post_n_recv: Post a receive buffer.
Definition: trans_rdma.c:2218
enum ibv_wc_status status
work completion status, set upon reception of work completion
Definition: mooshika.h:52
void msk_destroy_trans(msk_trans_t **ptrans)
msk_destroy_trans: disconnects and frees trans data
Definition: trans_rdma.c:1352
struct msk_ctx * wctx
pointer to actual context data
Definition: mooshika.h:128
int sq_depth
The depth of the Send Queue.
Definition: mooshika.h:146
void(* ctx_callback_t)(msk_trans_t *trans, msk_data_t *data, void *arg)
Definition: mooshika.h:170
char * stats_prefix
Definition: mooshika.h:157
int use_srq
Does the server use srq?
Definition: mooshika.h:148
int msk_post_n_send(msk_trans_t *trans, msk_data_t *data, int num_sge, ctx_callback_t callback, ctx_callback_t err_callback, void *callback_arg)
Post a send buffer.
Definition: trans_rdma.c:2390
Definition: mooshika.h:55
struct msk_rloc msk_rloc_t
int msk_init(msk_trans_t **ptrans, msk_trans_attr_t *attr)
msk_init: part of the init that's the same for client and server
Definition: trans_rdma.c:1440
uint64_t tx_err
Definition: mooshika.h:68
struct rdma_cm_id * cm_id
The RDMA CM ID.
Definition: mooshika.h:106
struct msk_ctx * rctx
pointer to actual context data
Definition: mooshika.h:129
uint32_t debug
Definition: mooshika.h:126
struct msk_data * next
For recv/sends with multiple elements, used as a linked list.
Definition: mooshika.h:50
uint64_t rx_pkt
Definition: mooshika.h:64
Definition: mooshika.h:99
struct ibv_pd * pd
Definition: mooshika.h:76
int stats_sock
Definition: mooshika.h:136
struct ibv_srq * srq
Definition: mooshika.h:77
struct msk_pd * pd
Protection Domain pointer list.
Definition: mooshika.h:109
char * node
The remote peer's hostname.
Definition: mooshika.h:154
int msk_post_n_read(msk_trans_t *trans, msk_data_t *data, int num_sge, msk_rloc_t *rloc, ctx_callback_t callback, ctx_callback_t err_callback, void *callback_arg)
Definition: trans_rdma.c:2459
struct msk_stats stats
Definition: mooshika.h:134
struct msk_ctx * rctx
Definition: mooshika.h:78
int msk_wait_n_send(msk_trans_t *trans, msk_data_t *data, int num_sge)
Posts a send buffer and waits for that one to be completely sent.
Definition: trans_rdma.c:2437
uint64_t tx_bytes
Definition: mooshika.h:66
char * port
The service port (or name)
Definition: mooshika.h:121
uint32_t max_size
size of the data field
Definition: mooshika.h:47
void(* disconnect_callback_t)(msk_trans_t *trans)
Definition: mooshika.h:85
stores one remote address to write/read at
Definition: mooshika.h:177
int msk_wait_n_write(msk_trans_t *trans, msk_data_t *data, int num_sge, msk_rloc_t *rloc)
Definition: trans_rdma.c:2484
uint32_t used
Definition: mooshika.h:81
struct sockaddr * msk_get_dst_addr(msk_trans_t *trans)
Definition: trans_rdma.c:2501
uint16_t msk_get_src_port(msk_trans_t *trans)
Definition: trans_rdma.c:2509
Definition: mooshika.h:102
enum rdma_port_space conn_type
RDMA Port space, probably RDMA_PS_TCP.
Definition: mooshika.h:153
int max_recv_sge
Maximum number of s/g elements per recv.
Definition: mooshika.h:150
Definition: mooshika.h:74
Definition: mooshika.h:62
uint16_t msk_get_dst_port(msk_trans_t *trans)
Definition: trans_rdma.c:2513
char * port
The service port (or name)
Definition: mooshika.h:155
Definition: mooshika.h:104
disconnect_callback_t disconnect_callback
Definition: mooshika.h:140
struct ibv_send_wr * bad_send_wr
Definition: mooshika.h:133
uint32_t refcnt
Definition: mooshika.h:80
pthread_cond_t cm_cond
cond for connection events
Definition: mooshika.h:131
int destroy_on_disconnect
set to 1 if mooshika should perform cleanup
Definition: mooshika.h:143
int destroy_on_disconnect
set to 1 if mooshika should perform cleanup
Definition: mooshika.h:124
struct ibv_recv_wr * bad_recv_wr
Definition: mooshika.h:132