Skip to content

Commit 364c3f9

Browse files
author
Sean Hefty
committed
prov/sockets: Fix use after free error in CM threads
Problem reported by Address Sanitizer: ================================================================= ==25220==ERROR: AddressSanitizer: heap-use-after-free on address 0x6270000072e0 at pc 0x00010b926a3c bp 0x700001bd1c30 sp 0x700001bd1c28 READ of size 4 at 0x6270000072e0 thread T4 #0 0x10b926a3b in sock_conn_listener_thread (libfabric.1.dylib:x86_64+0xdca3b) #1 0x7fff7e2d5660 in _pthread_body (libsystem_pthread.dylib:x86_64+0x3660) #2 0x7fff7e2d550c in _pthread_start (libsystem_pthread.dylib:x86_64+0x350c) #3 0x7fff7e2d4bf8 in thread_start (libsystem_pthread.dylib:x86_64+0x2bf8) 0x6270000072e0 is located 480 bytes inside of 12944-byte region [0x627000007100,0x62700000a390) freed by thread T0 here: #0 0x10baf1a9d in wrap_free (libclang_rt.asan_osx_dynamic.dylib:x86_64+0x56a9d) #1 0x10b9016bf in sock_ep_close (libfabric.1.dylib:x86_64+0xb76bf) #2 0x10b7f4a8f in fi_close fabric.h:593 #3 0x10b7f4209 in main shared_ctx.c:649 #4 0x7fff7dfbd014 in start (libdyld.dylib:x86_64+0x1014) previously allocated by thread T0 here: #0 0x10baf1e27 in wrap_calloc (libclang_rt.asan_osx_dynamic.dylib:x86_64+0x56e27) #1 0x10b906df4 in sock_alloc_endpoint (libfabric.1.dylib:x86_64+0xbcdf4) #2 0x10b8f7fdb in sock_msg_ep (libfabric.1.dylib:x86_64+0xadfdb) #3 0x10b7f7c93 in fi_endpoint fi_endpoint.h:164 #4 0x10b7f5e40 in server_connect shared_ctx.c:471 #5 0x10b7f49ba in run shared_ctx.c:573 #6 0x10b7f411b in main shared_ctx.c:647 #7 0x7fff7dfbd014 in start (libdyld.dylib:x86_64+0x1014) Thread T4 created by T0 here: #0 0x10bae999d in wrap_pthread_create (libclang_rt.asan_osx_dynamic.dylib:x86_64+0x4e99d) #1 0x10b925f9b in sock_conn_start_listener_thread (libfabric.1.dylib:x86_64+0xdbf9b) #2 0x10b8e7eb2 in sock_domain (libfabric.1.dylib:x86_64+0x9deb2) #3 0x10b7f87d3 in fi_domain fi_domain.h:306 #4 0x10b7f5c9f in server_connect shared_ctx.c:460 #5 0x10b7f49ba in run shared_ctx.c:573 #6 0x10b7f411b in main shared_ctx.c:647 #7 0x7fff7dfbd014 in start (libdyld.dylib:x86_64+0x1014) The issue shows up more frequently on OS X, which emulates epoll. However, I believe the problem could occur on any platform. In sock_ep_close, we remove the socket from the epoll fd, then free the endpoint. However, if the listener thread has received an event on the socket, but has not yet started processing it, then a race can occur. The listener thread could have returned from ofi_epoll_wait, but suspended trying to acquire the signal_lock. The signal_lock is acquired from sock_ep_close, where ofi_epoll_del is called, then released. The endpoint is then freed. The listener thread can now acquire the signal_lock, where it will attempt to access the freed endpoint data. To avoid the race, we add a change boolean to the listener. That boolean is only changed while holding the signal_lock. When a socket is removed from the epollfd, we mark the listener state as 'changed'. The listener thread checks the changed state prior to processing any events. If set, it clears the state, and calls ofi_epoll_wait again to get a new set of events to process. Note that this works for epoll set to level-triggered (poll semantics). Sockets that reported events will report those same events when wait is called a second time. Sockets which were removed from the epoll set would have their events removed, as they are no longer being monitored. This fix is applied both to the listener thread and cm thread. Signed-off-by: Sean Hefty <sean.hefty@intel.com>
1 parent 8337e71 commit 364c3f9

4 files changed

Lines changed: 26 additions & 0 deletions

File tree

prov/sockets/include/sock.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ struct sock_conn_listener {
210210
fastlock_t signal_lock; /* acquire before map lock */
211211
pthread_t listener_thread;
212212
int do_listen;
213+
bool removed_from_epollfd;
213214
};
214215

215216
struct sock_ep_cm_head {
@@ -219,6 +220,7 @@ struct sock_ep_cm_head {
219220
pthread_t listener_thread;
220221
struct dlist_entry msg_list;
221222
int do_listen;
223+
bool removed_from_epollfd;
222224
};
223225

224226
struct sock_domain {

prov/sockets/src/sock_conn.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,15 @@ static void *sock_conn_listener_thread(void *arg)
332332
}
333333

334334
fastlock_acquire(&conn_listener->signal_lock);
335+
if (conn_listener->removed_from_epollfd) {
336+
/* The epoll set changed between calling wait and wait
337+
* returning. Get an updated set of events to avoid
338+
* possible use after free error.
339+
*/
340+
conn_listener->removed_from_epollfd = false;
341+
goto skip;
342+
}
343+
335344
for (i = 0; i < num_fds; i++) {
336345
conn_handle = ep_contexts[i];
337346

@@ -360,6 +369,7 @@ static void *sock_conn_listener_thread(void *arg)
360369
fastlock_release(&ep_attr->cmap.lock);
361370
sock_pe_signal(ep_attr->domain->pe);
362371
}
372+
skip:
363373
fastlock_release(&conn_listener->signal_lock);
364374
}
365375

@@ -393,6 +403,7 @@ int sock_conn_start_listener_thread(struct sock_conn_listener *conn_listener)
393403
}
394404

395405
conn_listener->do_listen = 1;
406+
conn_listener->removed_from_epollfd = false;
396407
ret = pthread_create(&conn_listener->listener_thread, NULL,
397408
sock_conn_listener_thread, conn_listener);
398409
if (ret < 0) {

prov/sockets/src/sock_ep.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,7 @@ static int sock_ep_close(struct fid *fid)
682682
fastlock_acquire(&sock_ep->attr->domain->conn_listener.signal_lock);
683683
ofi_epoll_del(sock_ep->attr->domain->conn_listener.epollfd,
684684
sock_ep->attr->conn_handle.sock);
685+
sock_ep->attr->domain->conn_listener.removed_from_epollfd = true;
685686
fastlock_release(&sock_ep->attr->domain->conn_listener.signal_lock);
686687
ofi_close_socket(sock_ep->attr->conn_handle.sock);
687688
sock_ep->attr->conn_handle.do_listen = 0;

prov/sockets/src/sock_ep_msg.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ sock_ep_cm_unmonitor_handle_locked(struct sock_ep_cm_head *cm_head,
255255
SOCK_LOG_ERROR("failed to unmonitor fd %d: %d\n",
256256
handle->sock_fd, ret);
257257
handle->monitored = 0;
258+
cm_head->removed_from_epollfd = true;
258259
}
259260

260261
/* Multiple threads might call sock_ep_cm_unmonitor_handle() at the
@@ -1174,6 +1175,15 @@ static void *sock_ep_cm_thread(void *arg)
11741175
}
11751176

11761177
pthread_mutex_lock(&cm_head->signal_lock);
1178+
if (cm_head->removed_from_epollfd) {
1179+
/* If we removed a socket from the epollfd after
1180+
* ofi_epoll_wait returned, we can hit a use after
1181+
* free error. If a change was made, we skip processing
1182+
* and recheck for events.
1183+
*/
1184+
cm_head->removed_from_epollfd = false;
1185+
goto skip;
1186+
}
11771187
for (i = 0; i < num_fds; i++) {
11781188
handle = ep_contexts[i];
11791189

@@ -1195,6 +1205,7 @@ static void *sock_ep_cm_thread(void *arg)
11951205
assert(handle->sock_fd != INVALID_SOCKET);
11961206
sock_ep_cm_handle_rx(cm_head, handle);
11971207
}
1208+
skip:
11981209
pthread_mutex_unlock(&cm_head->signal_lock);
11991210
}
12001211
return NULL;
@@ -1230,6 +1241,7 @@ int sock_ep_cm_start_thread(struct sock_ep_cm_head *cm_head)
12301241
}
12311242

12321243
cm_head->do_listen = 1;
1244+
cm_head->removed_from_epollfd = false;
12331245
ret = pthread_create(&cm_head->listener_thread, 0,
12341246
sock_ep_cm_thread, cm_head);
12351247
if (ret) {

0 commit comments

Comments
 (0)