Skip to content

Commit 508867b

Browse files
committed
Updates for OpenCL stability
1 parent 44e937f commit 508867b

11 files changed

Lines changed: 131 additions & 79 deletions

File tree

.github/workflows/check-unit-tests-arm.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Workflow syntax:
22
# https://help.github.com/en/articles/workflow-syntax-for-github-actions
3-
name: unit-test-checker-arm
3+
name: Unit Tests ARM GPUs
44

55
on:
66
pull_request:

.github/workflows/check-unit-tests-intel.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Workflow syntax:
22
# https://help.github.com/en/articles/workflow-syntax-for-github-actions
3-
name: unit-test-checker-intel
3+
name: Unit Tests Intel GPUs
44

55
on:
66
pull_request:

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# chipStar
22

3+
![Unit Tests Intel GPUs](https://github.com/CHIP-SPV/chipStar/workflows/Unit%20Tests%20Intel%20GPUs/badge.svg)
4+
![Unit Tests ARM GPUs](https://github.com/CHIP-SPV/chipStar/workflows/Unit%20Tests%20ARM%20GPUs/badge.svg)
5+
![Docker Build and Publish](https://github.com/CHIP-SPV/chipStar/workflows/Docker%20Build%20and%20Publish/badge.svg)
6+
37
chipStar enables compiling and running HIP and CUDA applications on platforms which support
48
SPIR-V as the device intermediate representation. It supports
59
OpenCL and Level Zero as the low-level runtime alternatives.
@@ -29,6 +33,13 @@ The following libraries have been ported and should work on any platform:
2933

3034
*If there is a library that you need that is not yet supported, please open an issue stating which libraries you require and what application you are trying to build.*
3135

36+
## Applications
37+
38+
chipStar has so far been tested using the following applications:
39+
- [libCEED](https://github.com/CHIP-SPV/libCEED) Our fork includes some workarounds.
40+
- [GAMESS](https://www.msg.chem.iastate.edu/gamess/) Source code is not public.
41+
- [HeCBench](https://github.com/zjin-lcf/HeCBench) CUDA Benchmarks.
42+
3243
## Getting Started
3344

3445
The quickest way to get started is by using a prebuilt Docker container. Please refer to [Docker README](docker/docker.md)

scripts/check.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
parser.add_argument('-d', '--dry-run', '-N', action='store_true', help='dry run')
2222
parser.add_argument('--regex-include', type=str, nargs='?', default="", help='Tests to be run must also match this regex (known failures will still be excluded)')
2323
parser.add_argument('--regex-exclude', type=str, nargs='?', default="", help='Specifically exclude tests that match this regex (known failures will still be excluded)')
24+
parser.add_argument('--test-mode-failures', type=str, choices=['exclude', 'include'], default='exclude', help='Control how to handle test failures: exclude (default) or include')
2425

2526
# --total-runtime cannot be used with --num-tries
2627
group = parser.add_mutually_exclusive_group()
@@ -112,15 +113,36 @@ def run_cmd(cmd):
112113
failed_test_list = f"./test_lists/{args.backend.upper()}_{device_type_stripped.upper()}.txt"
113114

114115
def run_tests(num_tries):
115-
if len(args.regex_exclude) > 0:
116-
args.regex_exclude = f"{args.regex_exclude}|"
117-
if len(args.regex_include) > 0:
118-
args.regex_include = f"-R {args.regex_include}"
119-
# if failed_test_list is not empty, separator is |, otherwise it is empty
120-
separator = "|" if os.path.exists(failed_test_list) and os.path.getsize(failed_test_list) > 0 else ""
121-
cmd = f"{modules} {env_vars} ctest --output-on-failure --timeout {args.timeout} --repeat until-fail:{num_tries} -j {args.num_threads} {args.regex_include} -E \"{args.regex_exclude}`cat {failed_test_list}`{separator}`cat {all_test_list}`{texture_cmd}\" -O checkpy_{args.backend}_{args.device_type}.txt"
122-
res, err = run_cmd(cmd)
123-
return res, err
116+
if len(args.regex_exclude) > 0:
117+
args.regex_exclude = f"{args.regex_exclude}|"
118+
if len(args.regex_include) > 0:
119+
args.regex_include = f"{args.regex_include}|"
120+
121+
# Determine the test mode based on the new argument
122+
test_mode = "-E" if args.test_mode_failures == "exclude" else "-R"
123+
124+
# if failed_test_list is not empty, separator is |, otherwise it is empty
125+
separator = "|" if os.path.exists(failed_test_list) and os.path.getsize(failed_test_list) > 0 else ""
126+
127+
cmd = f"{modules} {env_vars} ctest --output-on-failure --timeout {args.timeout} --repeat until-fail:{num_tries} -j {args.num_threads} {test_mode} \"{args.regex_exclude}{args.regex_include}`cat {failed_test_list}`{separator}`cat {all_test_list}`{texture_cmd}{double_cmd}\" -O checkpy_{args.backend}_{args.device_type}.txt"
128+
res, err = run_cmd(cmd)
129+
130+
# If using -R, print the tests that passed
131+
if test_mode == "-R":
132+
passed_tests = []
133+
for line in res.split('\n'):
134+
if "Test" in line and "Passed" in line:
135+
test_name = line.split(':')[1].strip()
136+
passed_tests.append(test_name)
137+
138+
if passed_tests:
139+
print("The following tests FAILED:")
140+
for test in passed_tests:
141+
print(f" {test}")
142+
else:
143+
print("No tests passed.")
144+
145+
return res, err
124146

125147

126148
# if --total-runtime is set, calculate the number of tries by running run_tests and checking the time

src/CHIPBindings.cc

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2747,7 +2747,9 @@ hipError_t hipEventSynchronize(hipEvent_t Event) {
27472747
NULLCHECK(Event);
27482748
chipstar::Event *ChipEvent = static_cast<chipstar::Event *>(Event);
27492749

2750-
ChipEvent->wait();
2750+
if (ChipEvent->isRecordingOrRecorded())
2751+
ChipEvent->wait();
2752+
27512753
RETURN(hipSuccess);
27522754

27532755
CHIP_CATCH
@@ -2762,21 +2764,28 @@ hipError_t hipEventElapsedTime(float *Ms, hipEvent_t Start, hipEvent_t Stop) {
27622764
NULLCHECK(Start, Stop);
27632765
chipstar::Event *ChipEventStart = static_cast<chipstar::Event *>(Start);
27642766
chipstar::Event *ChipEventStop = static_cast<chipstar::Event *>(Stop);
2767+
2768+
if (ChipEventStart->getFlags().isDisableTiming() ||
2769+
ChipEventStop->getFlags().isDisableTiming())
2770+
CHIPERR_LOG_AND_THROW("One of the events has timings disabled. "
2771+
"Unable to return elasped time",
2772+
hipErrorInvalidHandle);
2773+
27652774
if (!ChipEventStart->isRecordingOrRecorded() ||
27662775
!ChipEventStop->isRecordingOrRecorded()) {
27672776
CHIPERR_LOG_AND_THROW("One of the events was not recorded",
27682777
hipErrorInvalidHandle);
27692778
}
2770-
if (ChipEventStart->getFlags().isDisableTiming() ||
2771-
ChipEventStop->getFlags().isDisableTiming())
2772-
CHIPERR_LOG_AND_THROW("One of the events has timings disabled. "
2773-
"Unable to return elasped time",
2774-
hipErrorInvalidResourceHandle);
2779+
2780+
if (!ChipEventStart->getEventStatus() == EVENT_STATUS_RECORDING)
2781+
RETURN(hipErrorNotReady);
2782+
if (!ChipEventStop->getEventStatus() == EVENT_STATUS_RECORDING)
2783+
RETURN(hipErrorNotReady);
27752784

27762785
*Ms = ChipEventStart->getElapsedTime(ChipEventStop);
2777-
RETURN(hipSuccess);
27782786

27792787
CHIP_CATCH
2788+
RETURN(hipSuccess);
27802789
}
27812790

27822791
hipError_t hipEventQuery(hipEvent_t Event) {

src/backend/Level0/zeHipErrorConversion.hh

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -345,14 +345,16 @@ inline hipError_t hip_convert_error(ze_result_t zeStatus, FuncPtr func) {
345345
#undef CHIPERR_CHECK_LOG_AND_THROW_TABLE
346346
#define CHIPERR_CHECK_LOG_AND_THROW_TABLE(func, ...) \
347347
do { \
348-
if (zeStatus != ZE_RESULT_SUCCESS) { \
349-
hipError_t err = hip_convert_error(zeStatus, func); \
348+
if (zeStatus != ZE_RESULT_SUCCESS) { \
349+
hipError_t err = hip_convert_error(zeStatus, func); \
350350
if (err == hipErrorTbd) { \
351351
std::cerr << "Error: Unmapped API or API Error Code encountered at " \
352352
<< __FILE__ << ":" << __LINE__ << std::endl; \
353+
std::cerr << "API call: " << #func << std::endl; \
354+
std::cerr << "Error code: " << resultToString(zeStatus) << std::endl; \
353355
std::abort(); \
354356
} \
355-
std::string error_msg = std::string(resultToString(zeStatus)); \
357+
std::string error_msg = std::string(resultToString(zeStatus)); \
356358
std::string custom_msg = std::string(__VA_ARGS__); \
357359
std::string msg_ = error_msg + " " + custom_msg; \
358360
CHIPERR_LOG_AND_THROW(msg_, err); \
@@ -362,8 +364,8 @@ inline hipError_t hip_convert_error(ze_result_t zeStatus, FuncPtr func) {
362364
#undef CHIPERR_CHECK_LOG_AND_ABORT
363365
#define CHIPERR_CHECK_LOG_AND_ABORT(...) \
364366
do { \
365-
if (zeStatus != ZE_RESULT_SUCCESS) { \
366-
std::string error_msg = std::string(resultToString(zeStatus)); \
367+
if (zeStatus != ZE_RESULT_SUCCESS) { \
368+
std::string error_msg = std::string(resultToString(zeStatus)); \
367369
std::string custom_msg = std::string(__VA_ARGS__); \
368370
std::string msg_ = error_msg + " " + custom_msg; \
369371
std::cout << msg_ << std::endl; \
@@ -374,8 +376,8 @@ inline hipError_t hip_convert_error(ze_result_t zeStatus, FuncPtr func) {
374376
#undef CHIPERR_CHECK_LOG_AND_THROW
375377
#define CHIPERR_CHECK_LOG_AND_THROW(errtype, ...) \
376378
do { \
377-
if (zeStatus != ZE_RESULT_SUCCESS) { \
378-
std::string error_msg = std::string(resultToString(zeStatus)); \
379+
if (zeStatus != ZE_RESULT_SUCCESS) { \
380+
std::string error_msg = std::string(resultToString(zeStatus)); \
379381
std::string custom_msg = std::string(__VA_ARGS__); \
380382
std::string msg_ = error_msg + " " + custom_msg; \
381383
CHIPERR_LOG_AND_THROW(msg_, errtype); \

src/backend/OpenCL/CHIPBackendOpenCL.cc

Lines changed: 49 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -666,8 +666,12 @@ CHIPEventOpenCL::CHIPEventOpenCL(CHIPContextOpenCL *ChipContext,
666666
uint64_t CHIPEventOpenCL::getFinishTime() {
667667
int clStatus;
668668
uint64_t Ret;
669+
size_t returnedSize;
669670
clStatus = clGetEventProfilingInfo(ClEvent, CL_PROFILING_COMMAND_END,
670-
sizeof(Ret), &Ret, NULL);
671+
sizeof(Ret), &Ret, &returnedSize);
672+
if (returnedSize != sizeof(Ret)) {
673+
std::abort();
674+
}
671675

672676
if (clStatus != CL_SUCCESS) {
673677
clStatus = clGetEventInfo(ClEvent, CL_EVENT_COMMAND_EXECUTION_STATUS,
@@ -730,6 +734,8 @@ void CHIPEventOpenCL::recordEventCopy(
730734
this->ClEvent = Other->ClEvent;
731735
this->RecordedEvent = Other;
732736
this->Msg = "recordEventCopy: " + Other->Msg;
737+
this->HostTimeStamp =
738+
std::chrono::high_resolution_clock::now().time_since_epoch().count();
733739
}
734740

735741
bool CHIPEventOpenCL::wait() {
@@ -769,10 +775,6 @@ bool CHIPEventOpenCL::updateFinishStatus(bool ThrowErrorIfNotReady) {
769775
}
770776

771777
float CHIPEventOpenCL::getElapsedTime(chipstar::Event *OtherIn) {
772-
// Why do I need to lock the context mutex?
773-
// Can I lock the mutex of this and the other event?
774-
//
775-
776778
CHIPEventOpenCL *Other = (CHIPEventOpenCL *)OtherIn;
777779

778780
if (this->getContext() != Other->getContext())
@@ -781,8 +783,10 @@ float CHIPEventOpenCL::getElapsedTime(chipstar::Event *OtherIn) {
781783
"the same context",
782784
hipErrorTbd);
783785

784-
this->updateFinishStatus(true);
785-
Other->updateFinishStatus(true);
786+
if (this->getEventStatus() == EVENT_STATUS_RECORDING)
787+
this->updateFinishStatus(false);
788+
if (Other->getEventStatus() == EVENT_STATUS_RECORDING)
789+
Other->updateFinishStatus(false);
786790

787791
if (!this->isRecordingOrRecorded() || !Other->isRecordingOrRecorded())
788792
CHIPERR_LOG_AND_THROW("one of the events isn't/hasn't recorded",
@@ -792,23 +796,40 @@ float CHIPEventOpenCL::getElapsedTime(chipstar::Event *OtherIn) {
792796
CHIPERR_LOG_AND_THROW("one of the events hasn't finished",
793797
hipErrorNotReady);
794798

795-
uint64_t Started = this->getFinishTime();
796-
uint64_t Finished = Other->getFinishTime();
799+
uint64_t BeginGPU = this->getFinishTime();
800+
uint64_t EndGPU = Other->getFinishTime();
801+
uint64_t BeginCPU = this->HostTimeStamp;
802+
uint64_t EndCPU = Other->HostTimeStamp;
797803

798-
logTrace("EventElapsedTime: STARTED {} / {} FINISHED {} / {} \n",
799-
(void *)this, Started, (void *)Other, Finished);
804+
bool ReversedEvents = false;
805+
if (EndCPU < BeginCPU) {
806+
ReversedEvents = true;
807+
std::swap(BeginGPU, EndGPU);
808+
}
800809

801-
// apparently fails for Intel NEO, god knows why
802-
// assert(Finished >= Started);
810+
// Handle overflow
803811
int64_t Elapsed;
812+
const uint64_t MaxValue = std::numeric_limits<uint64_t>::max();
813+
if (EndGPU < BeginGPU) {
814+
logError("Overflow detected in CHIPEventOpenCL::getElapsedTime()");
815+
logError("BeginGPU: {}, EndGPU: {}", BeginGPU, EndGPU);
816+
Elapsed =
817+
(MaxValue - BeginGPU) + EndGPU + 1; // +1 to account for wraparound
818+
} else {
819+
Elapsed = EndGPU - BeginGPU;
820+
}
821+
804822
const int64_t NANOSECS = 1000000000;
805-
if (Finished < Started)
806-
logWarn("Finished < Started\n");
807-
Elapsed = Finished - Started;
808823
int64_t MS = (Elapsed / NANOSECS) * 1000;
809824
int64_t NS = Elapsed % NANOSECS;
810825
float FractInMS = ((float)NS) / 1000000.0f;
811-
return (float)MS + FractInMS;
826+
float Ms = (float)MS + FractInMS;
827+
828+
Ms = std::abs(Ms);
829+
if (ReversedEvents)
830+
Ms = -Ms;
831+
832+
return Ms;
812833
}
813834

814835
void CHIPEventOpenCL::hostSignal() { UNIMPLEMENTED(); }
@@ -1069,6 +1090,7 @@ struct HipStreamCallbackData {
10691090
void *UserData;
10701091
hipStreamCallback_t Callback;
10711092
std::shared_ptr<chipstar::Event> CallbackFinishEvent;
1093+
std::shared_ptr<chipstar::Event> CallbackCompleted;
10721094
};
10731095

10741096
void CL_CALLBACK pfn_notify(cl_event Event, cl_int CommandExecStatus,
@@ -1084,6 +1106,7 @@ void CL_CALLBACK pfn_notify(cl_event Event, cl_int CommandExecStatus,
10841106
std::static_pointer_cast<CHIPEventOpenCL>(Cbo->CallbackFinishEvent)
10851107
->ClEvent,
10861108
CL_COMPLETE);
1109+
CHIPERR_CHECK_LOG_AND_THROW_TABLE(clSetUserEventStatus);
10871110
}
10881111
delete Cbo;
10891112
}
@@ -1176,21 +1199,12 @@ void CHIPQueueOpenCL::addCallback(hipStreamCallback_t Callback,
11761199
cl::Context *ClContext_ = ((CHIPContextOpenCL *)ChipContext_)->get();
11771200
cl_int Err;
11781201

1179-
std::shared_ptr<chipstar::Event> HoldBackEvent =
1180-
static_cast<CHIPBackendOpenCL *>(Backend)->createEventShared(
1181-
ChipContext_);
1182-
1183-
std::static_pointer_cast<CHIPEventOpenCL>(HoldBackEvent)->ClEvent =
1184-
clCreateUserEvent(ClContext_->get(), &Err);
1185-
1186-
std::vector<std::shared_ptr<chipstar::Event>> WaitForEvents{HoldBackEvent};
1187-
11881202
// Enqueue a barrier used to ensure the callback is not called too early,
11891203
// otherwise it would be (at worst) executed in this host thread when
11901204
// setting it, blocking the execution, while the clients might expect
11911205
// parallel execution.
11921206
std::shared_ptr<chipstar::Event> HoldbackBarrierCompletedEv =
1193-
enqueueBarrier(WaitForEvents);
1207+
enqueueBarrier(std::vector<std::shared_ptr<chipstar::Event>>{});
11941208

11951209
// OpenCL event callbacks have undefined execution ordering/finishing
11961210
// guarantees. We need to enforce CUDA ordering using user events.
@@ -1208,10 +1222,10 @@ void CHIPQueueOpenCL::addCallback(hipStreamCallback_t Callback,
12081222
// finishing the user CB's execution.
12091223

12101224
HipStreamCallbackData *Cb = new HipStreamCallbackData{
1211-
this, hipSuccess, UserData, Callback, CallbackEvent};
1225+
this, hipSuccess, UserData, Callback, CallbackEvent, nullptr};
12121226

12131227
std::vector<std::shared_ptr<chipstar::Event>> WaitForEventsCBB{CallbackEvent};
1214-
auto CallbackCompleted = enqueueBarrier(WaitForEventsCBB);
1228+
Cb->CallbackCompleted = enqueueBarrier(WaitForEventsCBB);
12151229

12161230
// We know that the callback won't be yet launched since it's depending
12171231
// on the barrier which waits for the user event.
@@ -1221,17 +1235,17 @@ void CHIPQueueOpenCL::addCallback(hipStreamCallback_t Callback,
12211235
CL_COMPLETE, pfn_notify, Cb);
12221236
CHIPERR_CHECK_LOG_AND_THROW_TABLE(clSetEventCallback);
12231237

1224-
updateLastEvent(CallbackCompleted);
1225-
get()->flush();
1238+
updateLastEvent(Cb->CallbackCompleted);
12261239

12271240
// Now the CB can start executing in the background:
12281241
clSetUserEventStatus(
1229-
std::static_pointer_cast<CHIPEventOpenCL>(HoldBackEvent)->ClEvent,
1242+
std::static_pointer_cast<CHIPEventOpenCL>(HoldbackBarrierCompletedEv)
1243+
->ClEvent,
12301244
CL_COMPLETE);
1231-
// HoldBackEvent->decreaseRefCount("Notified finished.");
1245+
CHIPERR_CHECK_LOG_AND_THROW_TABLE(clSetUserEventStatus);
12321246

12331247
return;
1234-
};
1248+
}
12351249

12361250
std::shared_ptr<chipstar::Event> CHIPQueueOpenCL::enqueueMarkerImpl() {
12371251
std::shared_ptr<chipstar::Event> MarkerEvent =
@@ -1512,7 +1526,7 @@ void CHIPQueueOpenCL::finish() {
15121526
LOCK(Backend->DubiousLockOpenCL)
15131527
#endif
15141528
clStatus = get()->finish();
1515-
// CHIPERR_CHECK_LOG_AND_ABORT(clStatus, CL_SUCCESS, hipErrorTbd);
1529+
CHIPERR_CHECK_LOG_AND_THROW_TABLE(clFinish);
15161530
}
15171531

15181532
std::shared_ptr<chipstar::Event>

src/backend/OpenCL/CHIPBackendOpenCL.hh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ public:
116116
cl_event ClEvent;
117117
friend class CHIPEventOpenCL;
118118
std::shared_ptr<chipstar::Event> RecordedEvent;
119+
uint64_t HostTimeStamp;
119120

120121
public:
121122
CHIPEventOpenCL(CHIPContextOpenCL *ChipContext, cl_event ClEvent,

src/backend/OpenCL/clHipErrorConversion.hh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,8 @@ const std::unordered_map<void *, cl_hip_error_map_t> CL_HIP_ERROR_MAPS = {
164164
{(void *)&clFinish,
165165
{{CL_SUCCESS, hipSuccess},
166166
{CL_INVALID_COMMAND_QUEUE, hipErrorInvalidResourceHandle},
167-
{CL_OUT_OF_HOST_MEMORY, hipErrorOutOfMemory}}},
167+
{CL_OUT_OF_HOST_MEMORY, hipErrorOutOfMemory},
168+
{CL_OUT_OF_RESOURCES, hipErrorOutOfMemory}}},
168169

169170
{(void *)&clFlush,
170171
{{CL_SUCCESS, hipSuccess},

0 commit comments

Comments
 (0)