-
Notifications
You must be signed in to change notification settings - Fork 491
Write standalone --debug output to CSV file #15182
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from all commits
d1e320d
61a9e88
0a93b3a
d1456c5
408e624
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,6 +35,9 @@ | |
|
|
||
| #include <atomic> | ||
| #include <ctime> | ||
| #include <iostream> | ||
| #include <format> | ||
| #include <string> | ||
|
|
||
| #ifndef _WIN32 | ||
| #include <unistd.h> | ||
|
|
@@ -212,6 +215,89 @@ int32_t GPUReconstructionCPU::ExitDevice() | |
| return 0; | ||
| } | ||
|
|
||
| namespace | ||
| { | ||
| void writeHeaderMarkdown(std::ostream& stream) | ||
| { | ||
| stream << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n"; | ||
| stream << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n"; | ||
| } | ||
|
|
||
| void writeHeaderCSV(std::ostream& stream) | ||
| { | ||
| stream << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; | ||
| } | ||
|
|
||
| struct Row { | ||
| char type = ' '; | ||
| std::string name; | ||
| uint32_t count = 0; | ||
| double gpu_time = -1.0; | ||
| double cpu_time = -1.0; | ||
| double total_time = -1.0; | ||
| uint32_t memSize = 0; | ||
| uint32_t statNEvents; | ||
|
|
||
| void writeMarkdown(std::ostream& stream) | ||
| { | ||
| double scale = 1000000.0 / statNEvents; | ||
| stream << "| " << type << " | "; | ||
| if (count != 0) | ||
| stream << std::format("{:6} |", count); | ||
| else | ||
| stream << " |"; | ||
| stream << std::format(" {:42}|", name); | ||
| if (gpu_time != -1.0) | ||
| stream << std::format("{:10.0f} |", gpu_time * scale); | ||
| else | ||
| stream << " |"; | ||
| if (cpu_time != -1.0) | ||
| stream << std::format("{:10.0f} |", cpu_time * scale); | ||
| else | ||
| stream << " |"; | ||
| if (cpu_time != -1.0 && total_time != -1.0) | ||
| stream << std::format("{:8.2f} |", cpu_time / total_time); | ||
| else | ||
| stream << " |"; | ||
| if (total_time != -1.0) | ||
| stream << std::format("{:10.0f} |", total_time * scale); | ||
| else | ||
| stream << " |"; | ||
| if (memSize != 0 && count != 0) | ||
| stream << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); | ||
| else | ||
| stream << " | | |"; | ||
| stream << std::endl; | ||
| } | ||
|
|
||
| void writeCSV(std::ostream& stream) | ||
| { | ||
| double scale = 1000000.0 / statNEvents; | ||
| stream << type << ","; | ||
| if (count != 0) | ||
| stream << count; | ||
| stream << "," << name << ","; | ||
| if (gpu_time != -1.0) | ||
| stream << std::format("{:.0f}", gpu_time * scale); | ||
| stream << ","; | ||
| if (cpu_time != -1.0) | ||
| stream << std::format("{:.0f}", cpu_time * scale); | ||
| stream << ","; | ||
| if (cpu_time != -1.0 && total_time != -1.0) | ||
| stream << std::format("{:.2f}", cpu_time / total_time); | ||
| stream << ","; | ||
| if (total_time != -1.0) | ||
| stream << std::format("{:.0f}", total_time * scale); | ||
| stream << ","; | ||
| if (memSize != 0 && count != 0) | ||
| stream << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); | ||
| else | ||
| stream << ",,"; | ||
| stream << std::endl; | ||
| } | ||
| }; | ||
| } // namespace | ||
|
|
||
| int32_t GPUReconstructionCPU::RunChains() | ||
| { | ||
| mMemoryScalers->temporaryFactor = 1.; | ||
|
|
@@ -256,15 +342,26 @@ int32_t GPUReconstructionCPU::RunChains() | |
| PrintMemoryOverview(); | ||
| } | ||
|
|
||
| mStatWallTime = (mTimerTotal.GetElapsedTime() * 1000000. / mStatNEvents); | ||
| mStatWallTime = mTimerTotal.GetElapsedTime(); | ||
| std::string nEventReport; | ||
| if (GetProcessingSettings().debugLevel >= 0 && mStatNEvents > 1) { | ||
| nEventReport += " (average of " + std::to_string(mStatNEvents) + " runs)"; | ||
| } | ||
| double kernelTotal = 0; | ||
| std::vector<double> kernelStepTimes(gpudatatypes::N_RECO_STEPS, 0.); | ||
|
|
||
| std::ofstream benchmarkCSV; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also here, let's try to encapsulate timing code to GPUReconstructionDebug.cxx |
||
| if (!GetProcessingSettings().timingCSV.empty()) { | ||
| benchmarkCSV.open(GetProcessingSettings().timingCSV, std::ios::out | std::ios::app); | ||
| if (!benchmarkCSV.is_open()) { | ||
| GPUError("Could not open timing CSV file '%s' for writing", GetProcessingSettings().timingCSV.c_str()); | ||
| } else if (mNEventsProcessed == 1) { | ||
| writeHeaderCSV(benchmarkCSV); | ||
| } | ||
| } | ||
|
|
||
| if (GetProcessingSettings().debugLevel >= 1) { | ||
| writeHeaderMarkdown(std::cout); | ||
| for (uint32_t i = 0; i < mTimers.size(); i++) { | ||
| double time = 0; | ||
| if (mTimers[i] == nullptr) { | ||
|
|
@@ -284,11 +381,19 @@ int32_t GPUReconstructionCPU::RunChains() | |
| int32_t stepNum = getRecoStepNum(mTimers[i]->step); | ||
| kernelStepTimes[stepNum] += time; | ||
| } | ||
| char bandwidth[256] = ""; | ||
| Row task_row; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would prefer a more compact form, couldn't you initialize all the task_row members in one line with an initializer list? |
||
| task_row.type = 'K'; | ||
| task_row.name = mTimers[i]->name.c_str(); | ||
| task_row.gpu_time = time; | ||
| task_row.count = mTimers[i]->count; | ||
| task_row.statNEvents = mStatNEvents; | ||
| if (mTimers[i]->memSize && mStatNEvents && time != 0.) { | ||
| snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count); | ||
| task_row.memSize = mTimers[i]->memSize; | ||
| } | ||
| if (benchmarkCSV.is_open()) { | ||
| task_row.writeCSV(benchmarkCSV); | ||
| } | ||
| printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth); | ||
| task_row.writeMarkdown(std::cout); | ||
| if (GetProcessingSettings().resetTimers) { | ||
| mTimers[i]->count = 0; | ||
| mTimers[i]->memSize = 0; | ||
|
|
@@ -298,16 +403,42 @@ int32_t GPUReconstructionCPU::RunChains() | |
| if (GetProcessingSettings().recoTaskTiming) { | ||
| for (int32_t i = 0; i < gpudatatypes::N_RECO_STEPS; i++) { | ||
| if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) { | ||
| printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks", | ||
| gpudatatypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime()); | ||
| Row reco_step_row; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. And actually, this is quite redundant. Also, that way whatever old parsing scripts still exist are still compatible, and it will reduce the lines of code in GPUReconstructionCPU.cxx. |
||
| reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (Tasks)"; | ||
| reco_step_row.gpu_time = kernelStepTimes[i]; | ||
| reco_step_row.cpu_time = mTimersRecoSteps[i].timerCPU; | ||
| reco_step_row.total_time = mTimersRecoSteps[i].timerTotal.GetElapsedTime(); | ||
| reco_step_row.statNEvents = mStatNEvents; | ||
| if (benchmarkCSV.is_open()) { | ||
| reco_step_row.writeCSV(benchmarkCSV); | ||
| } | ||
| reco_step_row.writeMarkdown(std::cout); | ||
| } | ||
| if (mTimersRecoSteps[i].bytesToGPU) { | ||
| printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents, | ||
| mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToGPU / mStatNEvents, mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].countToGPU); | ||
| Row reco_step_row; | ||
| reco_step_row.type = 'D'; | ||
| reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to GPU)"; | ||
| reco_step_row.gpu_time = mTimersRecoSteps[i].timerToGPU.GetElapsedTime(); | ||
| reco_step_row.memSize = mTimersRecoSteps[i].bytesToGPU; | ||
| reco_step_row.count = mTimersRecoSteps[i].countToGPU; | ||
| reco_step_row.statNEvents = mStatNEvents; | ||
| if (benchmarkCSV.is_open()) { | ||
| reco_step_row.writeCSV(benchmarkCSV); | ||
| } | ||
| reco_step_row.writeMarkdown(std::cout); | ||
| } | ||
| if (mTimersRecoSteps[i].bytesToHost) { | ||
| printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents, | ||
| mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost); | ||
| Row reco_step_row; | ||
| reco_step_row.type = 'D'; | ||
| reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to Host)"; | ||
| reco_step_row.gpu_time = mTimersRecoSteps[i].timerToHost.GetElapsedTime(); | ||
| reco_step_row.memSize = mTimersRecoSteps[i].bytesToHost; | ||
| reco_step_row.count = mTimersRecoSteps[i].countToHost; | ||
| reco_step_row.statNEvents = mStatNEvents; | ||
| if (benchmarkCSV.is_open()) { | ||
| reco_step_row.writeCSV(benchmarkCSV); | ||
| } | ||
| reco_step_row.writeMarkdown(std::cout); | ||
| } | ||
| if (GetProcessingSettings().resetTimers) { | ||
| mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0; | ||
|
|
@@ -321,16 +452,30 @@ int32_t GPUReconstructionCPU::RunChains() | |
| } | ||
| for (int32_t i = 0; i < gpudatatypes::N_GENERAL_STEPS; i++) { | ||
| if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) { | ||
| printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents); | ||
| Row general_step_row; | ||
| general_step_row.name = gpudatatypes::GENERAL_STEP_NAMES[i]; | ||
| general_step_row.gpu_time = mTimersGeneralSteps[i].GetElapsedTime(); | ||
| general_step_row.statNEvents = mStatNEvents; | ||
| if (benchmarkCSV.is_open()) { | ||
| general_step_row.writeCSV(benchmarkCSV); | ||
| } | ||
| general_step_row.writeMarkdown(std::cout); | ||
| } | ||
| } | ||
| Row wall_row; | ||
| wall_row.name = "Wall"; | ||
| if (GetProcessingSettings().debugLevel >= 1) { | ||
| mStatKernelTime = kernelTotal * 1000000 / mStatNEvents; | ||
| printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str()); | ||
| wall_row.gpu_time = kernelTotal; | ||
| } | ||
| wall_row.cpu_time = mStatCPUTime; | ||
| wall_row.total_time = mStatWallTime; | ||
| wall_row.statNEvents = mStatNEvents; | ||
| if (benchmarkCSV.is_open()) { | ||
| wall_row.writeCSV(benchmarkCSV); | ||
| } | ||
| printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str()); | ||
| wall_row.writeMarkdown(std::cout); | ||
| } else if (GetProcessingSettings().debugLevel >= 0) { | ||
| GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str()); | ||
| GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime * 1000000 / mStatNEvents, nEventReport.c_str()); | ||
| } | ||
| if (GetProcessingSettings().resetTimers) { | ||
| mStatNEvents = 0; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -329,12 +329,13 @@ AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, | |
| AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)") | ||
| AddOption(debugMask, uint32_t, (1 << 18) - 1, "debugMask", 0, "Mask for debug output dumps to file") | ||
| AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6") | ||
| AddOption(timingCSV, std::string, "", "", 0, "CSV filename to append the benchmark results. Verbosity determined by parameter --debug.") | ||
| AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures") | ||
| AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks") | ||
| AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6 or deterministic compile flag set", def(1)) | ||
| AddOption(showOutputStat, bool, false, "", 0, "Print some track output statistics") | ||
| AddOption(runCompressionStatistics, bool, false, "compressionStat", 0, "Run statistics and verification for cluster compression") | ||
| AddOption(resetTimers, int8_t, 1, "", 0, "Reset timers every event") | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you change this here to 0, the timers will not be reset anymore, thus when debug mode is enabled in O2, timers will not be reset between time frames. |
||
| AddOption(resetTimers, int8_t, 0, "", 0, "Reset timers every event") | ||
| AddOption(deviceTimers, bool, true, "", 0, "Use device timers instead of host-based time measurement") | ||
| AddOption(keepAllMemory, bool, false, "", 0, "Allocate all memory on both device and host, and do not reuse") | ||
| AddOption(keepDisplayMemory, bool, false, "", 0, "Like keepAllMemory, but only for memory required for event display") | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would propose we try to move additional functions to GPUReconstructionDebug.cxx, in order not to blow up GPUReconstructionGPU.cxx with timing and debug code.